Compare commits

..

25 Commits

Author SHA1 Message Date
Georgi Gerganov
65202d2985 sync : ggml 2025-05-01 09:59:02 +03:00
Diego Devesa
db1ff5b63a ggml : fix ggml_gallocr_ptr type (ggml/1205) 2025-05-01 09:32:45 +03:00
Georgi Gerganov
610df4cc3b cuda : fix unused variable compile warning (whisper/0)
ggml-ci
2025-05-01 09:32:42 +03:00
Johannes Gäßler
e1e8e0991f CUDA: batched+noncont MMQ, refactor bs>1 MoE code (#13199) 2025-04-30 23:12:59 +02:00
Xuan-Son Nguyen
6f67cf1f48 arg : -hf do not fail if url mismatch (#13219)
* arg : -hf do not fail if url mismatch

* do not return if cannot parse metadata json
2025-04-30 21:29:15 +01:00
ddh0
16a457facd fix typo: n_ctx_pre_seq -> n_ctx_per_seq (#13221) 2025-04-30 21:28:43 +01:00
Xuan-Son Nguyen
3e168bede4 convert : improve model arch handling (#13122)
* convert : improve model arch handling

* use AutoConfig

* rm trust_remote_code

* Update convert_hf_to_gguf.py

* fix self.block_count for vision

* fix NomicBertModel
2025-04-30 16:56:24 +02:00
Tatsuya Tanaka
ceda28ef8e llava : remove duplicate include (#13207) 2025-04-30 15:25:20 +02:00
Olivier Chafik
3b127c7385 common : add -jf / --json-schema-file flag (#12011) 2025-04-30 14:52:35 +02:00
Jeff Bolz
e5007a5edf vulkan: use uint array index to avoid glslang bug (#13193) 2025-04-30 14:38:37 +02:00
shalinib-ibm
416313773b ggml : fix ppc64le build (#13176)
Build fails with compilation error on power pc.
This patch fixes the same.

Tested with unit tests run via
 --build <build_dir> && cd <build_dir> && make test

Signed-off-by: Shalini Salomi Bodapati <Shalini.Salomi.Bodapati@ibm.com>
2025-04-30 13:17:08 +02:00
Xuan-Son Nguyen
07c2e2f76c convert : correct typo image_mean --> image_std (#13208) 2025-04-30 13:06:15 +02:00
Aaron Teo
44cd8d91ff feat(ggml-cpu): enable z17 compile (#13182)
z17 compilation requires GCC 15.1.0 and onwards

Signed-off-by: Aaron Teo <aaron.teo1@ibm.com>
2025-04-30 10:47:35 +01:00
Xuan-Son Nguyen
5933e6fdc9 arg : allow using -hf offline (#13202)
* arg : allow using -hf offline

* add more comments in code [no ci]
2025-04-30 10:46:32 +02:00
Xuan-Son Nguyen
da84c04d8f docker : do not build tests (#13204)
* docker : do not build tests

* include "ggml-cpu.h"
2025-04-30 10:44:07 +02:00
xiaofei
a0f7016d17 rpc : fix cache directory initialization (#13188)
Signed-off-by: xiaofei <hbuxiaofei@gmail.com>
2025-04-30 09:29:22 +03:00
Johannes Gäßler
19e899ce21 scripts: n_depth for compare-llama-bench [no ci] (#13201) 2025-04-29 23:32:04 +02:00
matteo
e2e1ddb93a server : Prefilling assistant message in openai compatible API (#13174)
* Prefilling assistant message in openai compatible API

* fixed indentation

* fixed code convention

* simplify method usage

* no more than one assistant message at end of messages

* merge checks into prefill code

* Update examples/server/utils.hpp

---------

Co-authored-by: matteo <matteo@naspc.lan>
Co-authored-by: Xuan-Son Nguyen <thichthat@gmail.com>
2025-04-29 20:33:10 +02:00
Georgi Gerganov
d9d398f84f sampling : when top-k <= 0 -> noop (#13173)
ggml-ci
2025-04-29 20:22:57 +03:00
Alberto Cabrera Pérez
5a63980117 llama-bench: fixed size of fields to correctly map to values (#13183) 2025-04-29 17:24:36 +02:00
Johannes Gäßler
cdf76586b2 CUDA: fix non-cont. inputs for batched mat mul (#13155) 2025-04-29 16:00:27 +02:00
Sigbjørn Skjæret
7d3af70b08 llama : llm_type order by size (#13177) 2025-04-29 13:25:53 +02:00
Xuan-Son Nguyen
00e3e5a194 mtmd : add qwen2vl and qwen2.5vl (#13141)
* llava : add clip_n_output_tokens, deprecate clip_n_patches

* mtmd : add qwen2vl and qwen2.5vl

* decode_embd_batch::set_position_...

* working version

* deprecate llama-qwen2vl-cli

* correct order W, H of clip_embd_nbytes_by_img

* edit existing line in hot topics
2025-04-29 11:47:04 +02:00
Sigbjørn Skjæret
e98b3692be llama : set qwen3 model type sizes (#13175) 2025-04-29 11:00:31 +02:00
Xuan-Son Nguyen
b6ce7430b7 llama-graph : fix text position for mrope (#13159)
* llama-graph : fix text position for mrope

* fix typo

* explicitly set 4th dim in the loop
2025-04-29 09:45:49 +03:00
48 changed files with 1411 additions and 703 deletions

View File

@@ -14,9 +14,9 @@ WORKDIR /app
COPY . .
RUN if [ "$TARGETARCH" = "amd64" ]; then \
cmake -S . -B build -DCMAKE_BUILD_TYPE=Release -DGGML_NATIVE=OFF -DGGML_BACKEND_DL=ON -DGGML_CPU_ALL_VARIANTS=ON; \
cmake -S . -B build -DCMAKE_BUILD_TYPE=Release -DGGML_NATIVE=OFF -DLLAMA_BUILD_TESTS=OFF -DGGML_BACKEND_DL=ON -DGGML_CPU_ALL_VARIANTS=ON; \
elif [ "$TARGETARCH" = "arm64" ]; then \
cmake -S . -B build -DCMAKE_BUILD_TYPE=Release -DGGML_NATIVE=OFF -DGGML_CPU_ARM_ARCH=${GGML_CPU_ARM_ARCH}; \
cmake -S . -B build -DCMAKE_BUILD_TYPE=Release -DGGML_NATIVE=OFF -DLLAMA_BUILD_TESTS=OFF -DGGML_CPU_ARM_ARCH=${GGML_CPU_ARM_ARCH}; \
else \
echo "Unsupported architecture"; \
exit 1; \

View File

@@ -21,7 +21,7 @@ COPY . .
RUN if [ "${CUDA_DOCKER_ARCH}" != "default" ]; then \
export CMAKE_ARGS="-DCMAKE_CUDA_ARCHITECTURES=${CUDA_DOCKER_ARCH}"; \
fi && \
cmake -B build -DGGML_NATIVE=OFF -DGGML_CUDA=ON -DLLAMA_CURL=ON -DGGML_BACKEND_DL=ON -DGGML_CPU_ALL_VARIANTS=ON ${CMAKE_ARGS} -DCMAKE_EXE_LINKER_FLAGS=-Wl,--allow-shlib-undefined . && \
cmake -B build -DGGML_NATIVE=OFF -DGGML_CUDA=ON -DGGML_BACKEND_DL=ON -DGGML_CPU_ALL_VARIANTS=ON -DLLAMA_BUILD_TESTS=OFF ${CMAKE_ARGS} -DCMAKE_EXE_LINKER_FLAGS=-Wl,--allow-shlib-undefined . && \
cmake --build build --config Release -j$(nproc)
RUN mkdir -p /app/lib && \

View File

@@ -17,7 +17,7 @@ RUN if [ "${GGML_SYCL_F16}" = "ON" ]; then \
&& export OPT_SYCL_F16="-DGGML_SYCL_F16=ON"; \
fi && \
echo "Building with dynamic libs" && \
cmake -B build -DGGML_NATIVE=OFF -DGGML_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DLLAMA_CURL=ON -DGGML_BACKEND_DL=ON -DGGML_CPU_ALL_VARIANTS=ON ${OPT_SYCL_F16} && \
cmake -B build -DGGML_NATIVE=OFF -DGGML_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DGGML_BACKEND_DL=ON -DGGML_CPU_ALL_VARIANTS=ON -DLLAMA_BUILD_TESTS=OFF ${OPT_SYCL_F16} && \
cmake --build build --config Release -j$(nproc)
RUN mkdir -p /app/lib && \

View File

@@ -22,7 +22,7 @@ ENV LD_LIBRARY_PATH=${ASCEND_TOOLKIT_HOME}/runtime/lib64/stub:$LD_LIBRARY_PATH
RUN echo "Building with static libs" && \
source /usr/local/Ascend/ascend-toolkit/set_env.sh --force && \
cmake -B build -DGGML_NATIVE=OFF -DGGML_CANN=ON -DBUILD_SHARED_LIBS=OFF && \
cmake -B build -DGGML_NATIVE=OFF -DGGML_CANN=ON -DBUILD_SHARED_LIBS=OFF -DLLAMA_BUILD_TESTS=OFF && \
cmake --build build --config Release --target llama-cli
# TODO: use image with NNRT

View File

@@ -35,7 +35,7 @@ COPY . .
RUN if [ "${MUSA_DOCKER_ARCH}" != "default" ]; then \
export CMAKE_ARGS="-DMUSA_ARCHITECTURES=${MUSA_DOCKER_ARCH}"; \
fi && \
cmake -B build -DGGML_NATIVE=OFF -DGGML_MUSA=ON -DLLAMA_CURL=ON -DGGML_BACKEND_DL=ON -DGGML_CPU_ALL_VARIANTS=ON ${CMAKE_ARGS} -DCMAKE_EXE_LINKER_FLAGS=-Wl,--allow-shlib-undefined . && \
cmake -B build -DGGML_NATIVE=OFF -DGGML_MUSA=ON -DLLAMA_BUILD_TESTS=OFF -DGGML_BACKEND_DL=ON -DGGML_CPU_ALL_VARIANTS=ON ${CMAKE_ARGS} -DCMAKE_EXE_LINKER_FLAGS=-Wl,--allow-shlib-undefined . && \
cmake --build build --config Release -j$(nproc)
RUN mkdir -p /app/lib && \

View File

@@ -40,7 +40,7 @@ WORKDIR /app
COPY . .
RUN HIPCXX="$(hipconfig -l)/clang" HIP_PATH="$(hipconfig -R)" \
cmake -S . -B build -DGGML_HIP=ON -DAMDGPU_TARGETS=$ROCM_DOCKER_ARCH -DGGML_BACKEND_DL=ON -DGGML_CPU_ALL_VARIANTS=ON -DCMAKE_BUILD_TYPE=Release -DLLAMA_CURL=ON \
cmake -S . -B build -DGGML_HIP=ON -DAMDGPU_TARGETS=$ROCM_DOCKER_ARCH -DGGML_BACKEND_DL=ON -DGGML_CPU_ALL_VARIANTS=ON -DCMAKE_BUILD_TYPE=Release -DLLAMA_BUILD_TESTS=OFF \
&& cmake --build build --config Release -j$(nproc)
RUN mkdir -p /app/lib \

View File

@@ -16,7 +16,7 @@ WORKDIR /app
COPY . .
RUN cmake -B build -DGGML_NATIVE=OFF -DGGML_VULKAN=1 -DLLAMA_CURL=1 -DGGML_BACKEND_DL=ON -DGGML_CPU_ALL_VARIANTS=ON && \
RUN cmake -B build -DGGML_NATIVE=OFF -DGGML_VULKAN=1 -DLLAMA_BUILD_TESTS=OFF -DGGML_BACKEND_DL=ON -DGGML_CPU_ALL_VARIANTS=ON && \
cmake --build build --config Release -j$(nproc)
RUN mkdir -p /app/lib && \

View File

@@ -17,7 +17,7 @@ Inference of Meta's [LLaMA](https://arxiv.org/abs/2302.13971) model (and others)
## Hot topics
- **GGML developer experience survey (organized and reviewed by NVIDIA):** [link](https://forms.gle/Gasw3cRgyhNEnrwK9)
- A new binary `llama-mtmd-cli` is introduced to replace `llava-cli`, `minicpmv-cli` and `gemma3-cli` https://github.com/ggml-org/llama.cpp/pull/13012, `libllava` will be deprecated
- A new binary `llama-mtmd-cli` is introduced to replace `llava-cli`, `minicpmv-cli`, `gemma3-cli` ([#13012](https://github.com/ggml-org/llama.cpp/pull/13012)) and `qwen2vl-cli` ([#13141]((https://github.com/ggml-org/llama.cpp/pull/13141))), `libllava` will be deprecated
- VS Code extension for FIM completions: https://github.com/ggml-org/llama.vscode
- Universal [tool call support](./docs/function-calling.md) in `llama-server` https://github.com/ggml-org/llama.cpp/pull/9639
- Vim/Neovim plugin for FIM completions: https://github.com/ggml-org/llama.vim

View File

@@ -43,6 +43,25 @@ std::initializer_list<enum llama_example> mmproj_examples = {
// TODO: add LLAMA_EXAMPLE_SERVER when it's ready
};
static std::string read_file(const std::string & fname) {
std::ifstream file(fname);
if (!file) {
throw std::runtime_error(string_format("error: failed to open file '%s'\n", fname.c_str()));
}
std::string content((std::istreambuf_iterator<char>(file)), std::istreambuf_iterator<char>());
file.close();
return content;
}
static void write_file(const std::string & fname, const std::string & content) {
std::ofstream file(fname);
if (!file) {
throw std::runtime_error(string_format("error: failed to open file '%s'\n", fname.c_str()));
}
file << content;
file.close();
}
common_arg & common_arg::set_examples(std::initializer_list<enum llama_example> examples) {
this->examples = std::move(examples);
return *this;
@@ -200,9 +219,11 @@ struct curl_slist_ptr {
static bool curl_perform_with_retry(const std::string & url, CURL * curl, int max_attempts, int retry_delay_seconds) {
int remaining_attempts = max_attempts;
char * method = nullptr;
curl_easy_getinfo(curl, CURLINFO_EFFECTIVE_METHOD, &method);
while (remaining_attempts > 0) {
LOG_INF("%s: Trying to download from %s (attempt %d of %d)...\n", __func__ , url.c_str(), max_attempts - remaining_attempts + 1, max_attempts);
LOG_INF("%s: %s %s (attempt %d of %d)...\n", __func__ , method, url.c_str(), max_attempts - remaining_attempts + 1, max_attempts);
CURLcode res = curl_easy_perform(curl);
if (res == CURLE_OK) {
@@ -213,6 +234,7 @@ static bool curl_perform_with_retry(const std::string & url, CURL * curl, int ma
LOG_WRN("%s: curl_easy_perform() failed: %s, retrying after %d milliseconds...\n", __func__, curl_easy_strerror(res), exponential_backoff_delay);
remaining_attempts--;
if (remaining_attempts == 0) break;
std::this_thread::sleep_for(std::chrono::milliseconds(exponential_backoff_delay));
}
@@ -231,8 +253,6 @@ static bool common_download_file_single(const std::string & url, const std::stri
return false;
}
bool force_download = false;
// Set the URL, allow to follow http redirection
curl_easy_setopt(curl.get(), CURLOPT_URL, url.c_str());
curl_easy_setopt(curl.get(), CURLOPT_FOLLOWLOCATION, 1L);
@@ -256,7 +276,7 @@ static bool common_download_file_single(const std::string & url, const std::stri
// If the file exists, check its JSON metadata companion file.
std::string metadata_path = path + ".json";
nlohmann::json metadata;
nlohmann::json metadata; // TODO @ngxson : get rid of this json, use regex instead
std::string etag;
std::string last_modified;
@@ -266,14 +286,7 @@ static bool common_download_file_single(const std::string & url, const std::stri
if (metadata_in.good()) {
try {
metadata_in >> metadata;
LOG_INF("%s: previous metadata file found %s: %s\n", __func__, metadata_path.c_str(), metadata.dump().c_str());
if (metadata.contains("url") && metadata.at("url").is_string()) {
auto previous_url = metadata.at("url").get<std::string>();
if (previous_url != url) {
LOG_ERR("%s: Model URL mismatch: %s != %s\n", __func__, url.c_str(), previous_url.c_str());
return false;
}
}
LOG_DBG("%s: previous metadata file found %s: %s\n", __func__, metadata_path.c_str(), metadata.dump().c_str());
if (metadata.contains("etag") && metadata.at("etag").is_string()) {
etag = metadata.at("etag");
}
@@ -281,10 +294,10 @@ static bool common_download_file_single(const std::string & url, const std::stri
last_modified = metadata.at("lastModified");
}
} catch (const nlohmann::json::exception & e) {
LOG_ERR("%s: error reading metadata file %s: %s\n", __func__, metadata_path.c_str(), e.what());
return false;
LOG_ERR("%s: error reading metadata file %s: %s\n", __func__, metadata_path.c_str(), e.what());
}
}
// if we cannot open the metadata file, we assume that the downloaded file is not valid (etag and last-modified are left empty, so we will download it again)
} else {
LOG_INF("%s: no previous model file found %s\n", __func__, path.c_str());
}
@@ -296,7 +309,10 @@ static bool common_download_file_single(const std::string & url, const std::stri
};
common_load_model_from_url_headers headers;
bool head_request_ok = false;
bool should_download = !file_exists; // by default, we should download if the file does not exist
// get ETag to see if the remote file has changed
{
typedef size_t(*CURLOPT_HEADERFUNCTION_PTR)(char *, size_t, size_t, void *);
auto header_callback = [](char * buffer, size_t /*size*/, size_t n_items, void * userdata) -> size_t {
@@ -325,23 +341,28 @@ static bool common_download_file_single(const std::string & url, const std::stri
curl_easy_setopt(curl.get(), CURLOPT_HEADERFUNCTION, static_cast<CURLOPT_HEADERFUNCTION_PTR>(header_callback));
curl_easy_setopt(curl.get(), CURLOPT_HEADERDATA, &headers);
bool was_perform_successful = curl_perform_with_retry(url, curl.get(), CURL_MAX_RETRY, CURL_RETRY_DELAY_SECONDS);
// we only allow retrying once for HEAD requests
// this is for the use case of using running offline (no internet), retrying can be annoying
bool was_perform_successful = curl_perform_with_retry(url, curl.get(), 1, 0);
if (!was_perform_successful) {
return false;
head_request_ok = false;
}
long http_code = 0;
curl_easy_getinfo(curl.get(), CURLINFO_RESPONSE_CODE, &http_code);
if (http_code != 200) {
// HEAD not supported, we don't know if the file has changed
// force trigger downloading
force_download = true;
LOG_ERR("%s: HEAD invalid http status code received: %ld\n", __func__, http_code);
if (http_code == 200) {
head_request_ok = true;
} else {
LOG_WRN("%s: HEAD invalid http status code received: %ld\n", __func__, http_code);
head_request_ok = false;
}
}
bool should_download = !file_exists || force_download;
if (!should_download) {
// if head_request_ok is false, we don't have the etag or last-modified headers
// we leave should_download as-is, which is true if the file does not exist
if (head_request_ok) {
// check if ETag or Last-Modified headers are different
// if it is, we need to download the file again
if (!etag.empty() && etag != headers.etag) {
LOG_WRN("%s: ETag header is different (%s != %s): triggering a new download\n", __func__, etag.c_str(), headers.etag.c_str());
should_download = true;
@@ -350,6 +371,7 @@ static bool common_download_file_single(const std::string & url, const std::stri
should_download = true;
}
}
if (should_download) {
std::string path_temporary = path + ".downloadInProgress";
if (file_exists) {
@@ -424,13 +446,15 @@ static bool common_download_file_single(const std::string & url, const std::stri
{"etag", headers.etag},
{"lastModified", headers.last_modified}
});
std::ofstream(metadata_path) << metadata.dump(4);
LOG_INF("%s: file metadata saved: %s\n", __func__, metadata_path.c_str());
write_file(metadata_path, metadata.dump(4));
LOG_DBG("%s: file metadata saved: %s\n", __func__, metadata_path.c_str());
if (rename(path_temporary.c_str(), path.c_str()) != 0) {
LOG_ERR("%s: unable to rename file: %s to %s\n", __func__, path_temporary.c_str(), path.c_str());
return false;
}
} else {
LOG_INF("%s: using cached file: %s\n", __func__, path.c_str());
}
return true;
@@ -605,16 +629,37 @@ static struct common_hf_file_res common_get_hf_file(const std::string & hf_repo_
// Important: the User-Agent must be "llama-cpp" to get the "ggufFile" field in the response
// User-Agent header is already set in common_remote_get_content, no need to set it here
// we use "=" to avoid clashing with other component, while still being allowed on windows
std::string cached_response_fname = "manifest=" + hf_repo + "=" + tag + ".json";
string_replace_all(cached_response_fname, "/", "_");
std::string cached_response_path = fs_get_cache_file(cached_response_fname);
// make the request
common_remote_params params;
params.headers = headers;
auto res = common_remote_get_content(url, params);
long res_code = res.first;
std::string res_str(res.second.data(), res.second.size());
long res_code = 0;
std::string res_str;
bool use_cache = false;
try {
auto res = common_remote_get_content(url, params);
res_code = res.first;
res_str = std::string(res.second.data(), res.second.size());
} catch (const std::exception & e) {
LOG_WRN("error: failed to get manifest: %s\n", e.what());
LOG_WRN("try reading from cache\n");
// try to read from cache
try {
res_str = read_file(cached_response_path);
res_code = 200;
use_cache = true;
} catch (const std::exception & e) {
throw std::runtime_error("error: failed to get manifest (check your internet connection)");
}
}
std::string ggufFile;
std::string mmprojFile;
if (res_code == 200) {
if (res_code == 200 || res_code == 304) {
// extract ggufFile.rfilename in json, using regex
{
std::regex pattern("\"ggufFile\"[\\s\\S]*?\"rfilename\"\\s*:\\s*\"([^\"]+)\"");
@@ -631,6 +676,10 @@ static struct common_hf_file_res common_get_hf_file(const std::string & hf_repo_
mmprojFile = match[1].str();
}
}
if (!use_cache) {
// if not using cached response, update the cache file
write_file(cached_response_path, res_str);
}
} else if (res_code == 401) {
throw std::runtime_error("error: model is private or does not exist; if you are accessing a gated model, please provide a valid HF token");
} else {
@@ -1142,6 +1191,9 @@ bool common_params_parse(int argc, char ** argv, common_params & params, llama_e
fprintf(stderr, "%s\n", ex.what());
ctx_arg.params = params_org;
return false;
} catch (std::exception & ex) {
fprintf(stderr, "%s\n", ex.what());
exit(1); // for other exceptions, we exit with status code 1
}
return true;
@@ -1442,13 +1494,9 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
{"-f", "--file"}, "FNAME",
"a file containing the prompt (default: none)",
[](common_params & params, const std::string & value) {
std::ifstream file(value);
if (!file) {
throw std::runtime_error(string_format("error: failed to open file '%s'\n", value.c_str()));
}
params.prompt = read_file(value);
// store the external file name in params
params.prompt_file = value;
std::copy(std::istreambuf_iterator<char>(file), std::istreambuf_iterator<char>(), back_inserter(params.prompt));
if (!params.prompt.empty() && params.prompt.back() == '\n') {
params.prompt.pop_back();
}
@@ -1458,11 +1506,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
{"-sysf", "--system-prompt-file"}, "FNAME",
"a file containing the system prompt (default: none)",
[](common_params & params, const std::string & value) {
std::ifstream file(value);
if (!file) {
throw std::runtime_error(string_format("error: failed to open file '%s'\n", value.c_str()));
}
std::copy(std::istreambuf_iterator<char>(file), std::istreambuf_iterator<char>(), back_inserter(params.system_prompt));
params.system_prompt = read_file(value);
if (!params.system_prompt.empty() && params.system_prompt.back() == '\n') {
params.system_prompt.pop_back();
}
@@ -1887,15 +1931,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
{"--grammar-file"}, "FNAME",
"file to read grammar from",
[](common_params & params, const std::string & value) {
std::ifstream file(value);
if (!file) {
throw std::runtime_error(string_format("error: failed to open file '%s'\n", value.c_str()));
}
std::copy(
std::istreambuf_iterator<char>(file),
std::istreambuf_iterator<char>(),
std::back_inserter(params.sampling.grammar)
);
params.sampling.grammar = read_file(value);
}
).set_sparam());
add_opt(common_arg(
@@ -1905,6 +1941,23 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
params.sampling.grammar = json_schema_to_grammar(json::parse(value));
}
).set_sparam());
add_opt(common_arg(
{"-jf", "--json-schema-file"}, "FILE",
"File containing a JSON schema to constrain generations (https://json-schema.org/), e.g. `{}` for any JSON object\nFor schemas w/ external $refs, use --grammar + example/json_schema_to_grammar.py instead",
[](common_params & params, const std::string & value) {
std::ifstream file(value);
if (!file) {
throw std::runtime_error(string_format("error: failed to open file '%s'\n", value.c_str()));
}
std::string schema;
std::copy(
std::istreambuf_iterator<char>(file),
std::istreambuf_iterator<char>(),
std::back_inserter(schema)
);
params.sampling.grammar = json_schema_to_grammar(json::parse(schema));
}
).set_sparam());
add_opt(common_arg(
{"--pooling"}, "{none,mean,cls,last,rank}",
"pooling type for embeddings, use model default if unspecified",
@@ -2815,14 +2868,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
"list of built-in templates:\n%s", list_builtin_chat_templates().c_str()
),
[](common_params & params, const std::string & value) {
std::ifstream file(value);
if (!file) {
throw std::runtime_error(string_format("error: failed to open file '%s'\n", value.c_str()));
}
std::copy(
std::istreambuf_iterator<char>(file),
std::istreambuf_iterator<char>(),
std::back_inserter(params.chat_template));
params.chat_template = read_file(value);
}
).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_CHAT_TEMPLATE_FILE"));
add_opt(common_arg(

View File

@@ -16,6 +16,7 @@ from pathlib import Path
from hashlib import sha256
from typing import TYPE_CHECKING, Any, Callable, ContextManager, Iterable, Iterator, Literal, Sequence, TypeVar, cast
from itertools import chain
from transformers import AutoConfig
import math
import numpy as np
@@ -66,8 +67,6 @@ class ModelBase:
part_names: list[str]
is_safetensors: bool
hparams: dict[str, Any]
block_count: int
tensor_map: gguf.TensorNameMap
tensor_names: set[str] | None
gguf_writer: gguf.GGUFWriter
model_name: str | None
@@ -78,6 +77,10 @@ class ModelBase:
# subclasses should define this!
model_arch: gguf.MODEL_ARCH
# subclasses should initialize this!
block_count: int
tensor_map: gguf.TensorNameMap
def __init__(self, dir_model: Path, ftype: gguf.LlamaFileType, fname_out: Path, *, is_big_endian: bool = False,
use_temp_file: bool = False, eager: bool = False,
metadata_override: Path | None = None, model_name: str | None = None,
@@ -113,8 +116,6 @@ class ModelBase:
if not self.is_safetensors:
self.part_names = ModelBase.get_model_part_names(self.dir_model, "pytorch_model", ".bin")
self.hparams = ModelBase.load_hparams(self.dir_model) if hparams is None else hparams
self.block_count = self.find_hparam(["n_layers", "num_hidden_layers", "n_layer", "num_layers"])
self.tensor_map = gguf.get_tensor_name_map(self.model_arch, self.block_count)
self.tensor_names = None
self.metadata_override = metadata_override
self.model_name = model_name
@@ -417,15 +418,13 @@ class ModelBase:
@staticmethod
def load_hparams(dir_model: Path):
with open(dir_model / "config.json", "r", encoding="utf-8") as f:
hparams = json.load(f)
architectures = hparams.get("architectures")
if "text_config" in hparams:
hparams = {**hparams, **hparams["text_config"]}
if architectures is not None:
# preserve "architectures" from root level config
hparams["architectures"] = architectures
return hparams
try:
return AutoConfig.from_pretrained(dir_model).to_dict()
except Exception as e:
logger.warning(f"Failed to load model config from {dir_model}: {e}")
logger.warning("Trying to load config.json instead")
with open(dir_model / "config.json", "r", encoding="utf-8") as f:
return json.load(f)
@classmethod
def register(cls, *names: str) -> Callable[[AnyModel], AnyModel]:
@@ -454,6 +453,23 @@ class ModelBase:
class TextModel(ModelBase):
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
if "text_config" in self.hparams:
# move the text_config to the root level
self.hparams = {**self.hparams, **self.hparams["text_config"]}
self.block_count = self.find_hparam(["n_layers", "num_hidden_layers", "n_layer", "num_layers"])
self.tensor_map = gguf.get_tensor_name_map(self.model_arch, self.block_count)
@classmethod
def __init_subclass__(cls):
# can't use an abstract property, because overriding it without type errors
# would require using decorated functions instead of simply defining the property
if "model_arch" not in cls.__dict__:
raise TypeError(f"Missing property 'model_arch' for {cls.__name__!r}")
def set_vocab(self):
self._set_vocab_gpt2()
@@ -1070,9 +1086,9 @@ class VisionModel(ModelBase):
if self.model_arch != gguf.MODEL_ARCH.CLIP_VISION:
raise TypeError("VisionModel must be subclassed with model_arch = gguf.MODEL_ARCH.CLIP_VISION")
# small hack to correct the number of layers
self.tensor_map = gguf.get_tensor_name_map(gguf.MODEL_ARCH.CLIP_VISION, 128)
self.n_embd_text = self.find_hparam(["hidden_size", "n_embd"])
# get n_embd of the text model
text_config = {**self.hparams, **self.hparams["text_config"]}
self.n_embd_text = text_config.get("hidden_size", text_config.get("n_embd", 0))
assert self.n_embd_text > 0, "n_embd not found in hparams"
if "vision_config" not in self.hparams:
@@ -1081,6 +1097,9 @@ class VisionModel(ModelBase):
self.global_config = self.hparams
self.hparams = self.hparams["vision_config"]
self.block_count = self.find_hparam(["n_layers", "num_hidden_layers", "n_layer", "num_layers", "depth"])
self.tensor_map = gguf.get_tensor_name_map(gguf.MODEL_ARCH.CLIP_VISION, self.block_count)
# load preprocessor config
with open(self.dir_model / "preprocessor_config.json", "r", encoding="utf-8") as f:
self.preprocessor_config = json.load(f)
@@ -1098,12 +1117,12 @@ class VisionModel(ModelBase):
self.gguf_writer.add_vision_patch_size(self.find_hparam(["patch_size"]))
self.gguf_writer.add_vision_embedding_length(self.find_hparam(["hidden_size"]))
self.gguf_writer.add_vision_feed_forward_length(self.find_hparam(["intermediate_size"]))
self.gguf_writer.add_vision_block_count(self.find_hparam(["num_hidden_layers"]))
self.gguf_writer.add_vision_block_count(self.block_count)
self.gguf_writer.add_vision_head_count(self.find_hparam(["num_attention_heads"]))
# preprocessor config
self.gguf_writer.add_vision_image_mean(self.preprocessor_config["image_mean"])
self.gguf_writer.add_vision_image_std(self.preprocessor_config["image_mean"])
self.gguf_writer.add_vision_image_std(self.preprocessor_config["image_std"])
def write_vocab(self):
raise ValueError("VisionModel does not support vocab writing")
@@ -1719,23 +1738,12 @@ class StableLMModel(TextModel):
"LlamaForCausalLM",
"MistralForCausalLM",
"MixtralForCausalLM",
"Idefics3ForConditionalGeneration",
"SmolVLMForConditionalGeneration",
"VLlama3ForCausalLM",
"LlavaForConditionalGeneration")
class LlamaModel(TextModel):
model_arch = gguf.MODEL_ARCH.LLAMA
undo_permute = True
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
# fix for SmolVLM2, missing `num_attention_heads` in config.json
if self.hparams["architectures"][0] == "SmolVLMForConditionalGeneration":
self.hparams["num_attention_heads"] = self.hparams.get("num_attention_heads", 32)
# fix for Pixtral, missing `num_attention_heads` in config.json
if self.hparams["architectures"][0] == "LlavaForConditionalGeneration" \
and self.hparams.get("model_type") == "mistral":
self.hparams["num_attention_heads"] = self.hparams.get("num_attention_heads", 32)
def set_vocab(self):
try:
self._set_vocab_sentencepiece()
@@ -1898,11 +1906,7 @@ class LlavaVisionModel(VisionModel):
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
if self.hparams["model_type"] == "pixtral":
# fix missing config.json values
self.hparams["num_attention_heads"] = self.hparams.get("num_attention_heads", 16)
self.hparams["num_hidden_layers"] = self.hparams.get("num_hidden_layers", 24)
self.hparams["intermediate_size"] = self.hparams.get("intermediate_size", 4096)
self.hparams["hidden_size"] = self.hparams.get("hidden_size", 1024)
# layer_norm_eps is not in config.json, it is hard-coded in modeling_pixtral.py
self.hparams["layer_norm_eps"] = self.hparams.get("layer_norm_eps", 1e-5)
self.img_break_tok_id = 12 # see tokenizer_config.json
else:
@@ -1913,7 +1917,6 @@ class LlavaVisionModel(VisionModel):
hparams = self.hparams
if hparams["model_type"] == "pixtral":
self.gguf_writer.add_vision_projector_type(gguf.VisionProjectorType.PIXTRAL)
# default values below are taken from HF tranformers code
self.gguf_writer.add_vision_attention_layernorm_eps(hparams["layer_norm_eps"])
self.gguf_writer.add_vision_use_silu(True)
@@ -1944,13 +1947,12 @@ class LlavaVisionModel(VisionModel):
class SmolVLMModel(VisionModel):
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
# fix for SmolVLM2, missing some keys in config.json
# default values are taken from transformers code
if self.hparams["model_type"] == "smolvlm_vision":
# fix for SmolVLM2, missing some keys in config.json
# default values are taken from transformers code
self.hparams["hidden_size"] = self.hparams.get("hidden_size", 1152)
self.hparams["num_attention_heads"] = self.hparams.get("num_attention_heads", 16)
self.hparams["intermediate_size"] = self.hparams.get("intermediate_size", 3072)
self.hparams["num_hidden_layers"] = self.hparams.get("num_hidden_layers", 12)
def set_gguf_parameters(self):
super().set_gguf_parameters()
@@ -3505,6 +3507,8 @@ class RobertaModel(BertModel):
@ModelBase.register("NomicBertModel")
class NomicBertModel(BertModel):
model_arch = gguf.MODEL_ARCH.BERT
def __init__(self, dir_model: Path, ftype: gguf.LlamaFileType, fname_out: Path, **kwargs: Any):
hparams = kwargs.pop("hparams", None)
if hparams is None:
@@ -5849,6 +5853,19 @@ def split_str_to_n_bytes(split_str: str) -> int:
return n
def get_model_architecture(dir_model: Path, model_type: ModelType, hparams: Any = None) -> str:
hparams = ModelBase.load_hparams(dir_model) if hparams is None else hparams
text_config = hparams.get("text_config", {})
vision_config = hparams.get("vision_config", {})
arch = hparams["architectures"][0]
# if "architectures" is found in the sub-config, use that instead
if model_type == ModelType.TEXT and text_config.get("architectures") is not None:
arch = text_config["architectures"][0]
elif model_type == ModelType.VISION and vision_config.get("architectures") is not None:
arch = vision_config["architectures"][0]
return arch
def main() -> None:
args = parse_args()
@@ -5901,16 +5918,15 @@ def main() -> None:
logger.info(f"Loading model: {dir_model.name}")
hparams = ModelBase.load_hparams(dir_model)
if args.mmproj:
if "mmproj" not in fname_out.name:
fname_out = ModelBase.add_prefix_to_filename(fname_out, "mmproj-")
with torch.inference_mode():
output_type = ftype_map[args.outtype]
model_architecture = hparams["architectures"][0]
model_type = ModelType.VISION if args.mmproj else ModelType.TEXT
model_architecture = get_model_architecture(dir_model, model_type)
logger.info(f"Model architecture: {model_architecture}")
try:
model_class = ModelBase.from_model_architecture(model_architecture, model_type=model_type)
except NotImplementedError:

View File

@@ -1133,8 +1133,6 @@ struct test {
"build_commit", "build_number", "cpu_info", "gpu_info", "backends", "model_filename",
"model_type", "model_size", "model_n_params", "n_batch", "n_ubatch", "n_threads",
"cpu_mask", "cpu_strict", "poll", "type_k", "type_v", "n_gpu_layers",
"split_mode", "main_gpu", "no_kv_offload", "flash_attn", "tensor_split", "use_mmap",
"embeddings", "n_prompt", "n_gen", "n_depth", "test_time", "avg_ns",
"split_mode", "main_gpu", "no_kv_offload", "flash_attn", "tensor_split", "tensor_buft_overrides",
"use_mmap", "embeddings", "n_prompt", "n_gen", "n_depth", "test_time",
"avg_ns", "stddev_ns", "avg_ts", "stddev_ts",

View File

@@ -64,13 +64,7 @@ endif()
add_executable(llama-llava-cli deprecation-warning.cpp)
add_executable(llama-gemma3-cli deprecation-warning.cpp)
add_executable(llama-minicpmv-cli deprecation-warning.cpp)
set(TARGET llama-qwen2vl-cli)
add_executable(${TARGET} qwen2vl-cli.cpp)
set_target_properties(${TARGET} PROPERTIES OUTPUT_NAME llama-qwen2vl-cli)
install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE common llava ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_17)
add_executable(llama-qwen2vl-cli deprecation-warning.cpp)
set(TARGET llama-mtmd-cli)
add_executable(${TARGET} mtmd-cli.cpp)

View File

@@ -2,8 +2,6 @@
#include "gguf.h"
#include "clip.h"
#include "clip.h"
#include <climits>
#include <cstdarg>
#include <string>

View File

@@ -2825,15 +2825,18 @@ void clip_free(clip_ctx * ctx) {
delete ctx;
}
// deprecated
size_t clip_embd_nbytes(const struct clip_ctx * ctx) {
return clip_n_patches(ctx) * clip_n_mmproj_embd(ctx) * sizeof(float);
const int32_t nx = ctx->vision_model.hparams.image_size;
const int32_t ny = ctx->vision_model.hparams.image_size;
return clip_embd_nbytes_by_img(ctx, nx, ny);
}
size_t clip_embd_nbytes_by_img(const struct clip_ctx * ctx, int img_h, int img_w) {
size_t clip_embd_nbytes_by_img(const struct clip_ctx * ctx, int img_w, int img_h) {
clip_image_f32 img;
img.nx = img_w;
img.ny = img_h;
return clip_n_patches_by_img(ctx, &img) * clip_n_mmproj_embd(ctx) * sizeof(float);
return clip_n_output_tokens(ctx, &img) * clip_n_mmproj_embd(ctx) * sizeof(float);
}
int32_t clip_get_image_size(const struct clip_ctx * ctx) {
@@ -2863,14 +2866,37 @@ size_t get_clip_image_grid_size(const struct clip_ctx * ctx) {
return ctx->vision_model.hparams.image_grid_pinpoints.size();
}
// deprecated
int clip_n_patches(const struct clip_ctx * ctx) {
clip_image_f32 img;
img.nx = ctx->vision_model.hparams.image_size;
img.ny = ctx->vision_model.hparams.image_size;
return clip_n_patches_by_img(ctx, &img);
return clip_n_output_tokens(ctx, &img);
}
// deprecated
int clip_n_patches_by_img(const struct clip_ctx * ctx, struct clip_image_f32 * img) {
return clip_n_output_tokens(ctx, img);
}
int clip_n_output_tokens_x(const struct clip_ctx * ctx, struct clip_image_f32 * img) {
const auto & params = ctx->vision_model.hparams;
const int n_total = clip_n_output_tokens(ctx, img);
if (ctx->proj_type == PROJECTOR_TYPE_QWEN2VL || ctx->proj_type == PROJECTOR_TYPE_QWEN25VL) {
return img->nx / (params.patch_size * 2) + (int)(img->nx % params.patch_size > 0);
}
return n_total;
}
int clip_n_output_tokens_y(const struct clip_ctx * ctx, struct clip_image_f32 * img) {
const auto & params = ctx->vision_model.hparams;
if (ctx->proj_type == PROJECTOR_TYPE_QWEN2VL || ctx->proj_type == PROJECTOR_TYPE_QWEN25VL) {
return img->ny / (params.patch_size * 2) + (int)(img->ny % params.patch_size > 0);
}
return 1;
}
int clip_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * img) {
const auto & params = ctx->vision_model.hparams;
int n_patches = (params.image_size / params.patch_size) * (params.image_size / params.patch_size);

View File

@@ -47,7 +47,7 @@ CLIP_API struct clip_ctx * clip_init(const char * fname, struct clip_context_par
CLIP_API void clip_free(struct clip_ctx * ctx);
CLIP_API size_t clip_embd_nbytes(const struct clip_ctx * ctx);
CLIP_API size_t clip_embd_nbytes_by_img(const struct clip_ctx * ctx, int img_h, int img_w);
CLIP_API size_t clip_embd_nbytes_by_img(const struct clip_ctx * ctx, int img_w, int img_h);
CLIP_API int32_t clip_get_image_size (const struct clip_ctx * ctx);
CLIP_API int32_t clip_get_patch_size (const struct clip_ctx * ctx);
@@ -59,9 +59,20 @@ CLIP_API const char * clip_patch_merge_type(const struct clip_ctx * ctx);
CLIP_API const int32_t * clip_image_grid(const struct clip_ctx * ctx);
CLIP_API size_t get_clip_image_grid_size(const struct clip_ctx * ctx);
CLIP_API int clip_n_patches (const struct clip_ctx * ctx);
CLIP_API int clip_n_patches_by_img (const struct clip_ctx * ctx, struct clip_image_f32 * img);
CLIP_API int clip_n_mmproj_embd (const struct clip_ctx * ctx);
GGML_DEPRECATED(CLIP_API int clip_n_patches(const struct clip_ctx * ctx),
"use clip_n_output_tokens instead");
GGML_DEPRECATED(CLIP_API int clip_n_patches_by_img(const struct clip_ctx * ctx, struct clip_image_f32 * img),
"use clip_n_output_tokens instead");
CLIP_API int clip_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * img);
// for M-RoPE, this will be the number of token positions in X and Y directions
// for other models, X will be the total number of tokens and Y will be 1
CLIP_API int clip_n_output_tokens_x(const struct clip_ctx * ctx, struct clip_image_f32 * img);
CLIP_API int clip_n_output_tokens_y(const struct clip_ctx * ctx, struct clip_image_f32 * img);
// this should be equal to the embedding dimension of the text model
CLIP_API int clip_n_mmproj_embd(const struct clip_ctx * ctx);
CLIP_API int clip_uhd_num_image_embeds_col(struct clip_ctx * ctx_clip);
CLIP_API void clip_add_load_image_size(struct clip_ctx * ctx_clip, struct clip_image_size * load_image_size);

View File

@@ -112,7 +112,7 @@ static struct clip_image_grid_shape get_anyres_image_grid_shape(const std::pair<
}
// Take the image segments in a grid configuration and return the embeddings and the number of embeddings into preallocated memory (image_embd_out)
static bool clip_llava_handle_patches(clip_ctx * ctx_clip, std::vector<float *> & image_embd_v, struct clip_image_grid_shape grid_shape, float * image_embd_out, int * n_img_pos_out) {
static bool clip_llava_handle_patches(clip_ctx * ctx_clip, std::vector<float *> & image_embd_v, struct clip_image_grid_shape grid_shape, float * image_embd_out, int * n_img_pos_out, clip_image_f32 * img_input) {
struct {
struct ggml_context * ctx;
} model;
@@ -175,7 +175,7 @@ static bool clip_llava_handle_patches(clip_ctx * ctx_clip, std::vector<float *>
model.ctx = ggml_init(params);
struct ggml_tensor * image_features = ggml_new_tensor_3d(model.ctx, GGML_TYPE_F32, clip_n_mmproj_embd(ctx_clip), clip_n_patches(ctx_clip), num_images - 1); // example: 4096 x 576 x 4
struct ggml_tensor * image_features = ggml_new_tensor_3d(model.ctx, GGML_TYPE_F32, clip_n_mmproj_embd(ctx_clip), clip_n_output_tokens(ctx_clip, img_input), num_images - 1); // example: 4096 x 576 x 4
// ggml_tensor_printf(image_features,"image_features",__LINE__,false,false);
// fill it with the image embeddings, ignoring the base
for (size_t i = 1; i < num_images; i++) {
@@ -214,8 +214,8 @@ static bool clip_llava_handle_patches(clip_ctx * ctx_clip, std::vector<float *>
memcpy(image_embd_out, image_embd_v[0], clip_embd_nbytes(ctx_clip)); // main image as global context
// append without newline tokens (default behavior in llava_arch when not using unpad ):
memcpy(image_embd_out + clip_n_patches(ctx_clip) * clip_n_mmproj_embd(ctx_clip), (float*)result->data, clip_embd_nbytes(ctx_clip) * (num_images-1)); // grid patches
*n_img_pos_out = static_cast<int>(result->ne[1]+clip_n_patches(ctx_clip));
memcpy(image_embd_out + clip_n_output_tokens(ctx_clip, img_input) * clip_n_mmproj_embd(ctx_clip), (float*)result->data, clip_embd_nbytes(ctx_clip) * (num_images-1)); // grid patches
*n_img_pos_out = static_cast<int>(result->ne[1]+clip_n_output_tokens(ctx_clip, img_input));
// Debug: Test single segments
// Current findings: sending base image, sending a segment embedding all works similar to python
@@ -313,7 +313,7 @@ static bool encode_image_with_clip(clip_ctx * ctx_clip, int n_threads, const cli
image_embd + n_img_pos_out * clip_n_mmproj_embd(ctx_clip),
image_embd_v[i],
clip_embd_nbytes_by_img(ctx_clip, nx, ny));
n_img_pos_out += clip_n_patches_by_img(ctx_clip, img_res);
n_img_pos_out += clip_n_output_tokens(ctx_clip, img_res);
}
*n_img_pos = n_img_pos_out;
for (size_t i = 0; i < image_embd_v.size(); i++) {
@@ -342,8 +342,8 @@ static bool encode_image_with_clip(clip_ctx * ctx_clip, int n_threads, const cli
}
else if (strcmp(mm_patch_merge_type, "spatial_unpad") != 0) {
// flat / default llava-1.5 type embedding
*n_img_pos = clip_n_patches(ctx_clip);
clip_image_f32 * img_res = clip_image_f32_get_img(img_res_v.get(), 0);
*n_img_pos = clip_n_output_tokens(ctx_clip, img_res);
bool encoded = clip_image_encode(ctx_clip, n_threads, img_res, image_embd); // image_embd shape is 576 x 4096
if (!encoded) {
LOG_ERR("Unable to encode image\n");
@@ -381,7 +381,8 @@ static bool encode_image_with_clip(clip_ctx * ctx_clip, int n_threads, const cli
struct clip_image_grid_shape grid_shape = get_anyres_image_grid_shape({img->nx,img->ny}, grid_pinpoints, image_size);
int n_img_pos_out;
clip_llava_handle_patches(ctx_clip, image_embd_v, grid_shape, image_embd, &n_img_pos_out);
clip_image_f32 * img_input = clip_image_f32_get_img(img_res_v.get(), 0);
clip_llava_handle_patches(ctx_clip, image_embd_v, grid_shape, image_embd, &n_img_pos_out, img_input);
*n_img_pos = n_img_pos_out;
for (size_t i = 0; i < image_embd_v.size(); i++) {

View File

@@ -136,39 +136,6 @@ struct mtmd_cli_context {
}
};
struct decode_embd_batch {
std::vector<llama_pos> pos;
std::vector<int32_t> n_seq_id;
std::vector<llama_seq_id> seq_id_0;
std::vector<llama_seq_id *> seq_ids;
std::vector<int8_t> logits;
llama_batch batch;
decode_embd_batch(float * embd, int32_t n_tokens, llama_pos pos_0, llama_seq_id seq_id) {
pos .resize(n_tokens);
n_seq_id.resize(n_tokens);
seq_ids .resize(n_tokens + 1);
logits .resize(n_tokens);
seq_id_0.resize(1);
seq_id_0[0] = seq_id;
seq_ids [n_tokens] = nullptr;
batch = {
/*n_tokens =*/ n_tokens,
/*tokens =*/ nullptr,
/*embd =*/ embd,
/*pos =*/ pos.data(),
/*n_seq_id =*/ n_seq_id.data(),
/*seq_id =*/ seq_ids.data(),
/*logits =*/ logits.data(),
};
for (int i = 0; i < n_tokens; i++) {
batch.pos [i] = pos_0 + i;
batch.n_seq_id[i] = 1;
batch.seq_id [i] = seq_id_0.data();
batch.logits [i] = false;
}
}
};
static int generate_response(mtmd_cli_context & ctx, common_sampler * smpl, int n_predict) {
llama_tokens generated_tokens;
for (int i = 0; i < n_predict; i++) {
@@ -243,7 +210,7 @@ static int eval_message(mtmd_cli_context & ctx, common_chat_msg & msg, std::vect
return 1;
}
ctx.n_past += mtmd_helper_get_n_tokens(chunks);
ctx.n_past += mtmd_helper_get_n_pos(chunks);
return 0;
}
@@ -371,6 +338,7 @@ int main(int argc, char ** argv) {
}
}
if (g_is_interrupted) LOG("\nInterrupted by user\n");
LOG("\n\n");
llama_perf_context_print(ctx.lctx);
return g_is_interrupted ? 130 : 0;
}

View File

@@ -40,11 +40,14 @@ struct mtmd_context {
llama_token tok_sli_img_end = LLAMA_TOKEN_NULL; // single slice
llama_token tok_row_end = LLAMA_TOKEN_NULL; // end of row
bool use_mrope = false; // for Qwen2VL, we need to use M-RoPE
// TODO @ngxson : add timings
mtmd_context(const char * mmproj_fname,
const llama_model * text_model,
const mtmd_context_params & ctx_params) :
text_model (text_model),
print_timings(ctx_params.print_timings),
n_threads (ctx_params.n_threads),
image_marker (ctx_params.image_marker)
@@ -56,9 +59,8 @@ struct mtmd_context {
if (!ctx_clip) {
throw std::runtime_error(string_format("Failed to load CLIP model from %s\n", mmproj_fname));
}
this->text_model = text_model;
GGML_ASSERT(!clip_is_qwen2vl(ctx_clip) && "Qwen2VL model is not supported yet, use llama-qwen2vl-cli instead");
use_mrope = clip_is_qwen2vl(ctx_clip);
int minicpmv_version = clip_is_minicpmv(ctx_clip);
if (minicpmv_version == 2) {
@@ -126,6 +128,7 @@ struct mtmd_image_tokens_data {
struct mtmd_image_tokens {
uint32_t nx; // number of tokens in x direction
uint32_t ny; // number of tokens in y direction
bool use_mrope_pos = false; // use M-RoPE position counting (the whole image is 1 temporal position)
uint32_t n_tokens() const { return nx * ny; }
clip_image_f32_batch batch_f32; // preprocessed image patches
std::string id; // optional user-defined ID, useful for KV cache tracking
@@ -202,6 +205,13 @@ int32_t mtmd_tokenize(mtmd_context * ctx,
string_replace_all(prompt_modified, ctx->image_marker, marker_modified);
}
else if (proj_type == PROJECTOR_TYPE_QWEN2VL || proj_type == PROJECTOR_TYPE_QWEN25VL) {
// <|vision_start|> ... (image embeddings) ... <|vision_end|>
marker_modified = "<|vision_start|>" + ctx->image_marker + "<|vision_end|>";
string_replace_all(prompt_modified, ctx->image_marker, marker_modified);
}
// llava-1.5, llava-1.6, Yi-VL, Yi-34B, granite: don't need to add prefix and suffix
std::vector<std::string> parts = string_split_str(prompt_modified, ctx->image_marker);
@@ -226,7 +236,7 @@ int32_t mtmd_tokenize(mtmd_context * ctx,
for (auto & entry : batch_f32.entries) {
mtmd_image_tokens_ptr image_tokens(new mtmd_image_tokens);
image_tokens->nx = clip_n_patches_by_img(ctx->ctx_clip, entry.get());
image_tokens->nx = clip_n_output_tokens(ctx->ctx_clip, entry.get());
image_tokens->ny = 1;
image_tokens->batch_f32.entries.push_back(std::move(entry));
image_tokens->id = id;
@@ -322,12 +332,20 @@ int32_t mtmd_tokenize(mtmd_context * ctx,
} else {
size_t n_tokens = 0;
for (const auto & entry : batch_f32.entries) {
n_tokens += clip_n_patches_by_img(ctx->ctx_clip, entry.get());
n_tokens += clip_n_output_tokens(ctx->ctx_clip, entry.get());
}
mtmd_image_tokens_ptr image_tokens(new mtmd_image_tokens);
image_tokens->nx = n_tokens;
image_tokens->ny = 1; // TODO
if (ctx->use_mrope) {
// for Qwen2VL, we need this information for M-RoPE decoding positions
image_tokens->nx = clip_n_output_tokens_x(ctx->ctx_clip, batch_f32.entries[0].get());
image_tokens->ny = clip_n_output_tokens_y(ctx->ctx_clip, batch_f32.entries[0].get());
image_tokens->use_mrope_pos = true;
} else {
// other models, we only need the total number of tokens
image_tokens->nx = n_tokens;
image_tokens->ny = 1;
}
image_tokens->batch_f32 = std::move(batch_f32);
image_tokens->id = bitmaps[i_img].id; // optional
@@ -372,6 +390,13 @@ std::string mtmd_image_tokens_get_id(const mtmd_image_tokens * image_tokens) {
return image_tokens->id;
}
llama_pos mtmd_image_tokens_get_n_pos(const mtmd_image_tokens * image_tokens) {
if (image_tokens->use_mrope_pos) {
return 1; // for M-RoPE, the whole image is 1 in temporal dimension
}
return image_tokens->n_tokens();
}
int32_t mtmd_encode(mtmd_context * ctx, const mtmd_image_tokens * image_tokens) {
int n_mmproj_embd = clip_n_mmproj_embd(ctx->ctx_clip);
ctx->image_embd_v.resize(image_tokens->n_tokens() * n_mmproj_embd);
@@ -389,7 +414,7 @@ int32_t mtmd_encode(mtmd_context * ctx, const mtmd_image_tokens * image_tokens)
// TODO @ngxson : llava does not support batched encoding ; this should be fixed inside clip_image_batch_encode()
const auto & entries = image_tokens->batch_f32.entries;
for (size_t i = 0; i < entries.size(); i++) {
int n_tokens_per_image = clip_n_patches_by_img(ctx->ctx_clip, entries[i].get());
int n_tokens_per_image = clip_n_output_tokens(ctx->ctx_clip, entries[i].get());
ok = clip_image_encode(
ctx->ctx_clip,
ctx->n_threads,
@@ -417,7 +442,7 @@ size_t mtmd_helper_get_n_tokens(mtmd_input_chunks & chunks) {
if (chunk.type == MTMD_INPUT_CHUNK_TYPE_TEXT) {
n_tokens += chunk.tokens_text.size();
} else if (chunk.type == MTMD_INPUT_CHUNK_TYPE_IMAGE) {
n_tokens += chunk.tokens_image->n_tokens();
n_tokens += mtmd_image_tokens_get_n_tokens(chunk.tokens_image.get());
} else {
GGML_ASSERT(false && "chunk type not supported");
}
@@ -425,22 +450,38 @@ size_t mtmd_helper_get_n_tokens(mtmd_input_chunks & chunks) {
return n_tokens;
}
llama_pos mtmd_helper_get_n_pos(mtmd_input_chunks & chunks) {
llama_pos n_pos = 0;
for (auto & chunk : chunks) {
if (chunk.type == MTMD_INPUT_CHUNK_TYPE_TEXT) {
n_pos += chunk.tokens_text.size();
} else if (chunk.type == MTMD_INPUT_CHUNK_TYPE_IMAGE) {
n_pos += mtmd_image_tokens_get_n_pos(chunk.tokens_image.get());
} else {
GGML_ASSERT(false && "chunk type not supported");
}
}
return n_pos;
}
// helper struct to make working with embd batch easier
// note: this will be removed after llama_batch_ext refactoring
struct decode_embd_batch {
int n_pos_per_embd;
int n_mmproj_embd;
std::vector<llama_pos> pos;
std::vector<llama_pos> pos_view; // used by mrope
std::vector<int32_t> n_seq_id;
std::vector<llama_seq_id> seq_id_0;
std::vector<llama_seq_id *> seq_ids;
std::vector<int8_t> logits;
llama_batch batch;
decode_embd_batch(float * embd, int32_t n_tokens, llama_pos pos_0, llama_seq_id seq_id) {
pos .resize(n_tokens);
decode_embd_batch(float * embd, int32_t n_tokens, int n_pos_per_embd, int n_mmproj_embd) : n_pos_per_embd(n_pos_per_embd), n_mmproj_embd(n_mmproj_embd) {
pos .resize(n_tokens * n_pos_per_embd);
n_seq_id.resize(n_tokens);
seq_ids .resize(n_tokens + 1);
logits .resize(n_tokens);
seq_id_0.resize(1);
seq_id_0[0] = seq_id;
seq_ids [n_tokens] = nullptr;
batch = {
/*n_tokens =*/ n_tokens,
@@ -451,13 +492,64 @@ struct decode_embd_batch {
/*seq_id =*/ seq_ids.data(),
/*logits =*/ logits.data(),
};
for (int i = 0; i < n_tokens; i++) {
}
void set_position_normal(llama_pos pos_0, llama_seq_id seq_id) {
seq_id_0[0] = seq_id;
for (int i = 0; i < batch.n_tokens; i++) {
batch.pos [i] = pos_0 + i;
batch.n_seq_id[i] = 1;
batch.seq_id [i] = seq_id_0.data();
batch.logits [i] = false;
}
}
void set_position_mrope(llama_pos pos_0, int nx, int ny, llama_seq_id seq_id) {
GGML_ASSERT(n_pos_per_embd == 4);
seq_id_0[0] = seq_id;
for (int y = 0; y < ny; y++) {
for (int x = 0; x < nx; x++) {
int i = y * nx + x;
pos[i ] = pos_0;
pos[i + batch.n_tokens ] = pos_0 + y;
pos[i + batch.n_tokens * 2] = pos_0 + x;
pos[i + batch.n_tokens * 3] = 0; // last pos dim is unused
}
}
for (int i = 0; i < batch.n_tokens; i++) {
batch.n_seq_id[i] = 1;
batch.seq_id [i] = seq_id_0.data();
batch.logits [i] = false;
}
}
llama_batch get_view(int offset, int n_tokens) {
llama_pos * pos_ptr;
pos_view.clear();
pos_view.resize(n_tokens * n_pos_per_embd);
if (n_pos_per_embd > 1) {
// mrope
// for example, with layout of src: 1234...1234...1234...1234...
// offset 2 will give us dst: 34...34...34...34...
for (int i = 0; i < n_pos_per_embd; i++) {
auto src = pos.begin() + i * batch.n_tokens + offset;
pos_view.insert(pos_view.end(), src, src + n_tokens);
}
pos_ptr = pos_view.data();
} else {
// normal
pos_ptr = pos.data() + offset;
}
return {
/*n_tokens =*/ n_tokens,
/*tokens =*/ nullptr,
/*embd =*/ batch.embd + offset * n_mmproj_embd,
/*pos =*/ pos_ptr,
/*n_seq_id =*/ batch.n_seq_id + offset,
/*seq_id =*/ batch.seq_id + offset,
/*logits =*/ batch.logits + offset,
};
}
};
int32_t mtmd_helper_eval(mtmd_context * ctx,
@@ -470,6 +562,7 @@ int32_t mtmd_helper_eval(mtmd_context * ctx,
llama_pos n_past = pos0;
llama_batch text_batch = llama_batch_init(n_batch, 0, 1);
int n_mmproj_embd = clip_n_mmproj_embd(ctx->ctx_clip);
int n_pos_per_embd = mtmd_decode_use_mrope(ctx) ? 4 : 1;
for (auto & chunk : chunks) {
bool is_last = &chunk == &chunks.back();
@@ -517,6 +610,16 @@ int32_t mtmd_helper_eval(mtmd_context * ctx,
int32_t i_batch = 0;
int32_t n_img_batches = GGML_PAD(n_tokens, n_batch) / n_batch;
float * embd = mtmd_get_output_embd(ctx);
decode_embd_batch batch_embd(embd, n_tokens, n_pos_per_embd, n_mmproj_embd);
const int nx = mtmd_image_tokens_get_nx(chunk.tokens_image.get());
const int ny = mtmd_image_tokens_get_ny(chunk.tokens_image.get());
if (mtmd_decode_use_mrope(ctx)) {
batch_embd.set_position_mrope(n_past, nx, ny, seq_id);
} else {
batch_embd.set_position_normal(n_past, seq_id);
}
if (mtmd_decode_use_non_causal(ctx)) {
llama_set_causal_attn(lctx, false);
@@ -524,15 +627,14 @@ int32_t mtmd_helper_eval(mtmd_context * ctx,
}
while (i_batch < n_img_batches) { // split into batches
int32_t pos_offset = i_batch*n_batch;
int32_t n_tokens_batch = std::min(n_batch, n_tokens - pos_offset);
float * embd_batch = embd + pos_offset*n_mmproj_embd;
decode_embd_batch batch_img(embd_batch, n_tokens_batch, n_past, 0);
int pos_offset = i_batch*n_batch;
int n_tokens_batch = std::min(n_batch, n_tokens - pos_offset);
llama_batch batch_embd_view = batch_embd.get_view(pos_offset, n_tokens_batch);
printf("decoding image batch %d/%d, n_tokens_batch = %d\n", i_batch+1, n_img_batches, n_tokens_batch);
LOG_INF("decoding image batch %d/%d, n_tokens_batch = %d\n", i_batch+1, n_img_batches, n_tokens_batch);
int64_t t1 = ggml_time_ms();
ret = llama_decode(lctx, batch_img.batch);
ret = llama_decode(lctx, batch_embd_view);
if (ret != 0) {
LOG_ERR("failed to decode image\n");
llama_set_causal_attn(lctx, true); // restore causal attn
@@ -545,9 +647,11 @@ int32_t mtmd_helper_eval(mtmd_context * ctx,
}
i_batch++;
n_past += n_tokens_batch;
}
// for mrope, one image is one single **temporal** position
n_past += mtmd_decode_use_mrope(ctx) ? 1 : n_tokens;
if (mtmd_decode_use_non_causal(ctx)) {
llama_set_causal_attn(lctx, true);
}
@@ -595,6 +699,10 @@ bool mtmd_decode_use_non_causal(mtmd_context * ctx) {
return false;
}
bool mtmd_decode_use_mrope(mtmd_context * ctx) {
return ctx->use_mrope;
}
void mtmd_image_tokens_deleter::operator()(mtmd_image_tokens * val) {
mtmd_image_tokens_free(val);
}

View File

@@ -102,6 +102,7 @@ MTMD_API size_t mtmd_image_tokens_get_n_tokens(const mtmd_image_tokens * im
MTMD_API size_t mtmd_image_tokens_get_nx(const mtmd_image_tokens * image_tokens);
MTMD_API size_t mtmd_image_tokens_get_ny(const mtmd_image_tokens * image_tokens);
MTMD_API std::string mtmd_image_tokens_get_id(const mtmd_image_tokens * image_tokens);
MTMD_API llama_pos mtmd_image_tokens_get_n_pos(const mtmd_image_tokens * image_tokens); // number of temporal positions (always 1 for M-RoPE, n_tokens otherwise)
MTMD_API void mtmd_image_tokens_free(mtmd_image_tokens * image_tokens);
// returns 0 on success
@@ -114,15 +115,21 @@ MTMD_API float * mtmd_get_output_embd(mtmd_context * ctx);
// whether we need to set non-causal mask before llama_decode
MTMD_API bool mtmd_decode_use_non_causal(mtmd_context * ctx);
// whether the current model use M-RoPE for llama_decode
MTMD_API bool mtmd_decode_use_mrope(mtmd_context * ctx);
//
// helper functions (can be implemented based on other functions)
//
// helper to count the total number of tokens from a list of chunks, useful to keep track of n_past
// helper to count the total number of tokens from a list of chunks, useful to keep track of KV cache
MTMD_API size_t mtmd_helper_get_n_tokens(mtmd_input_chunks & chunks);
// helper to count the total position of tokens from a list of chunks, useful to keep track of n_past
MTMD_API llama_pos mtmd_helper_get_n_pos(mtmd_input_chunks & chunks);
// helper function that automatically:
// 1. run llama_decode() on text chunks
// 2. run mtmd_encode() on image chunks, then mtmd_get_output_embd() and then llama_decode()

View File

@@ -27,6 +27,8 @@
#include <cassert>
#include <cmath>
// THIS FILE IS ONLY USED FOR TESTING THE QWEN2VL MODEL
// IT IS NOT A PRODUCTION CODE
static bool qwen2vl_eval_image_embed(llama_context * ctx_llama, const struct llava_image_embed * image_embed,
int n_batch, int * n_past, int * st_pos_id, struct clip_image_size * image_size) {

View File

@@ -54,8 +54,8 @@ add_test "llama-mtmd-cli" "ibm-research/granite-vision-3.2-2b-GGUF:Q4_K_M"
add_test "llama-mtmd-cli" "second-state/MiniCPM-Llama3-V-2_5-GGUF:Q2_K" # model from openbmb is corrupted
add_test "llama-mtmd-cli" "openbmb/MiniCPM-V-2_6-gguf:Q2_K"
add_test "llama-mtmd-cli" "openbmb/MiniCPM-o-2_6-gguf:Q4_0"
add_test "llama-qwen2vl-cli" "bartowski/Qwen2-VL-2B-Instruct-GGUF:Q4_K_M"
add_test "llama-qwen2vl-cli" "ggml-org/Qwen2.5-VL-3B-Instruct-GGUF:Q4_K_M"
add_test "llama-mtmd-cli" "bartowski/Qwen2-VL-2B-Instruct-GGUF:Q4_K_M"
add_test "llama-mtmd-cli" "ggml-org/Qwen2.5-VL-3B-Instruct-GGUF:Q4_K_M"
# to test the big models, run: ./tests.sh big
add_test_big "llama-mtmd-cli" "ggml-org/pixtral-12b-GGUF:Q4_K_M"

View File

@@ -304,8 +304,9 @@ int main(int argc, char * argv[]) {
get_backend_memory(&free_mem, &total_mem);
}
const char * cache_dir = nullptr;
std::string cache_dir_str = fs_get_cache_directory() + "rpc/";
std::string cache_dir_str;
if (params.use_cache) {
cache_dir_str = fs_get_cache_directory() + "rpc/";
if (!fs_create_directory_with_parents(cache_dir_str)) {
fprintf(stderr, "Failed to create cache directory: %s\n", cache_dir_str.c_str());
return 1;

View File

@@ -642,9 +642,31 @@ static json oaicompat_completion_params_parse(
throw std::runtime_error("Cannot use custom grammar constraints with tools.");
}
// if the assistant message appears at the end of list, we do not add end-of-turn token
// for ex. this can be useful to modify the reasoning process in reasoning models
bool prefill_assistant_message = !inputs.messages.empty() && inputs.messages.back().role == "assistant";
common_chat_msg last_message;
if (prefill_assistant_message) {
last_message = inputs.messages.back();
inputs.messages.pop_back();
/* sanity check, max one assistant message at the end of the list */
if (!inputs.messages.empty() && inputs.messages.back().role == "assistant"){
throw std::runtime_error("Cannot have 2 or more assistant messages at the end of the list.");
}
inputs.extract_reasoning = false;
inputs.add_generation_prompt = true;
}
// Apply chat template to the list of messages
auto chat_params = common_chat_templates_apply(tmpls, inputs);
/* Append assistant prefilled message */
if (prefill_assistant_message) {
chat_params.prompt += last_message.content;
}
llama_params["chat_format"] = static_cast<int>(chat_params.format);
llama_params["prompt"] = chat_params.prompt;
if (!chat_params.grammar.empty()) {

View File

@@ -24,7 +24,7 @@ typedef std::unique_ptr<gguf_context, gguf_context_deleter> gguf_context_ptr;
struct ggml_gallocr_deleter { void operator()(ggml_gallocr_t galloc) { ggml_gallocr_free(galloc); } };
typedef std::unique_ptr<ggml_gallocr_t, ggml_gallocr_deleter> ggml_gallocr_ptr;
typedef std::unique_ptr<ggml_gallocr, ggml_gallocr_deleter> ggml_gallocr_ptr;
// ggml-backend

View File

@@ -352,10 +352,14 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
# TODO: Separation to determine activation of VX/VXE/VXE2
if (${S390X_M} MATCHES "8561|8562")
message(STATUS "z15 target")
list(APPEND ARCH_FLAGS -march=z15 -mtune=z15)
list(APPEND ARCH_FLAGS -march=z15)
elseif (${S390X_M} MATCHES "3931")
message(STATUS "z16 target")
list(APPEND ARCH_FLAGS -march=z16 -mtune=z16)
list(APPEND ARCH_FLAGS -march=z16)
elseif (${S390X_M} MATCHES "9175|9176")
# NOTE: Only available from GCC 15.1.0 onwards. Any z17 machine with compile issues must first verify their GCC version.
message(STATUS "z17 target")
list(APPEND ARCH_FLAGS -march=z17)
else()
message(STATUS "Unknown target")
message(WARNING "Unknown target. If you are compiling for z14 and earlier, you might have to add -DGGML_VXE=OFF.")

View File

@@ -341,7 +341,7 @@ static inline void __avx_f32cx8_store(ggml_fp16_t *x, __m256 y) {
#define GGML_F32_EPR 4
#define GGML_F32x4 vector float
#define GGML_F32x4_ZERO 0.0f
#define GGML_F32x4_ZERO {0.0f}
#define GGML_F32x4_SET1 vec_splats
#define GGML_F32x4_LOAD(p) vec_xl(0, p)
#define GGML_F32x4_STORE(p, r) vec_xst(r, 0, p)

View File

@@ -1,6 +1,8 @@
#include "convert.cuh"
#include "dequantize.cuh"
#include <cstdint>
#define CUDA_Q8_0_NE_ALIGN 2048
template <int qk, int qr, dequantize_kernel_t dequantize_kernel, typename dst_t>
@@ -570,30 +572,46 @@ static void dequantize_row_iq4_xs_cuda(const void * vx, dst_t * y, const int64_t
}
template <typename src_t, typename dst_t>
static __global__ void convert_unary(const void * __restrict__ vx, dst_t * __restrict__ y, const int64_t k) {
const int64_t i = (int64_t)blockDim.x*blockIdx.x + threadIdx.x;
static __global__ void convert_unary(
const void * __restrict__ vx, dst_t * __restrict__ y, const int64_t ne00, const int64_t ne01, const int64_t ne02,
const int64_t s01, const int64_t s02, const int64_t s03) {
const int64_t i00 = (int64_t)blockDim.x*blockIdx.x + threadIdx.x;
if (i >= k) {
if (i00 >= ne00) {
return;
}
const int64_t i01 = blockIdx.y;
const int64_t i02 = blockIdx.z % ne02;
const int64_t i03 = blockIdx.z / ne02;
const src_t * x = (const src_t *) vx;
y[i] = float(x[i]);
const int64_t ix = i03*s03 + i02*s02 + i01*s01 + i00;
const int64_t iy = ((i03*ne02 + i02)*ne01 + i01)*ne00 + i00;
y[iy] = float(x[ix]);
}
template <typename src_t, typename dst_t>
static void convert_unary_cuda(const void * __restrict__ vx, dst_t * __restrict__ y, const int64_t k, cudaStream_t stream) {
const int num_blocks = (k + CUDA_DEQUANTIZE_BLOCK_SIZE - 1) / CUDA_DEQUANTIZE_BLOCK_SIZE;
convert_unary<src_t><<<num_blocks, CUDA_DEQUANTIZE_BLOCK_SIZE, 0, stream>>>(vx, y, k);
static void convert_unary_cuda(const void * vx, dst_t * y,
const int64_t ne00, const int64_t ne01, const int64_t ne02, const int64_t ne03,
const int64_t s01, const int64_t s02, const int64_t s03, cudaStream_t stream) {
const dim3 num_blocks((ne00 + CUDA_DEQUANTIZE_BLOCK_SIZE - 1) / CUDA_DEQUANTIZE_BLOCK_SIZE, ne01, ne02*ne03);
convert_unary<src_t><<<num_blocks, CUDA_DEQUANTIZE_BLOCK_SIZE, 0, stream>>>
(vx, y, ne00, ne01, ne02, s01, s02, s03);
}
template <typename src_t, typename dst_t>
static void convert_unary_cont_cuda(const void * vx, dst_t * y, const int64_t k, cudaStream_t stream) {
convert_unary_cuda<src_t>(vx, y, k, 1, 1, 1, k, k, k, stream);
}
to_bf16_cuda_t ggml_get_to_bf16_cuda(ggml_type type) {
switch (type) {
case GGML_TYPE_F32:
return convert_unary_cuda<float>;
return convert_unary_cont_cuda<float>;
case GGML_TYPE_F16:
return convert_unary_cuda<half>;
return convert_unary_cont_cuda<half>;
default:
return nullptr;
}
@@ -643,9 +661,9 @@ to_fp16_cuda_t ggml_get_to_fp16_cuda(ggml_type type) {
case GGML_TYPE_IQ3_S:
return dequantize_row_iq3_s_cuda;
case GGML_TYPE_F32:
return convert_unary_cuda<float>;
return convert_unary_cont_cuda<float>;
case GGML_TYPE_BF16:
return convert_unary_cuda<nv_bfloat16>;
return convert_unary_cont_cuda<nv_bfloat16>;
default:
return nullptr;
}
@@ -692,7 +710,18 @@ to_fp32_cuda_t ggml_get_to_fp32_cuda(ggml_type type) {
case GGML_TYPE_IQ3_S:
return dequantize_row_iq3_s_cuda;
case GGML_TYPE_F16:
return convert_unary_cuda<half>;
return convert_unary_cont_cuda<half>;
case GGML_TYPE_BF16:
return convert_unary_cont_cuda<nv_bfloat16>;
default:
return nullptr;
}
}
to_fp16_nc_cuda_t ggml_get_to_fp16_nc_cuda(ggml_type type) {
switch (type) {
case GGML_TYPE_F32:
return convert_unary_cuda<float>;
case GGML_TYPE_BF16:
return convert_unary_cuda<nv_bfloat16>;
default:

View File

@@ -3,7 +3,7 @@
#define CUDA_DEQUANTIZE_BLOCK_SIZE 256
template<typename T>
using to_t_cuda_t = void (*)(const void * __restrict__ x, T * __restrict__ y, int64_t k, cudaStream_t stream);
using to_t_cuda_t = void (*)(const void * x, T * y, int64_t k, cudaStream_t stream);
typedef to_t_cuda_t<float> to_fp32_cuda_t;
typedef to_t_cuda_t<half> to_fp16_cuda_t;
@@ -14,3 +14,13 @@ to_fp16_cuda_t ggml_get_to_fp16_cuda(ggml_type type);
to_bf16_cuda_t ggml_get_to_bf16_cuda(ggml_type type);
to_fp32_cuda_t ggml_get_to_fp32_cuda(ggml_type type);
// TODO more general support for non-contiguous inputs
template<typename T>
using to_t_nc_cuda_t = void (*)(const void * x, T * y,
int64_t ne00, int64_t ne01, int64_t ne02, int64_t ne03,
int64_t s01, int64_t s02, int64_t s03, cudaStream_t stream);
typedef to_t_nc_cuda_t<half> to_fp16_nc_cuda_t;
to_fp16_nc_cuda_t ggml_get_to_fp16_nc_cuda(ggml_type type);

View File

@@ -592,6 +592,8 @@ void ggml_cuda_cpy(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, gg
dest_ptrs_d = ctx.cuda_graph->dest_ptrs_d;
graph_cpynode_index = ctx.cuda_graph->graph_cpynode_index;
}
#else
GGML_UNUSED(disable_indirection_for_this_node);
#endif
if (src0->type == src1->type && ggml_is_contiguous(src0) && ggml_is_contiguous(src1)) {
GGML_ASSERT(ggml_nbytes(src0) == ggml_nbytes(src1));

View File

@@ -33,8 +33,8 @@ static __global__ void k_get_rows(
dfloat2 v;
dequantize_kernel(src0_row, ib, iqs, v);
dst_row[iybs + iqs + 0] = v.x;
dst_row[iybs + iqs + y_offset] = v.y;
dst_row[iybs + iqs + 0] = float(v.x);
dst_row[iybs + iqs + y_offset] = float(v.y);
}
template<typename src0_t, typename dst_t>
@@ -60,7 +60,7 @@ static __global__ void k_get_rows_float(
dst_t * dst_row = dst + i10*s1 + i11*s2 + i12*s3;
const src0_t * src0_row = (const src0_t *)((const char *) src0 + i01*nb01 + i11*nb02 + i12*nb03);
dst_row[i00] = src0_row[i00];
dst_row[i00] = float(src0_row[i00]);
}
template<typename grad_t, typename dst_t>
@@ -86,120 +86,159 @@ static __global__ void k_get_rows_back_float(
dst[dst_row*ncols + col] = sum;
}
template<int qk, int qr, dequantize_kernel_t dq>
static void get_rows_cuda(
const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
const void * src0_dd, const int32_t * src1_dd, float * dst_dd, cudaStream_t stream) {
GGML_TENSOR_BINARY_OP_LOCALS
template<int qk, int qr, dequantize_kernel_t dq, typename dst_t>
static void get_rows_cuda_q(
const void * src0_d, const int32_t * src1_d, dst_t * dst_d,
const int64_t ne00, const size_t nb01, const size_t nb02, const size_t nb03,
const int64_t ne10, const int64_t ne11, const int64_t ne12, const size_t nb10, const size_t nb11, const size_t nb12,
const size_t nb1, const size_t nb2, const size_t nb3,
cudaStream_t stream) {
const dim3 block_dims(CUDA_GET_ROWS_BLOCK_SIZE, 1, 1);
const int block_num_x = (ne00 + 2*CUDA_GET_ROWS_BLOCK_SIZE - 1) / (2*CUDA_GET_ROWS_BLOCK_SIZE);
const dim3 block_nums(block_num_x, ne10, ne11*ne12);
// strides in elements
//const size_t s0 = nb0 / ggml_element_size(dst);
const size_t s1 = nb1 / ggml_element_size(dst);
const size_t s2 = nb2 / ggml_element_size(dst);
const size_t s3 = nb3 / ggml_element_size(dst);
// const size_t s0 = nb0 / sizeof(dst_t);
const size_t s1 = nb1 / sizeof(dst_t);
const size_t s2 = nb2 / sizeof(dst_t);
const size_t s3 = nb3 / sizeof(dst_t);
const size_t s10 = nb10 / ggml_element_size(src1);
const size_t s11 = nb11 / ggml_element_size(src1);
const size_t s12 = nb12 / ggml_element_size(src1);
//const size_t s13 = nb13 / ggml_element_size(src1);
const size_t s10 = nb10 / sizeof(int32_t);
const size_t s11 = nb11 / sizeof(int32_t);
const size_t s12 = nb12 / sizeof(int32_t);
// const size_t s13 = nb13 / sizeof(int32_t);
GGML_ASSERT(ne00 % 2 == 0);
k_get_rows<qk, qr, dq><<<block_nums, block_dims, 0, stream>>>(
src0_dd, src1_dd, dst_dd,
src0_d, src1_d, dst_d,
ne00, /*ne01, ne02, ne03,*/
/*ne10, ne11,*/ ne12, /*ne13,*/
/* s0,*/ s1, s2, s3,
/* nb00,*/ nb01, nb02, nb03,
s10, s11, s12/*, s13*/);
GGML_UNUSED(dst);
}
template<typename src0_t>
template<typename src0_t, typename dst_t>
static void get_rows_cuda_float(
const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
const src0_t * src0_dd, const int32_t * src1_dd, float * dst_dd, cudaStream_t stream) {
GGML_TENSOR_BINARY_OP_LOCALS
GGML_ASSERT(ne13 == 1);
const src0_t * src0_d, const int32_t * src1_d, dst_t * dst_d,
const int64_t ne00, const size_t nb01, const size_t nb02, const size_t nb03,
const int64_t ne10, const int64_t ne11, const int64_t ne12, const size_t nb10, const size_t nb11, const size_t nb12,
const size_t nb1, const size_t nb2, const size_t nb3,
cudaStream_t stream) {
const dim3 block_dims(CUDA_GET_ROWS_BLOCK_SIZE, 1, 1);
const int block_num_x = (ne00 + CUDA_GET_ROWS_BLOCK_SIZE - 1) / CUDA_GET_ROWS_BLOCK_SIZE;
const dim3 block_nums(block_num_x, ne10, ne11*ne12);
// strides in elements
//const size_t s0 = nb0 / ggml_element_size(dst);
const size_t s1 = nb1 / ggml_element_size(dst);
const size_t s2 = nb2 / ggml_element_size(dst);
const size_t s3 = nb3 / ggml_element_size(dst);
// const size_t s0 = nb0 / sizeof(dst_t);
const size_t s1 = nb1 / sizeof(dst_t);
const size_t s2 = nb2 / sizeof(dst_t);
const size_t s3 = nb3 / sizeof(dst_t);
const size_t s10 = nb10 / ggml_element_size(src1);
const size_t s11 = nb11 / ggml_element_size(src1);
const size_t s12 = nb12 / ggml_element_size(src1);
//const size_t s13 = nb13 / ggml_element_size(src1);
const size_t s10 = nb10 / sizeof(int32_t);
const size_t s11 = nb11 / sizeof(int32_t);
const size_t s12 = nb12 / sizeof(int32_t);
// const size_t s13 = nb13 / sizeof(int32_t);
k_get_rows_float<<<block_nums, block_dims, 0, stream>>>(
src0_dd, src1_dd, dst_dd,
src0_d, src1_d, dst_d,
ne00, /*ne01, ne02, ne03,*/
/*ne10, ne11,*/ ne12, /*ne13,*/
/* s0,*/ s1, s2, s3,
/* nb00,*/ nb01, nb02, nb03,
s10, s11, s12/*, s13*/);
}
GGML_UNUSED(dst);
template <typename dst_t>
static void ggml_cuda_get_rows_switch_src0_type(
const void * src0_d, const ggml_type src0_type, const int32_t * src1_d, dst_t * dst_d,
const int64_t ne00, const size_t nb01, const size_t nb02, const size_t nb03,
const int64_t ne10, const int64_t ne11, const int64_t ne12, const size_t nb10, const size_t nb11, const size_t nb12,
const size_t nb1, const size_t nb2, const size_t nb3,
cudaStream_t stream) {
switch (src0_type) {
case GGML_TYPE_F16:
get_rows_cuda_float((const half *) src0_d, src1_d, dst_d,
ne00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb1, nb2, nb3, stream);
break;
case GGML_TYPE_F32:
get_rows_cuda_float((const float *) src0_d, src1_d, dst_d,
ne00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb1, nb2, nb3, stream);
break;
case GGML_TYPE_BF16:
get_rows_cuda_float((const nv_bfloat16 *) src0_d, src1_d, dst_d,
ne00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb1, nb2, nb3, stream);
break;
case GGML_TYPE_Q4_0:
get_rows_cuda_q<QK4_0, QR4_0, dequantize_q4_0>(src0_d, src1_d, dst_d,
ne00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb1, nb2, nb3, stream);
break;
case GGML_TYPE_Q4_1:
get_rows_cuda_q<QK4_1, QR4_1, dequantize_q4_1>(src0_d, src1_d, dst_d,
ne00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb1, nb2, nb3, stream);
break;
case GGML_TYPE_Q5_0:
get_rows_cuda_q<QK5_0, QR5_0, dequantize_q5_0>(src0_d, src1_d, dst_d,
ne00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb1, nb2, nb3, stream);
break;
case GGML_TYPE_Q5_1:
get_rows_cuda_q<QK5_1, QR5_1, dequantize_q5_1>(src0_d, src1_d, dst_d,
ne00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb1, nb2, nb3, stream);
break;
case GGML_TYPE_Q8_0:
get_rows_cuda_q<QK8_0, QR8_0, dequantize_q8_0>(src0_d, src1_d, dst_d,
ne00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb1, nb2, nb3, stream);
break;
default:
// TODO: k-quants
GGML_ABORT("%s: unsupported src0 type: %s\n", __func__, ggml_type_name(src0_type));
break;
}
}
void get_rows_cuda(
const void * src0_d, ggml_type src0_type, const int32_t * src1_d, void * dst_d, ggml_type dst_type,
int64_t ne00, size_t nb01, size_t nb02, size_t nb03,
int64_t ne10, int64_t ne11, int64_t ne12, size_t nb10, size_t nb11, size_t nb12,
size_t nb1, size_t nb2, size_t nb3,
cudaStream_t stream) {
switch (dst_type) {
case GGML_TYPE_F32:
ggml_cuda_get_rows_switch_src0_type(src0_d, src0_type, src1_d, (float *) dst_d,
ne00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb1, nb2, nb3, stream);
break;
case GGML_TYPE_F16:
ggml_cuda_get_rows_switch_src0_type(src0_d, src0_type, src1_d, (half *) dst_d,
ne00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb1, nb2, nb3, stream);
break;
case GGML_TYPE_BF16:
ggml_cuda_get_rows_switch_src0_type(src0_d, src0_type, src1_d, (nv_bfloat16 *) dst_d,
ne00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb1, nb2, nb3, stream);
break;
default:
GGML_ABORT("%s: unsupported dst type: %s\n", __func__, ggml_type_name(dst_type));
break;
}
}
void ggml_cuda_op_get_rows(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
const ggml_tensor * src0 = dst->src[0];
const ggml_tensor * src1 = dst->src[1];
const void * src0_d = (const void *) src0->data;
const int32_t * src1_d = (const int32_t *) src1->data;
float * dst_d = (float *) dst->data;
cudaStream_t stream = ctx.stream();
GGML_TENSOR_BINARY_OP_LOCALS
GGML_ASSERT(src1->type == GGML_TYPE_I32);
GGML_ASSERT(dst->type == GGML_TYPE_F32);
GGML_ASSERT(ne13 == 1);
GGML_ASSERT(src0->nb[0] == ggml_type_size(src0->type));
GGML_ASSERT(src1->nb[0] == ggml_type_size(src1->type));
GGML_ASSERT(dst->nb[0] == ggml_type_size(dst->type));
switch (src0->type) {
case GGML_TYPE_F16:
get_rows_cuda_float(src0, src1, dst, (const half *) src0_d, src1_d, dst_d, stream);
break;
case GGML_TYPE_F32:
get_rows_cuda_float(src0, src1, dst, (const float *) src0_d, src1_d, dst_d, stream);
break;
case GGML_TYPE_Q4_0:
get_rows_cuda<QK4_0, QR4_0, dequantize_q4_0>(src0, src1, dst, src0_d, src1_d, dst_d, stream);
break;
case GGML_TYPE_Q4_1:
get_rows_cuda<QK4_1, QR4_1, dequantize_q4_1>(src0, src1, dst, src0_d, src1_d, dst_d, stream);
break;
case GGML_TYPE_Q5_0:
get_rows_cuda<QK5_0, QR5_0, dequantize_q5_0>(src0, src1, dst, src0_d, src1_d, dst_d, stream);
break;
case GGML_TYPE_Q5_1:
get_rows_cuda<QK5_1, QR5_1, dequantize_q5_1>(src0, src1, dst, src0_d, src1_d, dst_d, stream);
break;
case GGML_TYPE_Q8_0:
get_rows_cuda<QK8_0, QR8_0, dequantize_q8_0>(src0, src1, dst, src0_d, src1_d, dst_d, stream);
break;
default:
// TODO: k-quants
GGML_ABORT("%s: unsupported type: %s\n", __func__, ggml_type_name(src0->type));
break;
}
get_rows_cuda(src0->data, src0->type, (const int32_t *) src1->data, dst->data, dst->type,
ne00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb1, nb2, nb3, stream);
}
void ggml_cuda_op_get_rows_back(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {

View File

@@ -3,6 +3,13 @@
#define CUDA_GET_ROWS_BLOCK_SIZE 256
#define CUDA_GET_ROWS_BACK_BLOCK_SIZE 256
void get_rows_cuda(
const void * src0_d, ggml_type src0_type, const int32_t * src1_d, void * dst_d, ggml_type dst_type,
int64_t ne00, size_t nb01, size_t nb02, size_t nb03,
int64_t ne10, int64_t ne11, int64_t ne12, size_t nb10, size_t nb11, size_t nb12,
size_t nb1, size_t nb2, size_t nb3,
cudaStream_t stream);
void ggml_cuda_op_get_rows(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
void ggml_cuda_op_get_rows_back(ggml_backend_cuda_context & ctx, ggml_tensor * dst);

View File

@@ -1551,7 +1551,7 @@ static void ggml_cuda_op_mul_mat(
if (src1_on_device && src1_is_contiguous) {
quantize_src1(
dev[id].src1_ddf, dev[id].src1_ddq, src0->type, ne10,
dev[id].src1_ddf, nullptr, dev[id].src1_ddq, src0->type, ne10,
nb11/sizeof(float), nb12/sizeof(float), nb13/sizeof(float),
src1_padded_col_size, ne11, ne12, ne13, stream);
CUDA_CHECK(cudaGetLastError());
@@ -1649,7 +1649,7 @@ static void ggml_cuda_op_mul_mat(
if (quantize_src1 && !src1_is_contiguous) {
quantize_src1(
src1_ddf_i, src1_ddq_i, src0->type, ne10, ne10, ne11*ne10, ne12*ne11*ne10,
src1_ddf_i, nullptr, src1_ddq_i, src0->type, ne10, ne10, ne11*ne10, ne12*ne11*ne10,
src1_padded_col_size, src1_ncols, 1, 1, stream);
CUDA_CHECK(cudaGetLastError());
}
@@ -1720,15 +1720,15 @@ static __global__ void k_compute_batched_ptrs(
size_t nb12, size_t nb13,
size_t nbd2, size_t nbd3,
int64_t r2, int64_t r3) {
int64_t i13 = blockIdx.x * blockDim.x + threadIdx.x;
int64_t i12 = blockIdx.y * blockDim.y + threadIdx.y;
const int64_t i13 = blockIdx.x * blockDim.x + threadIdx.x;
const int64_t i12 = blockIdx.y * blockDim.y + threadIdx.y;
if (i13 >= ne13 || i12 >= ne12) {
return;
}
int64_t i03 = i13 / r3;
int64_t i02 = i12 / r2;
const int64_t i03 = i13 / r3;
const int64_t i02 = i12 / r2;
ptrs_src[0*ne23 + i12 + i13*ne12] = (const char *) src0_as_f16 + i02*nb02 + i03*nb03;
ptrs_src[1*ne23 + i12 + i13*ne12] = (const char *) src1_as_f16 + i12*nb12 + i13*nb13;
@@ -1742,6 +1742,10 @@ static void ggml_cuda_mul_mat_batched_cublas(ggml_backend_cuda_context & ctx, co
GGML_ASSERT(ggml_backend_buffer_is_cuda(src0->buffer));
GGML_ASSERT(src0->type == GGML_TYPE_F16);
// Byte offsets and tensor dimensions are currently used in an inconsistent way for dst.
// As long as dst is contiguous this does not matter though.
GGML_ASSERT(ggml_is_contiguous(dst));
GGML_TENSOR_BINARY_OP_LOCALS
const int64_t ne_dst = ggml_nelements(dst);
@@ -1750,21 +1754,31 @@ static void ggml_cuda_mul_mat_batched_cublas(ggml_backend_cuda_context & ctx, co
CUBLAS_CHECK(cublasSetStream(ctx.cublas_handle(), main_stream));
void * src0_ddq = src0->data;
half * src0_f16 = (half *) src0_ddq;
float * src1_ddf = (float *) src1->data;
float * dst_ddf = (float *) dst->data;
const half * src0_f16 = (const half *) src0->data;
float * dst_ddf = (float *) dst->data;
const half * src1_f16 = (const half *) src1->data;
const size_t ts_src1 = ggml_type_size(src1->type);
GGML_ASSERT(nb10 == ts_src1);
int64_t s11 = nb11 / ts_src1;
int64_t s12 = nb12 / ts_src1;
int64_t s13 = nb13 / ts_src1;
ggml_cuda_pool_alloc<half> src1_f16_alloc(ctx.pool());
// convert src1 to fp16
ggml_cuda_pool_alloc<half> src1_f16_alloc(ctx.pool());
if (src1->type != GGML_TYPE_F16) {
const to_fp16_cuda_t to_fp16_cuda = ggml_get_to_fp16_cuda(src1->type);
const to_fp16_nc_cuda_t to_fp16_cuda = ggml_get_to_fp16_nc_cuda(src1->type);
const int64_t ne_src1 = ggml_nelements(src1);
src1_f16_alloc.alloc(ne_src1);
GGML_ASSERT(to_fp16_cuda != nullptr);
to_fp16_cuda(src1_ddf, src1_f16_alloc.get(), ne_src1, main_stream);
to_fp16_cuda(src1_f16, src1_f16_alloc.get(), ne10, ne11, ne12, ne13, s11, s12, s13, main_stream);
src1_f16 = src1_f16_alloc.get();
s11 = ne10;
s12 = ne11*s11;
s13 = ne12*s12;
}
half * src1_f16 = src1->type == GGML_TYPE_F16 ? (half *) src1_ddf : src1_f16_alloc.get();
ggml_cuda_pool_alloc<half> dst_f16(ctx.pool());
char * dst_t;
@@ -1824,13 +1838,13 @@ static void ggml_cuda_mul_mat_batched_cublas(ggml_backend_cuda_context & ctx, co
int i02 = i12 / r2;
CUBLAS_CHECK(
cublasGemmEx(g_cublas_handles[g_main_device], CUBLAS_OP_T, CUBLAS_OP_N,
ne01, ne11, ne10,
alpha, (const char *) src0_as_f16 + i02*src0->nb[2] + i03*src0->nb[3] , CUDA_R_16F, nb01/sizeof(half),
(const char *) src1_as_f16 + i12*src1->nb[2]/2 + i13*src1->nb[3]/2, CUDA_R_16F, nb11/sizeof(float),
beta, ( char *) dst_t + i12*nbd2 + i13*nbd3, cu_data_type, ne01,
cu_compute_type,
CUBLAS_GEMM_DEFAULT_TENSOR_OP));
cublasGemmEx(ctx.cublas_handle(), CUBLAS_OP_T, CUBLAS_OP_N,
ne01, ne11, ne10,
alpha, (const char *) src0_f16 + i03*nb03 + i02*nb02, CUDA_R_16F, nb01/sizeof(half),
src1_f16 + i13*s13 + i12*s12, CUDA_R_16F, s11,
beta, ( char *) dst_t + i13*nbd3 + i12*nbd2, cu_data_type, ne0,
cu_compute_type,
CUBLAS_GEMM_DEFAULT_TENSOR_OP));
}
}
}
@@ -1841,15 +1855,15 @@ static void ggml_cuda_mul_mat_batched_cublas(ggml_backend_cuda_context & ctx, co
CUBLAS_CHECK(
cublasGemmStridedBatchedEx(ctx.cublas_handle(), CUBLAS_OP_T, CUBLAS_OP_N,
ne01, ne11, ne10,
alpha, (const char *) src0_f16, CUDA_R_16F, nb01/nb00, nb02/nb00, // strideA
(const char *) src1_f16, CUDA_R_16F, nb11/nb10, nb12/nb10, // strideB
beta, ( char *) dst_t, cu_data_type, ne01, nb2/nb0, // strideC
alpha, src0_f16, CUDA_R_16F, nb01/nb00, nb02/nb00, // strideA
src1_f16, CUDA_R_16F, s11, s12, // strideB
beta, dst_t, cu_data_type, ne0, ne1*ne0, // strideC
ne12*ne13,
cu_compute_type,
CUBLAS_GEMM_DEFAULT_TENSOR_OP));
} else {
// use cublasGemmBatchedEx
const int ne23 = ne12*ne13;
const int64_t ne23 = ne12*ne13;
ggml_cuda_pool_alloc<const void *> ptrs_src(ctx.pool(), 2*ne23);
ggml_cuda_pool_alloc< void *> ptrs_dst(ctx.pool(), 1*ne23);
@@ -1861,8 +1875,8 @@ static void ggml_cuda_mul_mat_batched_cublas(ggml_backend_cuda_context & ctx, co
ne12, ne13,
ne23,
nb02, nb03,
src1->type == GGML_TYPE_F16 ? nb12 : nb12/2,
src1->type == GGML_TYPE_F16 ? nb13 : nb13/2,
src1->type == GGML_TYPE_F16 ? nb12 : s12*sizeof(half),
src1->type == GGML_TYPE_F16 ? nb13 : s13*sizeof(half),
nbd2, nbd3,
r2, r3);
CUDA_CHECK(cudaGetLastError());
@@ -1871,8 +1885,8 @@ static void ggml_cuda_mul_mat_batched_cublas(ggml_backend_cuda_context & ctx, co
cublasGemmBatchedEx(ctx.cublas_handle(), CUBLAS_OP_T, CUBLAS_OP_N,
ne01, ne11, ne10,
alpha, (const void **) (ptrs_src.get() + 0*ne23), CUDA_R_16F, nb01/nb00,
(const void **) (ptrs_src.get() + 1*ne23), CUDA_R_16F, nb11/nb10,
beta, ( void **) (ptrs_dst.get() + 0*ne23), cu_data_type, ne01,
(const void **) (ptrs_src.get() + 1*ne23), CUDA_R_16F, s11,
beta, ( void **) (ptrs_dst.get() + 0*ne23), cu_data_type, ne0,
ne23,
cu_compute_type,
CUBLAS_GEMM_DEFAULT_TENSOR_OP));
@@ -1935,8 +1949,10 @@ static void ggml_cuda_mul_mat(ggml_backend_cuda_context & ctx, const ggml_tensor
ggml_cuda_mul_mat_vec(ctx, src0, src1, nullptr, dst);
} else if (!split && use_mul_mat_vec_q) {
ggml_cuda_mul_mat_vec_q(ctx, src0, src1, nullptr, dst);
} else if (!split && use_mul_mat_q) {
ggml_cuda_mul_mat_q(ctx, src0, src1, nullptr, dst);
} else if (!split && src0->type == GGML_TYPE_F16 && (src1->type == GGML_TYPE_F16 || !any_gpus_with_slow_fp16) &&
dst->op_params[0] == GGML_PREC_DEFAULT && !ggml_is_transposed(src0) && !ggml_is_transposed(src1) && src1->ne[2]*src1->ne[3] > 1) {
!ggml_is_transposed(src0) && !ggml_is_transposed(src1) && src1->ne[2]*src1->ne[3] > 1) {
// general KQ + KQV multi-batch without FlashAttention
ggml_cuda_mul_mat_batched_cublas(ctx, src0, src1, dst);
} else if (use_mul_mat_vec) {
@@ -1950,183 +1966,145 @@ static void ggml_cuda_mul_mat(ggml_backend_cuda_context & ctx, const ggml_tensor
}
}
struct mmid_row_mapping {
int32_t i1;
int32_t i2;
};
static __global__ void k_copy_src1_to_contiguous(const char * __restrict__ src1_original, char * __restrict__ src1_contiguous,
int * __restrict__ cur_src1_row, mmid_row_mapping * __restrict__ row_mapping,
const char * __restrict ids, int64_t i02, size_t ids_nb1, size_t ids_nb0,
int64_t ne11, int64_t ne10,
size_t nb11, size_t nb12) {
int32_t iid1 = blockIdx.x;
int32_t id = blockIdx.y;
const int32_t row_id_i = *(const int32_t *) (ids + iid1*ids_nb1 + id*ids_nb0);
if (row_id_i != i02) {
return;
}
const int64_t i11 = id % ne11;
const int64_t i12 = iid1;
__shared__ int src1_row;
if (threadIdx.x == 0) {
src1_row = atomicAdd(cur_src1_row, 1);
row_mapping[src1_row] = {id, iid1};
}
__syncthreads();
const float * src1_row_original = (const float *)(src1_original + i11*nb11 + i12*nb12);
float * src1_row_contiguous = (float *)(src1_contiguous + src1_row*nb11);
for (int i = threadIdx.x; i < ne10; i += blockDim.x) {
src1_row_contiguous[i] = src1_row_original[i];
}
}
static __global__ void k_copy_dst_from_contiguous(char * __restrict__ dst_original, const char * __restrict__ dst_contiguous,
const mmid_row_mapping * __restrict__ row_mapping,
int64_t ne0,
size_t nb1, size_t nb2) {
int32_t i = blockIdx.x;
const int32_t i1 = row_mapping[i].i1;
const int32_t i2 = row_mapping[i].i2;
const float * dst_row_contiguous = (const float *)(dst_contiguous + i*nb1);
float * dst_row_original = (float *)(dst_original + i1*nb1 + i2*nb2);
for (int j = threadIdx.x; j < ne0; j += blockDim.x) {
dst_row_original[j] = dst_row_contiguous[j];
}
}
static void ggml_cuda_mul_mat_id(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
const ggml_tensor * src0 = dst->src[0];
const ggml_tensor * src1 = dst->src[1];
const ggml_tensor * ids = dst->src[2];
GGML_ASSERT(src1->type == GGML_TYPE_F32);
GGML_ASSERT(dst->type == GGML_TYPE_F32);
GGML_ASSERT(!ggml_backend_buft_is_cuda_split(src0->buffer->buft) && "mul_mat_id does not support split buffers");
GGML_TENSOR_BINARY_OP_LOCALS
if (src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32 && ne2 == 1) {
if (ggml_is_quantized(src0->type)) {
ggml_cuda_mul_mat_vec_q(ctx, src0, src1, ids, dst);
} else {
ggml_cuda_mul_mat_vec(ctx, src0, src1, ids, dst);
}
return;
}
const int cc = ggml_cuda_info().devices[ggml_cuda_get_device()].cc;
GGML_ASSERT(!ggml_backend_buft_is_cuda_split(src0->buffer->buft) && "mul_mat_id does not support split buffers");
if (src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
if (ne2 == 1) {
if (ggml_is_quantized(src0->type)) {
ggml_cuda_mul_mat_vec_q(ctx, src0, src1, ids, dst);
} else {
ggml_cuda_mul_mat_vec(ctx, src0, src1, ids, dst);
}
return;
}
if (ggml_cuda_should_use_mmq(src0->type, cc, ne12)) {
ggml_cuda_mul_mat_q(ctx, src0, src1, ids, dst);
return;
}
}
cudaStream_t stream = ctx.stream();
const int64_t n_as = ne02;
const int64_t n_ids = ids->ne[0];
GGML_ASSERT(nb12 % nb11 == 0);
GGML_ASSERT(nb2 % nb1 == 0);
const ggml_type type_src1_sorted = (src0->type == GGML_TYPE_F16 && !fast_fp16_hardware_available(cc))
|| ggml_is_quantized(src0->type) ? GGML_TYPE_F32 : src0->type;
const ggml_type type_dst_sorted = GGML_TYPE_F32;
const size_t ts_src1_sorted = ggml_type_size(type_src1_sorted);
const size_t ts_dst_sorted = ggml_type_size(type_dst_sorted);
const int64_t n_expert_used = ids->ne[0];
const int64_t ne_get_rows = ne12 * n_expert_used;
std::vector<int32_t> ids_to_sorted_host;
ids_to_sorted_host.reserve(2*ne_get_rows);
std::vector<int32_t> ids_from_sorted_host(ne_get_rows);
ggml_cuda_pool_alloc<int32_t> ids_buf_dev(ctx.pool(), 2*ne_get_rows);
std::vector<int32_t> tokens_per_expert(ne02);
ggml_cuda_pool_alloc<char> src1_sorted(ctx.pool(), ne12*n_expert_used*ne10*ts_src1_sorted);
ggml_cuda_pool_alloc<char> dst_sorted(ctx.pool(), ne2 *n_expert_used* ne0*ts_dst_sorted);
std::vector<char> ids_host(ggml_nbytes(ids));
const char * ids_dev = (const char *) ids->data;
CUDA_CHECK(cudaMemcpyAsync(ids_host.data(), ids_dev, ggml_nbytes(ids), cudaMemcpyDeviceToHost, stream));
CUDA_CHECK(cudaMemcpyAsync(ids_host.data(), ids->data, ggml_nbytes(ids), cudaMemcpyDeviceToHost, stream));
CUDA_CHECK(cudaStreamSynchronize(stream));
ggml_tensor src0_row = *src0;
ggml_tensor src1_row = *src1;
ggml_tensor dst_row = *dst;
char * src0_original = (char *) src0->data;
char * src1_original = (char *) src1->data;
char * dst_original = (char *) dst->data;
src0_row.ne[2] = 1;
src0_row.ne[3] = 1;
src0_row.nb[3] = nb02;
src1_row.ne[1] = 1;
src1_row.ne[2] = 1;
src1_row.ne[3] = 1;
src1_row.nb[2] = nb11;
src1_row.nb[3] = nb11;
dst_row.ne[1] = 1;
dst_row.ne[2] = 1;
dst_row.ne[3] = 1;
dst_row.nb[2] = nb1;
dst_row.nb[3] = nb1;
ggml_cuda_pool_alloc<char> src1_contiguous(ctx.pool(), sizeof(float)*ggml_nelements(src1));
ggml_cuda_pool_alloc<char> dst_contiguous(ctx.pool(), sizeof(float)*ggml_nelements(dst));
src1_row.data = src1_contiguous.get();
dst_row.data = dst_contiguous.get();
for (int64_t i02 = 0; i02 < n_as; i02++) {
int64_t num_src1_rows = 0;
for (int64_t iid1 = 0; iid1 < ids->ne[1]; iid1++) {
for (int64_t id = 0; id < n_ids; id++) {
const int32_t row_id_i = *(const int32_t *) (ids_host.data() + iid1*ids->nb[1] + id*ids->nb[0]);
GGML_ASSERT(row_id_i >= 0 && row_id_i < n_as);
if (row_id_i != i02) {
continue;
for (int64_t i02 = 0; i02 < ne02; ++i02) { // expert matrices
for (int64_t i12 = 0; i12 < ne12; ++i12) { // tokens
for (int64_t iex = 0; iex < n_expert_used; ++iex) {
const int32_t expert_to_use = *(const int32_t *)(ids_host.data() + i12*ids->nb[1] + iex*ids->nb[0]);
assert(expert_to_use >= 0 && expert_to_use < ne02);
if (expert_to_use == i02) {
ids_from_sorted_host[i12*n_expert_used + iex] = ids_to_sorted_host.size();
ids_to_sorted_host.push_back(i12*ne11 + iex % ne11);
tokens_per_expert[i02]++;
break;
}
num_src1_rows++;
}
}
}
GGML_ASSERT(ids_to_sorted_host.size() == size_t(ne_get_rows));
if (num_src1_rows == 0) {
ids_to_sorted_host.insert(ids_to_sorted_host.end(), ids_from_sorted_host.begin(), ids_from_sorted_host.end());
CUDA_CHECK(cudaMemcpyAsync(ids_buf_dev.ptr, ids_to_sorted_host.data(), 2*ne_get_rows*sizeof(int32_t), cudaMemcpyHostToDevice, stream));
CUDA_CHECK(cudaStreamSynchronize(stream));
const int32_t * ids_to_sorted = ids_buf_dev.ptr + 0*ne_get_rows;
const int32_t * ids_from_sorted = ids_buf_dev.ptr + 1*ne_get_rows;
get_rows_cuda(src1->data, src1->type, ids_to_sorted, src1_sorted.ptr, type_src1_sorted,
ne10, nb11, nb12, nb13,
ne_get_rows, 1, 1, sizeof(int32_t), ne_get_rows*sizeof(int32_t), ne_get_rows*sizeof(int32_t),
ne10*ts_src1_sorted, ne_get_rows*ne10*ts_src1_sorted, ne_get_rows*ne10*ts_src1_sorted, stream);
CUDA_CHECK(cudaGetLastError());
char * src1_data_cur = (char *) src1_sorted.ptr;
char * dst_data_cur = (char *) dst_sorted.ptr;
for (int64_t i02 = 0; i02 < ne02; ++i02) {
if (tokens_per_expert[i02] == 0) {
continue;
}
ggml_cuda_pool_alloc<int> dev_cur_src1_row(ctx.pool(), 1);
ggml_cuda_pool_alloc<mmid_row_mapping> dev_row_mapping(ctx.pool(), num_src1_rows);
CUDA_CHECK(cudaMemsetAsync(dev_cur_src1_row.get(), 0, sizeof(int), stream));
ggml_tensor src0_slice = *src0;
src0_slice.ne[2] = 1;
src0_slice.nb[3] = src0_slice.nb[2];
src0_slice.data = (char *) src0->data + i02*nb02;
{
dim3 block_dims(std::min((unsigned int)ne10, 768u));
dim3 grid_dims(ids->ne[1], n_ids);
k_copy_src1_to_contiguous<<<grid_dims, block_dims, 0, stream>>>(
src1_original, src1_contiguous.get(),
dev_cur_src1_row.get(), dev_row_mapping.get(),
ids_dev, i02, ids->nb[1], ids->nb[0],
ne11, ne10,
nb11, nb12);
CUDA_CHECK(cudaGetLastError());
}
ggml_tensor src1_slice;
memset(&src1_slice, 0, sizeof(src1_slice));
src1_slice.buffer = src1->buffer;
src1_slice.type = type_src1_sorted;
src1_slice.ne[0] = ne10;
src1_slice.ne[1] = tokens_per_expert[i02];
src1_slice.ne[2] = 1;
src1_slice.ne[3] = 1;
src1_slice.nb[0] = ts_src1_sorted;
src1_slice.nb[1] = src1_slice.ne[0] * src1_slice.nb[0];
src1_slice.nb[2] = src1_slice.ne[1] * src1_slice.nb[1];
src1_slice.nb[3] = src1_slice.ne[2] * src1_slice.nb[2];
src1_slice.data = src1_data_cur;
src0_row.data = src0_original + i02*nb02;
ggml_tensor dst_slice;
memset(&dst_slice, 0, sizeof(dst_slice));
dst_slice.buffer = dst->buffer;
dst_slice.type = type_dst_sorted;
dst_slice.ne[0] = ne0;
dst_slice.ne[1] = tokens_per_expert[i02];
dst_slice.ne[2] = 1;
dst_slice.ne[3] = 1;
dst_slice.nb[0] = ts_dst_sorted;
dst_slice.nb[1] = dst_slice.ne[0] * dst_slice.nb[0];
dst_slice.nb[2] = dst_slice.ne[1] * dst_slice.nb[1];
dst_slice.nb[3] = dst_slice.ne[2] * dst_slice.nb[2];
dst_slice.data = dst_data_cur;
GGML_ASSERT(nb11 == sizeof(float)*ne10);
GGML_ASSERT(nb1 == sizeof(float)*ne0);
ggml_cuda_mul_mat(ctx, &src0_slice, &src1_slice, &dst_slice);
CUDA_CHECK(cudaGetLastError());
src1_row.ne[1] = num_src1_rows;
src1_row.nb[1] = nb11;
src1_row.nb[2] = num_src1_rows*nb11;
src1_row.nb[3] = num_src1_rows*nb11;
dst_row.ne[1] = num_src1_rows;
dst_row.nb[1] = nb1;
dst_row.nb[2] = num_src1_rows*nb1;
dst_row.nb[3] = num_src1_rows*nb1;
ggml_cuda_mul_mat(ctx, &src0_row, &src1_row, &dst_row);
{
dim3 block_dims(std::min((unsigned int)ne0, 768u));
dim3 grid_dims(num_src1_rows);
k_copy_dst_from_contiguous<<<grid_dims, block_dims, 0, stream>>>(
dst_original, dst_contiguous.get(),
dev_row_mapping.get(),
ne0,
nb1, nb2);
CUDA_CHECK(cudaGetLastError());
}
src1_data_cur += src1_slice.nb[2];
dst_data_cur += dst_slice.nb[2];
}
get_rows_cuda(dst_sorted.ptr, type_dst_sorted, ids_from_sorted, dst->data, dst->type,
ne0, ne0*ts_dst_sorted, ne_get_rows*ne0*ts_dst_sorted, ne_get_rows*ne0*ts_dst_sorted,
ne_get_rows, 1, 1, sizeof(int32_t), ne_get_rows*sizeof(int32_t), ne_get_rows*sizeof(int32_t),
nb1, nb2, nb3, stream);
}
static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct ggml_tensor * dst) {

View File

@@ -1,37 +1,10 @@
#include "mmq.cuh"
#include "quantize.cuh"
void ggml_cuda_op_mul_mat_q(
ggml_backend_cuda_context & ctx,
const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, const char * src0_dd_i, const float * src1_ddf_i,
const char * src1_ddq_i, float * dst_dd_i, const int64_t row_low, const int64_t row_high, const int64_t src1_ncols,
const int64_t src1_padded_row_size, cudaStream_t stream) {
#include <vector>
const int64_t ne00 = src0->ne[0];
const int64_t ne10 = src1->ne[0];
const int64_t ne11 = src1->ne[1];
GGML_ASSERT(ne10 % QK8_1 == 0);
const int64_t ne0 = dst->ne[0];
const int64_t row_diff = row_high - row_low;
const int64_t stride00 = ne00 / ggml_blck_size(src0->type);
int id = ggml_cuda_get_device();
const int cc = ggml_cuda_info().devices[id].cc;
// the main device has a larger memory buffer to hold the results from all GPUs
// nrows_dst == nrows of the matrix that the kernel writes into
const int64_t nrows_dst = id == ctx.device ? ne0 : row_diff;
// The stream-k decomposition is only faster for recent NVIDIA GPUs.
// Also its fixup needs to allocate a temporary buffer in the memory pool.
// There are multiple parallel CUDA streams for src1_ncols != ne11 which would introduce a race condition for this buffer.
const bool use_stream_k = GGML_CUDA_CC_IS_NVIDIA(cc) &&
ggml_cuda_highest_compiled_arch(cc) >= GGML_CUDA_CC_VOLTA && src1_ncols == ne11;
const mmq_args args = {src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stride00, src1_padded_row_size, src1_ncols, ne11, nrows_dst, use_stream_k};
switch (src0->type) {
static void ggml_cuda_mul_mat_q_switch_type(ggml_backend_cuda_context & ctx, const mmq_args & args, cudaStream_t stream) {
switch (args.type_x) {
case GGML_TYPE_Q4_0:
mul_mat_q_case<GGML_TYPE_Q4_0>(ctx, args, stream);
break;
@@ -90,10 +63,195 @@ void ggml_cuda_op_mul_mat_q(
GGML_ABORT("fatal error");
break;
}
}
void ggml_cuda_mul_mat_q(
ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * ids, ggml_tensor * dst) {
GGML_ASSERT( src1->type == GGML_TYPE_F32);
GGML_ASSERT( dst->type == GGML_TYPE_F32);
GGML_ASSERT(!ids || ids->type == GGML_TYPE_I32); // Optional, used for batched GGML_MUL_MAT_ID.
GGML_TENSOR_BINARY_OP_LOCALS;
cudaStream_t stream = ctx.stream();
const int cc = ggml_cuda_info().devices[ggml_cuda_get_device()].cc;
const size_t ts_src0 = ggml_type_size(src0->type);
const size_t ts_src1 = ggml_type_size(src1->type);
const size_t ts_dst = ggml_type_size(dst->type);
GGML_ASSERT( nb00 == ts_src0);
GGML_ASSERT( nb10 == ts_src1);
GGML_ASSERT( nb0 == ts_dst);
GGML_ASSERT(!ids || ids->nb[0] == ggml_type_size(ids->type));
const char * src0_d = (const char *) src0->data;
const float * src1_d = (const float *) src1->data;
float * dst_d = (float *) dst->data;
const int64_t ne10_padded = GGML_PAD(ne10, MATRIX_ROW_PADDING);
const int64_t s01 = src0->nb[1] / ts_src0;
const int64_t s1 = dst->nb[1] / ts_dst;
const int64_t s02 = src0->nb[2] / ts_src0;
const int64_t s2 = dst->nb[2] / ts_dst;
const int64_t s03 = src0->nb[3] / ts_src0;
const int64_t s3 = dst->nb[3] / ts_dst;
const bool use_stream_k = GGML_CUDA_CC_IS_NVIDIA(cc) && ggml_cuda_highest_compiled_arch(cc) >= GGML_CUDA_CC_VOLTA;
if (!ids) {
const size_t nbytes_src1_q8_1 = ne13*ne12 * ne11*ne10_padded * sizeof(block_q8_1)/QK8_1 +
get_mmq_x_max_host(cc)*sizeof(block_q8_1_mmq);
ggml_cuda_pool_alloc<char> src1_q8_1(ctx.pool(), nbytes_src1_q8_1);
{
const int64_t s11 = src1->nb[1] / ts_src1;
const int64_t s12 = src1->nb[2] / ts_src1;
const int64_t s13 = src1->nb[3] / ts_src1;
quantize_mmq_q8_1_cuda(src1_d, nullptr, src1_q8_1.get(), src0->type,
ne10, s11, s12, s13, ne10_padded, ne11, ne12, ne13, stream);
}
const int64_t s12 = ne11*ne10_padded * sizeof(block_q8_1)/(QK8_1*sizeof(int));
const int64_t s13 = ne12*s12;
const mmq_args args = {
src0_d, src0->type, (const int *) src1_q8_1.ptr, nullptr, nullptr, dst_d,
ne00, ne01, ne1, s01, s1,
ne02, ne12, s02, s12, s2,
ne03, ne13, s03, s13, s3,
use_stream_k};
ggml_cuda_mul_mat_q_switch_type(ctx, args, stream);
return;
}
GGML_ASSERT(ne13 == 1);
GGML_ASSERT(nb12 % nb11 == 0);
GGML_ASSERT(nb2 % nb1 == 0);
const int64_t n_expert_used = ids->ne[0];
const int64_t ne_get_rows = ne12 * n_expert_used;
std::vector<char> ids_host(ggml_nbytes(ids));
std::vector<int32_t> ids_src1_host;
ids_src1_host.reserve(ne_get_rows);
std::vector<int32_t> ids_dst_host;
ids_dst_host.reserve(ne_get_rows);
std::vector<int32_t> tokens_per_expert_host(ne02);
std::vector<int32_t> expert_bounds_host(ne02 + 1);
ggml_cuda_pool_alloc<int32_t> ids_buf_dev(ctx.pool());
CUDA_CHECK(cudaMemcpyAsync(ids_host.data(), ids->data, ggml_nbytes(ids), cudaMemcpyDeviceToHost, stream));
CUDA_CHECK(cudaStreamSynchronize(stream));
for (int64_t i02 = 0; i02 < ne02; ++i02) { // expert matrices
for (int64_t i12 = 0; i12 < ne12; ++i12) { // tokens
for (int64_t iex = 0; iex < n_expert_used; ++iex) {
const int32_t expert_to_use = *(const int32_t *)(ids_host.data() + i12*ids->nb[1] + iex*ids->nb[0]);
assert(expert_to_use >= 0 && expert_to_use < ne02);
if (expert_to_use == i02) {
ids_src1_host.push_back(i12*(nb12/nb11) + iex % ne11);
ids_dst_host.push_back(i12*ne1 + iex);
tokens_per_expert_host[i02]++;
break;
}
}
}
}
int32_t cumsum = 0;
for (int64_t i = 0; i < ne02; ++i) {
expert_bounds_host[i] = cumsum;
cumsum += tokens_per_expert_host[i];
}
expert_bounds_host[ne02] = cumsum;
std::vector<int32_t> ids_buf_host;
ids_buf_host.reserve(ids_src1_host.size() + ids_dst_host.size() + expert_bounds_host.size());
ids_buf_host.insert(ids_buf_host.end(), ids_src1_host.begin(), ids_src1_host.end());
ids_buf_host.insert(ids_buf_host.end(), ids_dst_host.begin(), ids_dst_host.end());
ids_buf_host.insert(ids_buf_host.end(), expert_bounds_host.begin(), expert_bounds_host.end());
ids_buf_dev.alloc(ids_buf_host.size() + get_mmq_x_max_host(cc)); // Expert bounds are padded on device.
CUDA_CHECK(cudaMemcpyAsync(ids_buf_dev.ptr, ids_buf_host.data(), ids_buf_host.size()*sizeof(int32_t), cudaMemcpyHostToDevice, stream));
CUDA_CHECK(cudaStreamSynchronize(stream));
const int32_t * ids_src1_dev = ids_buf_dev.ptr;
const int32_t * ids_dst_dev = ids_src1_dev + ids_src1_host.size();
const int32_t * expert_bounds_dev = ids_dst_dev + ids_dst_host.size();
const size_t nbytes_src1_q8_1 = ne12*n_expert_used*ne10_padded * sizeof(block_q8_1)/QK8_1 +
get_mmq_x_max_host(cc)*sizeof(block_q8_1_mmq);
ggml_cuda_pool_alloc<char> src1_q8_1(ctx.pool(), nbytes_src1_q8_1);
const int64_t ne11_flat = ne12*n_expert_used;
const int64_t ne12_flat = 1;
const int64_t ne13_flat = 1;
{
const int64_t s11 = src1->nb[1] / ts_src1;
const int64_t s12 = src1->nb[2] / ts_src1;
const int64_t s13 = src1->nb[2] / ts_src1;
quantize_mmq_q8_1_cuda(src1_d, ids_src1_dev, src1_q8_1.get(), src0->type,
ne10, s11, s12, s13, ne10_padded, ne11_flat, ne12_flat, ne13_flat, stream);
}
const int64_t s12 = ne11*ne10_padded * sizeof(block_q8_1)/(QK8_1*sizeof(int));
const int64_t s13 = ne12*s12;
// Note that ne02 is used instead of ne12 because the number of y channels determines the z dimension of the CUDA grid.
const mmq_args args = {
src0_d, src0->type, (const int *) src1_q8_1.ptr, ids_dst_dev, expert_bounds_dev, dst_d,
ne00, ne01, ne_get_rows, s01, s1,
ne02, ne02, s02, s12, s2,
ne03, ne13, s03, s13, s3,
use_stream_k};
ggml_cuda_mul_mat_q_switch_type(ctx, args, stream);
}
void ggml_cuda_op_mul_mat_q(
ggml_backend_cuda_context & ctx,
const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, const char * src0_dd_i, const float * src1_ddf_i,
const char * src1_ddq_i, float * dst_dd_i, const int64_t row_low, const int64_t row_high, const int64_t src1_ncols,
const int64_t src1_padded_row_size, cudaStream_t stream) {
const int64_t ne00 = src0->ne[0];
const int64_t ne10 = src1->ne[0];
const int64_t ne11 = src1->ne[1];
GGML_ASSERT(ne10 % QK8_1 == 0);
const int64_t ne0 = dst->ne[0];
const int64_t row_diff = row_high - row_low;
const int64_t stride01 = ne00 / ggml_blck_size(src0->type);
const int id = ggml_cuda_get_device();
const int cc = ggml_cuda_info().devices[id].cc;
// the main device has a larger memory buffer to hold the results from all GPUs
// nrows_dst == nrows of the matrix that the kernel writes into
const int64_t nrows_dst = id == ctx.device ? ne0 : row_diff;
// The stream-k decomposition is only faster for recent NVIDIA GPUs.
// Also its fixup needs to allocate a temporary buffer in the memory pool.
// There are multiple parallel CUDA streams for src1_ncols != ne11 which would introduce a race condition for this buffer.
const bool use_stream_k = GGML_CUDA_CC_IS_NVIDIA(cc) &&
ggml_cuda_highest_compiled_arch(cc) >= GGML_CUDA_CC_VOLTA && src1_ncols == ne11;
const mmq_args args = {
src0_dd_i, src0->type, (const int *) src1_ddq_i, nullptr, nullptr, dst_dd_i,
ne00, row_diff, src1_ncols, stride01, nrows_dst,
1, 1, 0, 0, 0,
1, 1, 0, 0, 0,
use_stream_k};
ggml_cuda_mul_mat_q_switch_type(ctx, args, stream);
GGML_UNUSED(src1);
GGML_UNUSED(dst);
GGML_UNUSED(src1_ddf_i);
GGML_UNUSED(src1_padded_row_size);
}
bool ggml_cuda_should_use_mmq(enum ggml_type type, int cc, int64_t ne11) {

View File

@@ -13,9 +13,10 @@ using namespace ggml_cuda_mma;
#define MMQ_ITER_K 256
#define MMQ_NWARPS 8
typedef void (*load_tiles_mmq_t)(const char * __restrict__ x, int * x_tile, const int & kbx0, const int & i_max, const int & stride);
typedef void (*vec_dot_mmq_t)(const int * __restrict__ x, const int * __restrict__ y, float * __restrict__ sum, const int & k00);
typedef void (*mmq_write_back_t)(const float * __restrict__ sum, float * __restrict__ dst, const int & stride, const int & i_max, const int & j_max);
typedef void (*load_tiles_mmq_t)(const char * __restrict__ x, int * x_tile, const int kbx0, const int i_max, const int stride);
typedef void (*vec_dot_mmq_t)(const int * __restrict__ x, const int * __restrict__ y, float * __restrict__ sum, const int k00);
typedef void (*mmq_write_back_t)(const float * __restrict__ sum, const int32_t * __restrict__ get_rows_to_sorted,
float * __restrict__ dst, const int stride, const int i_max, const int j_max);
enum mmq_q8_1_ds_layout {
MMQ_Q8_1_DS_LAYOUT_D4,
@@ -233,7 +234,7 @@ static constexpr __device__ int mmq_get_granularity_device(const int /* mmq_x */
// ------------------------------------------------------------
template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q4_0(
const char * __restrict__ x, int * __restrict__ x_tile, const int & kbx0, const int & i_max, const int & stride) {
const char * __restrict__ x, int * __restrict__ x_tile, const int kbx0, const int i_max, const int stride) {
#ifdef NEW_MMA_AVAILABLE
int * x_qs = (int *) x_tile;
@@ -289,7 +290,7 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
template <int mmq_x, int mmq_y, int nwarps>
static __device__ __forceinline__ void vec_dot_q4_0_q8_1_dp4a(
const int * __restrict__ x, const int * __restrict__ y, float * __restrict__ sum, const int & k00) {
const int * __restrict__ x, const int * __restrict__ y, float * __restrict__ sum, const int k00) {
constexpr tile_x_sizes txs = mmq_get_dp4a_tile_x_sizes(GGML_TYPE_Q4_0, mmq_y);
const int * x_qs = (const int *) x;
@@ -328,7 +329,7 @@ static __device__ __forceinline__ void vec_dot_q4_0_q8_1_dp4a(
}
template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q4_1(
const char * __restrict__ x, int * __restrict__ x_tile, const int & kbx0, const int & i_max, const int & stride) {
const char * __restrict__ x, int * __restrict__ x_tile, const int kbx0, const int i_max, const int stride) {
#ifdef NEW_MMA_AVAILABLE
int * x_qs = (int *) x_tile;
@@ -384,7 +385,7 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
template <int mmq_x, int mmq_y, int nwarps>
static __device__ __forceinline__ void vec_dot_q4_1_q8_1_dp4a(
const int * __restrict__ x, const int * __restrict__ y, float * __restrict__ sum, const int & k00) {
const int * __restrict__ x, const int * __restrict__ y, float * __restrict__ sum, const int k00) {
constexpr tile_x_sizes txs = mmq_get_dp4a_tile_x_sizes(GGML_TYPE_Q4_1, mmq_y);
const int * x_qs = (const int *) x;
@@ -423,7 +424,7 @@ static __device__ __forceinline__ void vec_dot_q4_1_q8_1_dp4a(
}
template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q5_0(
const char * __restrict__ x, int * __restrict__ x_tile, const int & kbx0, const int & i_max, const int & stride) {
const char * __restrict__ x, int * __restrict__ x_tile, const int kbx0, const int i_max, const int stride) {
#ifdef NEW_MMA_AVAILABLE
int * x_qs = (int *) x_tile;
@@ -495,7 +496,7 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
}
template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q5_1(
const char * __restrict__ x, int * __restrict__ x_tile, const int & kbx0, const int & i_max, const int & stride) {
const char * __restrict__ x, int * __restrict__ x_tile, const int kbx0, const int i_max, const int stride) {
#ifdef NEW_MMA_AVAILABLE
int * x_qs = (int *) x_tile;
@@ -565,7 +566,7 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
}
template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q8_0(
const char * __restrict__ x, int * __restrict__ x_tile, const int & kbx0, const int & i_max, const int & stride) {
const char * __restrict__ x, int * __restrict__ x_tile, const int kbx0, const int i_max, const int stride) {
#ifdef NEW_MMA_AVAILABLE
int * x_qs = (int *) x_tile;
@@ -621,7 +622,7 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
template <int mmq_x, int mmq_y, int nwarps>
static __device__ __forceinline__ void vec_dot_q8_0_q8_1_dp4a(
const int * __restrict__ x, const int * __restrict__ y, float * __restrict__ sum, const int & k00) {
const int * __restrict__ x, const int * __restrict__ y, float * __restrict__ sum, const int k00) {
constexpr tile_x_sizes txs = mmq_get_dp4a_tile_x_sizes(GGML_TYPE_Q8_0, mmq_y);
const int * x_qs = (const int *) x;
@@ -651,7 +652,7 @@ static __device__ __forceinline__ void vec_dot_q8_0_q8_1_dp4a(
template <int mmq_x, int mmq_y, int nwarps, mmq_q8_1_ds_layout ds_layout>
static __device__ __forceinline__ void vec_dot_q8_0_q8_1_mma(
const int * __restrict__ x, const int * __restrict__ y, float * __restrict__ sum, const int & k00) {
const int * __restrict__ x, const int * __restrict__ y, float * __restrict__ sum, const int k00) {
typedef tile<16, 8, int> tile_A;
typedef tile< 8, 8, int> tile_B;
@@ -732,7 +733,7 @@ static __device__ __forceinline__ void vec_dot_q8_0_q8_1_mma(
template <int mmq_x, int mmq_y, int nwarps>
static __device__ __forceinline__ void vec_dot_q8_1_q8_1_dp4a(
const int * __restrict__ x, const int * __restrict__ y, float * __restrict__ sum, const int & k00) {
const int * __restrict__ x, const int * __restrict__ y, float * __restrict__ sum, const int k00) {
constexpr tile_x_sizes txs = mmq_get_dp4a_tile_x_sizes(GGML_TYPE_Q5_1, mmq_y);
const int * x_qs = (const int *) x;
@@ -762,7 +763,7 @@ static __device__ __forceinline__ void vec_dot_q8_1_q8_1_dp4a(
template <int mmq_x, int mmq_y, int nwarps>
static __device__ __forceinline__ void vec_dot_q8_1_q8_1_mma(
const int * __restrict__ x, const int * __restrict__ y, float * __restrict__ sum, const int & k00) {
const int * __restrict__ x, const int * __restrict__ y, float * __restrict__ sum, const int k00) {
typedef tile<16, 8, int> tile_A;
typedef tile< 8, 8, int> tile_B;
@@ -839,7 +840,7 @@ static __device__ __forceinline__ void vec_dot_q8_1_q8_1_mma(
template <int mmq_x, int mmq_y, int nwarps>
static __device__ __forceinline__ void vec_dot_q8_0_16_q8_1_dp4a(
const int * __restrict__ x, const int * __restrict__ y, float * __restrict__ sum, const int & k00) {
const int * __restrict__ x, const int * __restrict__ y, float * __restrict__ sum, const int k00) {
constexpr tile_x_sizes txs = MMQ_DP4A_TXS_Q8_0_16;
const int * x_qs = (const int *) x;
@@ -871,7 +872,7 @@ static __device__ __forceinline__ void vec_dot_q8_0_16_q8_1_dp4a(
template <int mmq_x, int mmq_y, int nwarps>
static __device__ __forceinline__ void vec_dot_q8_0_16_q8_1_mma(
const int * __restrict__ x, const int * __restrict__ y, float * __restrict__ sum, const int & k00) {
const int * __restrict__ x, const int * __restrict__ y, float * __restrict__ sum, const int k00) {
#ifdef NEW_MMA_AVAILABLE
typedef tile<16, 4, int> tile_A;
@@ -955,7 +956,7 @@ static __device__ __forceinline__ void vec_dot_q8_0_16_q8_1_mma(
}
template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q2_K(
const char * __restrict__ x, int * __restrict__ x_tile, const int & kbx0, const int & i_max, const int & stride) {
const char * __restrict__ x, int * __restrict__ x_tile, const int kbx0, const int i_max, const int stride) {
#ifdef NEW_MMA_AVAILABLE
int * x_qs = (int *) x_tile;
@@ -1011,7 +1012,7 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
template <int mmq_x, int mmq_y, int nwarps>
static __device__ __forceinline__ void vec_dot_q2_K_q8_1_dp4a(
const int * __restrict__ x, const int * __restrict__ y, float * __restrict__ sum, const int & k00) {
const int * __restrict__ x, const int * __restrict__ y, float * __restrict__ sum, const int k00) {
constexpr tile_x_sizes txs = mmq_get_dp4a_tile_x_sizes(GGML_TYPE_Q2_K, mmq_y);
const int * x_qs = (const int *) x;
@@ -1074,7 +1075,7 @@ static __device__ __forceinline__ void vec_dot_q2_K_q8_1_dp4a(
template <int mmq_x, int mmq_y, int nwarps>
static __device__ __forceinline__ void vec_dot_q2_K_q8_1_mma(
const int * __restrict__ x, const int * __restrict__ y, float * __restrict__ sum, const int & k00) {
const int * __restrict__ x, const int * __restrict__ y, float * __restrict__ sum, const int k00) {
#ifdef NEW_MMA_AVAILABLE
typedef tile<16, 4, int> tile_A;
@@ -1201,7 +1202,7 @@ static __device__ __forceinline__ void vec_dot_q2_K_q8_1_mma(
}
template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q3_K(
const char * __restrict__ x, int * __restrict__ x_tile, const int & kbx0, const int & i_max, const int & stride) {
const char * __restrict__ x, int * __restrict__ x_tile, const int kbx0, const int i_max, const int stride) {
#ifdef NEW_MMA_AVAILABLE
int * x_qs = (int *) x_tile;
@@ -1298,7 +1299,7 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
template <int mmq_x, int mmq_y, int nwarps>
static __device__ __forceinline__ void vec_dot_q3_K_q8_1_dp4a(
const int * __restrict__ x, const int * __restrict__ y, float * __restrict__ sum, const int & k00) {
const int * __restrict__ x, const int * __restrict__ y, float * __restrict__ sum, const int k00) {
constexpr tile_x_sizes txs = mmq_get_dp4a_tile_x_sizes(GGML_TYPE_Q3_K, mmq_y);
const int * x_qs = (const int *) x;
@@ -1340,7 +1341,7 @@ static __device__ __forceinline__ int unpack_scales_q45_K(const int * scales, co
}
template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q4_K(
const char * __restrict__ x, int * __restrict__ x_tile, const int & kbx0, const int & i_max, const int & stride) {
const char * __restrict__ x, int * __restrict__ x_tile, const int kbx0, const int i_max, const int stride) {
#ifdef NEW_MMA_AVAILABLE
int * x_qs = (int *) x_tile;
@@ -1437,7 +1438,7 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
template <int mmq_x, int mmq_y, int nwarps>
static __device__ __forceinline__ void vec_dot_q4_K_q8_1_dp4a(
const int * __restrict__ x, const int * __restrict__ y, float * __restrict__ sum, const int & k00) {
const int * __restrict__ x, const int * __restrict__ y, float * __restrict__ sum, const int k00) {
constexpr tile_x_sizes txs = mmq_get_dp4a_tile_x_sizes(GGML_TYPE_Q4_K, mmq_y);
const int * x_qs = (const int *) x;
@@ -1469,7 +1470,7 @@ static __device__ __forceinline__ void vec_dot_q4_K_q8_1_dp4a(
}
template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q5_K(
const char * __restrict__ x, int * __restrict__ x_tile, const int & kbx0, const int & i_max, const int & stride) {
const char * __restrict__ x, int * __restrict__ x_tile, const int kbx0, const int i_max, const int stride) {
#ifdef NEW_MMA_AVAILABLE
int * x_qs = (int *) x_tile;
@@ -1578,7 +1579,7 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
template <int mmq_x, int mmq_y, int nwarps>
static __device__ __forceinline__ void vec_dot_q5_K_q8_1_dp4a(
const int * __restrict__ x, const int * __restrict__ y, float * __restrict__ sum, const int & k00) {
const int * __restrict__ x, const int * __restrict__ y, float * __restrict__ sum, const int k00) {
constexpr tile_x_sizes txs = mmq_get_dp4a_tile_x_sizes(GGML_TYPE_Q5_K, mmq_y);
const int * x_qs = (const int *) x;
@@ -1610,7 +1611,7 @@ static __device__ __forceinline__ void vec_dot_q5_K_q8_1_dp4a(
}
template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q6_K(
const char * __restrict__ x, int * __restrict__ x_tile, const int & kbx0, const int & i_max, const int & stride) {
const char * __restrict__ x, int * __restrict__ x_tile, const int kbx0, const int i_max, const int stride) {
#ifdef NEW_MMA_AVAILABLE
int * x_qs = (int *) x_tile;
@@ -1693,7 +1694,7 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
template <int mmq_x, int mmq_y, int nwarps>
static __device__ __forceinline__ void vec_dot_q6_K_q8_1_dp4a(
const int * __restrict__ x, const int * __restrict__ y, float * __restrict__ sum, const int & k00) {
const int * __restrict__ x, const int * __restrict__ y, float * __restrict__ sum, const int k00) {
constexpr tile_x_sizes txs = mmq_get_dp4a_tile_x_sizes(GGML_TYPE_Q6_K, mmq_y);
const int * x_qs = (const int *) x;
@@ -1726,7 +1727,7 @@ static __device__ __forceinline__ void vec_dot_q6_K_q8_1_dp4a(
template <int mmq_x, int mmq_y, int nwarps>
static __device__ __forceinline__ void vec_dot_q6_K_q8_1_mma(
const int * __restrict__ x, const int * __restrict__ y, float * __restrict__ sum, const int & k00) {
const int * __restrict__ x, const int * __restrict__ y, float * __restrict__ sum, const int k00) {
#ifdef NEW_MMA_AVAILABLE
typedef tile<16, 4, int> tile_A;
@@ -1835,7 +1836,7 @@ static __device__ __forceinline__ void vec_dot_q6_K_q8_1_mma(
}
template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_iq4_nl(
const char * __restrict__ x, int * __restrict__ x_tile, const int & kbx0, const int & i_max, const int & stride) {
const char * __restrict__ x, int * __restrict__ x_tile, const int kbx0, const int i_max, const int stride) {
#ifdef NEW_MMA_AVAILABLE
int * x_qs = (int *) x_tile;
@@ -1893,7 +1894,7 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
}
template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_iq2_xxs(
const char * __restrict__ x, int * __restrict__ x_tile, const int & kbx0, const int & i_max, const int & stride) {
const char * __restrict__ x, int * __restrict__ x_tile, const int kbx0, const int i_max, const int stride) {
#ifdef NEW_MMA_AVAILABLE
int * x_qs = (int *) x_tile;
@@ -1951,7 +1952,7 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
}
template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_iq2_xs(
const char * __restrict__ x, int * __restrict__ x_tile, const int & kbx0, const int & i_max, const int & stride) {
const char * __restrict__ x, int * __restrict__ x_tile, const int kbx0, const int i_max, const int stride) {
#ifdef NEW_MMA_AVAILABLE
int * x_qs = (int *) x_tile;
@@ -2007,7 +2008,7 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
}
template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_iq2_s(
const char * __restrict__ x, int * __restrict__ x_tile, const int & kbx0, const int & i_max, const int & stride) {
const char * __restrict__ x, int * __restrict__ x_tile, const int kbx0, const int i_max, const int stride) {
#ifdef NEW_MMA_AVAILABLE
int * x_qs = (int *) x_tile;
@@ -2070,7 +2071,7 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
}
template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_iq3_xxs(
const char * __restrict__ x, int * __restrict__ x_tile, const int & kbx0, const int & i_max, const int & stride) {
const char * __restrict__ x, int * __restrict__ x_tile, const int kbx0, const int i_max, const int stride) {
#ifdef NEW_MMA_AVAILABLE
int * x_qs = (int *) x_tile;
@@ -2126,7 +2127,7 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
}
template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_iq3_s(
const char * __restrict__ x, int * __restrict__ x_tile, const int & kbx0, const int & i_max, const int & stride) {
const char * __restrict__ x, int * __restrict__ x_tile, const int kbx0, const int i_max, const int stride) {
#ifdef NEW_MMA_AVAILABLE
int * x_qs = (int *) x_tile;
@@ -2189,7 +2190,7 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
}
template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_iq1_s(
const char * __restrict__ x, int * __restrict__ x_tile, const int & kbx0, const int & i_max, const int & stride) {
const char * __restrict__ x, int * __restrict__ x_tile, const int kbx0, const int i_max, const int stride) {
#ifdef NEW_MMA_AVAILABLE
int * x_qs = (int *) x_tile;
@@ -2245,7 +2246,7 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
}
template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_iq4_xs(
const char * __restrict__ x, int * __restrict__ x_tile, const int & kbx0, const int & i_max, const int & stride) {
const char * __restrict__ x, int * __restrict__ x_tile, const int kbx0, const int i_max, const int stride) {
#ifdef NEW_MMA_AVAILABLE
int * x_qs = (int *) x_tile;
@@ -2306,8 +2307,8 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
template<int mmq_x, int mmq_y, int nwarps, bool need_check>
static __device__ __forceinline__ void mmq_write_back_dp4a(
const float * __restrict__ sum, float * __restrict__ dst, const int & stride, const int & i_max, const int & j_max) {
const float * __restrict__ sum, const int32_t * __restrict__ ids_dst, float * __restrict__ dst,
const int stride, const int i_max, const int j_max) {
#pragma unroll
for (int j0 = 0; j0 < mmq_x; j0 += nwarps) {
const int j = j0 + threadIdx.y;
@@ -2324,15 +2325,15 @@ static __device__ __forceinline__ void mmq_write_back_dp4a(
continue;
}
dst[j*stride + i] = sum[(j0/nwarps) * (mmq_y/WARP_SIZE) + i0/WARP_SIZE];
dst[ids_dst[j]*stride + i] = sum[(j0/nwarps) * (mmq_y/WARP_SIZE) + i0/WARP_SIZE];
}
}
}
template<int mmq_x, int mmq_y, int nwarps, bool need_check>
static __device__ __forceinline__ void mmq_write_back_mma(
const float * __restrict__ sum, float * __restrict__ dst, const int & stride, const int & i_max, const int & j_max) {
const float * __restrict__ sum, const int * __restrict__ ids_dst, float * __restrict__ dst,
const int stride, const int i_max, const int j_max) {
typedef tile<16, 8, int> tile_C;
constexpr int granularity = mmq_get_granularity_device(mmq_x);
@@ -2362,7 +2363,7 @@ static __device__ __forceinline__ void mmq_write_back_mma(
continue;
}
dst[j*stride + i] = sum[(j0/tile_C::J + n)*tile_C::ne + l];
dst[ids_dst[j]*stride + i] = sum[(j0/tile_C::J + n)*tile_C::ne + l];
}
}
}
@@ -2518,17 +2519,18 @@ struct mmq_type_traits<mmq_x, mmq_y, nwarps, need_check, GGML_TYPE_IQ4_XS> {
};
template <ggml_type type, int mmq_x, int nwarps, bool need_check, bool fixup>
static __device__ void mul_mat_q_process_tile(
const char * __restrict__ x, const char * __restrict__ yc, float * __restrict__ dst, float * __restrict__ tmp_fixup,
const int & ne00, const int & ne01, const int & stride01, const int & ne10, const int & ne11, const int & stride11, const int & ne0,
const int & it, const int & jt, const int & kb0_start, const int & kb0_stop) {
static __device__ __forceinline__ void mul_mat_q_process_tile(
const char * __restrict__ x, const int offset_x, const int * __restrict__ y,
const int * __restrict__ ids_dst, float * __restrict__ dst, float * __restrict__ tmp_fixup,
const int nrows_x, const int ncols_y, const int stride_row_x, const int stride_col_dst,
const int tile_x_max_i, const int tile_y_max_j, const int kb0_start, const int kb0_stop) {
constexpr int qk = ggml_cuda_type_traits<type>::qk;
constexpr int mmq_y = get_mmq_y_device();
constexpr load_tiles_mmq_t load_tiles = mmq_type_traits<mmq_x, mmq_y, nwarps, need_check, type>::load_tiles;
extern __shared__ char data_mul_mat_q[];
int * tile_y = (int *) data_mul_mat_q;
extern __shared__ int data_mul_mat_q[];
int * tile_y = data_mul_mat_q + mmq_x;
int * tile_x = tile_y + GGML_PAD(mmq_x*(WARP_SIZE + WARP_SIZE/QI8_1), nwarps*WARP_SIZE);
#ifdef NEW_MMA_AVAILABLE
@@ -2543,16 +2545,11 @@ static __device__ void mul_mat_q_process_tile(
float sum[mmq_x*mmq_y / (nwarps*WARP_SIZE)] = {0.0f};
const int tile_x_max_i = ne01 - it*mmq_y - 1;
const int tile_y_max_j = ne11 - jt*mmq_x - 1;
const int * y = (const int *) yc + jt*(mmq_x*sizeof(block_q8_1_mmq)/sizeof(int));
for (int kb0 = kb0_start; kb0 < kb0_stop; kb0 += blocks_per_iter) {
load_tiles(x, tile_x, stride01*it*mmq_y + kb0, tile_x_max_i, stride01);
load_tiles(x, tile_x, offset_x + kb0, tile_x_max_i, stride_row_x);
{
const int * by0 = y + stride11*(kb0*(qk*sizeof(block_q8_1_mmq) / (4*QK8_1*sizeof(int))) + 0*sizeof(block_q8_1_mmq)/sizeof(int));
const int * by0 = y + ncols_y*(kb0*(qk*sizeof(block_q8_1_mmq) / (4*QK8_1*sizeof(int))) + 0*sizeof(block_q8_1_mmq)/sizeof(int));
#pragma unroll
for (int l0 = 0; l0 < mmq_x*MMQ_TILE_Y_K; l0 += nwarps*WARP_SIZE) {
int l = l0 + threadIdx.y*WARP_SIZE + threadIdx.x;
@@ -2568,7 +2565,7 @@ static __device__ void mul_mat_q_process_tile(
__syncthreads();
{
const int * by0 = y + stride11*(kb0*(qk*sizeof(block_q8_1_mmq) / (4*QK8_1*sizeof(int))) + 1*sizeof(block_q8_1_mmq)/sizeof(int));
const int * by0 = y + ncols_y*(kb0*(qk*sizeof(block_q8_1_mmq) / (4*QK8_1*sizeof(int))) + 1*sizeof(block_q8_1_mmq)/sizeof(int));
#pragma unroll
for (int l0 = 0; l0 < mmq_x*MMQ_TILE_Y_K; l0 += nwarps*WARP_SIZE) {
int l = l0 + threadIdx.y*WARP_SIZE + threadIdx.x;
@@ -2585,12 +2582,10 @@ static __device__ void mul_mat_q_process_tile(
}
if (fixup) {
write_back(sum, tmp_fixup + blockIdx.x*(mmq_x*mmq_y), mmq_y, mmq_y, mmq_x);
write_back(sum, ids_dst, tmp_fixup + blockIdx.x*(mmq_x*mmq_y), mmq_y, mmq_y, mmq_x);
} else {
write_back(sum, dst + jt*mmq_x*ne0 + it*mmq_y, ne0, tile_x_max_i, tile_y_max_j);
write_back(sum, ids_dst, dst, stride_col_dst, tile_x_max_i, tile_y_max_j);
}
GGML_UNUSED(ne00); GGML_UNUSED(ne10);
}
@@ -2609,8 +2604,11 @@ template <ggml_type type, int mmq_x, int nwarps, bool need_check>
#endif // __CUDA_ARCH__ >= GGML_CUDA_CC_VOLTA
#endif // defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)
static __global__ void mul_mat_q(
const char * __restrict__ x, const char * __restrict__ yc, float * __restrict__ dst, float * __restrict__ tmp_fixup,
const int ne00, const int ne01, const int stride01, const int ne10, const int ne11, const int stride11, const int ne0) {
const char * __restrict__ x, const int * __restrict__ y, const int32_t * __restrict__ ids_dst,
const int32_t * __restrict__ expert_bounds, float * __restrict__ dst, float * __restrict__ tmp_fixup,
const int ncols_x, const int nrows_x, const int ncols_y, const int stride_row_x, const int stride_col_dst,
const int channel_ratio, const int nchannels_y, const int stride_channel_x, const int stride_channel_y, const int stride_channel_dst,
const int sample_ratio, const int nsamples_y, const int stride_sample_x, const int stride_sample_y, const int stride_sample_dst) {
// Skip unused template specializations for faster compilation:
if (mmq_x > get_mmq_x_max_device() || mmq_x % mmq_get_granularity_device(mmq_x) != 0) {
@@ -2621,26 +2619,85 @@ static __global__ void mul_mat_q(
constexpr int qk = ggml_cuda_type_traits<type>::qk;
constexpr int mmq_y = get_mmq_y_device();
const int ntx = (ncols_y + mmq_x - 1) / mmq_x; // Number of tiles x
const int nty = (nrows_x + mmq_y - 1) / mmq_y; // Number of tiles y
// Initialize the ids for writing back data with just the index.
// For regular matrix multiplications this is never changed.
// For MoE the correct indices are loaded from ids_dst.
extern __shared__ int ids_dst_shared[]; // Stored at beginning of shared memory.
#pragma unroll
for (int j0 = 0; j0 < mmq_x; j0 += nwarps*WARP_SIZE) {
const int j = j0 + threadIdx.y*WARP_SIZE + threadIdx.x;
if (j0 + nwarps*WARP_SIZE > mmq_x && j >= mmq_x) {
break;
}
ids_dst_shared[j] = j;
}
// On AMD or old CUDA the performance with stream-k was worse, use conventional tiling instead:
#if (defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) || __CUDA_ARCH__ < GGML_CUDA_CC_VOLTA
{
const int wt = blockIdx.z / nchannels_y;
const int zt = blockIdx.z - wt*nchannels_y;
const int jt = blockIdx.y;
const int it = blockIdx.x;
// Defaults for regular matrix multiplication:
int col_low = 0;
int col_high = ncols_y;
int col_diff = ncols_y;
int offset_y = wt*stride_sample_y + zt*stride_channel_y;
int offset_dst = wt*stride_sample_dst + zt*stride_channel_dst + jt*mmq_x*stride_col_dst;
if (ids_dst) {
col_low = expert_bounds[zt + 0];
col_high = expert_bounds[zt + 1];
col_diff = col_high - col_low;
offset_y = 0;
offset_dst = 0;
if (jt*mmq_x >= col_diff) {
return;
}
#pragma unroll
for (int j0 = 0; j0 < mmq_x; j0 += nwarps*WARP_SIZE) {
const int j = j0 + threadIdx.y*WARP_SIZE + threadIdx.x;
if (j0 + nwarps*WARP_SIZE > mmq_x && j >= mmq_x) {
break;
}
ids_dst_shared[j] = ids_dst[col_low + jt*mmq_x + j];
}
}
offset_y += (col_low + jt*mmq_x)*(sizeof(block_q8_1_mmq)/sizeof(int));
offset_dst += it*mmq_y;
const int tile_x_max_i = nrows_x - it*mmq_y - 1;
const int tile_y_max_j = col_diff - jt*mmq_x - 1;
const int offset_x = (wt/sample_ratio)*stride_sample_x + (zt/channel_ratio)*stride_channel_x + it*mmq_y*stride_row_x;
constexpr bool fixup = false;
mul_mat_q_process_tile<type, mmq_x, nwarps, need_check, fixup>
(x, yc, dst, tmp_fixup, ne00, ne01, stride01, ne10, ne11, stride11, ne0,
blockIdx.x, blockIdx.y, 0, ne00/qk);
(x, offset_x, y + offset_y, ids_dst_shared, dst + offset_dst, tmp_fixup, nrows_x, ncols_y, stride_row_x, stride_col_dst,
tile_x_max_i, tile_y_max_j, 0, ncols_x/qk);
return;
}
#endif // (defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) || __CUDA_ARCH__ < GGML_CUDA_CC_VOLTA
const int64_t blocks_per_ne00 = ne00 / qk;
const int64_t blocks_per_ne00 = ncols_x / qk;
constexpr int blocks_per_iter = MMQ_ITER_K / qk;
const int ntx = (ne11 + mmq_x - 1) / mmq_x; // Number of tiles x
const int nty = (ne01 + mmq_y - 1) / mmq_y; // Number of tiles y
// kbc == k block continuous, current index in continuous ijk space.
int64_t kbc = (int64_t) blockIdx.x *blocks_per_ne00*ntx*nty / gridDim.x;
int64_t kbc_stop = (int64_t)(blockIdx.x + 1)*blocks_per_ne00*ntx*nty / gridDim.x;
int64_t kbc = (int64_t) blockIdx.x *nsamples_y*nchannels_y*ntx*nty*blocks_per_ne00 / gridDim.x;
int64_t kbc_stop = (int64_t)(blockIdx.x + 1)*nsamples_y*nchannels_y*ntx*nty*blocks_per_ne00 / gridDim.x;
kbc -= (kbc % blocks_per_ne00) % blocks_per_iter;
kbc_stop -= (kbc_stop % blocks_per_ne00) % blocks_per_iter;
@@ -2649,13 +2706,64 @@ static __global__ void mul_mat_q(
int kb0_start = kbc % blocks_per_ne00;
int kb0_stop = min(blocks_per_ne00, kb0_start + kbc_stop - kbc);
while (kbc < kbc_stop && kb0_stop == blocks_per_ne00) {
const int jt = kbc / (blocks_per_ne00*nty); // j index of current tile.
const int it = (kbc - jt*(blocks_per_ne00*nty)) / blocks_per_ne00; // i index of current tile.
int tmp = kbc;
const int it = tmp / (nsamples_y*nchannels_y*ntx*blocks_per_ne00);
tmp -= it * (nsamples_y*nchannels_y*ntx*blocks_per_ne00);
const int wt = tmp / (nchannels_y*ntx*blocks_per_ne00);
tmp -= wt * (nchannels_y*ntx*blocks_per_ne00);
const int zt = tmp / (ntx*blocks_per_ne00);
tmp -= zt * (ntx*blocks_per_ne00);
const int jt = tmp / blocks_per_ne00;
// Defaults for regular matrix multiplication:
int col_low = 0;
int col_high = ncols_y;
int col_diff = ncols_y;
int offset_y = wt*stride_sample_y + zt*stride_channel_y;
int offset_dst = wt*stride_sample_dst + zt*stride_channel_dst + jt*mmq_x*stride_col_dst;
if (ids_dst) {
col_low = expert_bounds[zt + 0];
col_high = expert_bounds[zt + 1];
col_diff = col_high - col_low;
offset_y = 0;
offset_dst = 0;
if (jt*mmq_x >= col_diff) {
kbc += blocks_per_ne00;
kbc -= kbc % blocks_per_ne00;
kb0_start = 0;
kb0_stop = min(blocks_per_ne00, kbc_stop - kbc);
continue;
}
#pragma unroll
for (int j0 = 0; j0 < mmq_x; j0 += nwarps*WARP_SIZE) {
const int j = j0 + threadIdx.y*WARP_SIZE + threadIdx.x;
if (j0 + nwarps*WARP_SIZE > mmq_x && j >= mmq_x) {
break;
}
ids_dst_shared[j] = ids_dst[col_low + jt*mmq_x + j];
}
}
offset_y += (col_low + jt*mmq_x)*(sizeof(block_q8_1_mmq)/sizeof(int));
offset_dst += it*mmq_y;
const int tile_x_max_i = nrows_x - it*mmq_y - 1;
const int tile_y_max_j = col_diff - jt*mmq_x - 1;
const int offset_x = (wt/sample_ratio)*stride_sample_x + (zt/channel_ratio)*stride_channel_x + it*mmq_y*stride_row_x;
constexpr bool fixup = false; // All but (potentially) the last iterations write their data to dst rather than the fixup buffer.
mul_mat_q_process_tile<type, mmq_x, nwarps, need_check, fixup>
(x, yc, dst, tmp_fixup, ne00, ne01, stride01, ne10, ne11, stride11, ne0,
it, jt, kb0_start, kb0_stop);
(x, offset_x, y + offset_y, ids_dst_shared, dst + offset_dst, tmp_fixup, nrows_x, ncols_y, stride_row_x, stride_col_dst,
tile_x_max_i, tile_y_max_j, kb0_start, kb0_stop);
kbc += blocks_per_ne00;
kbc -= kbc % blocks_per_ne00;
@@ -2668,55 +2776,106 @@ static __global__ void mul_mat_q(
return;
}
const int jt = kbc / (blocks_per_ne00*nty);
const int it = (kbc - jt*(blocks_per_ne00*nty)) / blocks_per_ne00;
int tmp = kbc;
const int it = tmp / (nsamples_y*nchannels_y*ntx*blocks_per_ne00);
tmp -= it * (nsamples_y*nchannels_y*ntx*blocks_per_ne00);
const int wt = tmp / (nchannels_y*ntx*blocks_per_ne00);
tmp -= wt * (nchannels_y*ntx*blocks_per_ne00);
const int zt = tmp / (ntx*blocks_per_ne00);
tmp -= zt * (ntx*blocks_per_ne00);
const int jt = tmp / blocks_per_ne00;
// Defaults for regular matrix multiplication:
int col_low = 0;
int col_high = ncols_y;
int col_diff = ncols_y;
int offset_y = wt*stride_sample_y + zt*stride_channel_y;
int offset_dst = wt*stride_sample_dst + zt*stride_channel_dst + jt*mmq_x*stride_col_dst;
if (ids_dst) {
col_low = expert_bounds[zt + 0];
col_high = expert_bounds[zt + 1];
col_diff = col_high - col_low;
offset_y = 0;
offset_dst = 0;
if (jt*mmq_x >= col_diff) {
return;
}
// The memory layout for the fixup buffer is always contiguous, therefore reset ids:
#pragma unroll
for (int j0 = 0; j0 < mmq_x; j0 += nwarps*WARP_SIZE) {
const int j = j0 + threadIdx.y*WARP_SIZE + threadIdx.x;
if (j0 + nwarps*WARP_SIZE > mmq_x && j >= mmq_x) {
break;
}
ids_dst_shared[j] = j;
}
}
offset_y += (col_low + jt*mmq_x)*(sizeof(block_q8_1_mmq)/sizeof(int));
offset_dst += it*mmq_y;
const int tile_x_max_i = nrows_x - it*mmq_y - 1;
const int tile_y_max_j = col_diff - jt*mmq_x - 1;
const int offset_x = (wt/sample_ratio)*stride_sample_x + (zt/channel_ratio)*stride_channel_x + it*mmq_y*stride_row_x;
constexpr bool fixup = true; // Last index writes its data to fixup buffer to avoid data races with other blocks.
mul_mat_q_process_tile<type, mmq_x, nwarps, need_check, fixup>
(x, yc, dst, tmp_fixup, ne00, ne01, stride01, ne10, ne11, stride11, ne0,
it, jt, kb0_start, kb0_stop);
(x, offset_x, y + offset_y, ids_dst_shared, dst + offset_dst, tmp_fixup, nrows_x, ncols_y, stride_row_x, stride_col_dst,
tile_x_max_i, tile_y_max_j, kb0_start, kb0_stop);
}
template <ggml_type type, int mmq_x, int nwarps, bool need_check>
static __global__ void mul_mat_q_stream_k_fixup(
float * __restrict__ dst, const float * __restrict__ tmp_last_tile, const int ne00, const int ne01, const int ne11, const int ne0, const int block_num_mmq) {
const int32_t * ids_dst, const int32_t * expert_bounds, float * __restrict__ dst, const float * __restrict__ tmp_last_tile,
const int ncols_x, const int nrows_x, const int ncols_y, const int stride_col_dst,
const int nchannels_y, const int stride_channel_dst, const int nsamples_y, const int stride_sample_dst) {
constexpr int mmq_y = get_mmq_y_device();
constexpr int qk = ggml_cuda_type_traits<type>::qk;
constexpr int blocks_per_iter = MMQ_ITER_K / qk;
const int64_t blocks_per_ne00 = ne00 / qk;
const int64_t blocks_per_ne00 = ncols_x / qk;
float sum[mmq_x*mmq_y / (nwarps*WARP_SIZE)] = {0.0f};
const int ntx = (ne11 + mmq_x - 1) / mmq_x;
const int nty = (ne01 + mmq_y - 1) / mmq_y;
const int ntx = (ncols_y + mmq_x - 1) / mmq_x;
const int nty = (nrows_x + mmq_y - 1) / mmq_y;
const int bidx0 = blockIdx.x;
// kbc == k block continuous, current index in continuous ijk space.
int64_t kbc0 = (int64_t) bidx0 *nsamples_y*nchannels_y*ntx*nty*blocks_per_ne00 / gridDim.x;
int64_t kbc0_stop = (int64_t)(bidx0 + 1)*nsamples_y*nchannels_y*ntx*nty*blocks_per_ne00 / gridDim.x;
kbc0 -= (kbc0 % blocks_per_ne00) % blocks_per_iter;
kbc0_stop -= (kbc0_stop % blocks_per_ne00) % blocks_per_iter;
const bool did_not_have_any_data = kbc0 == kbc0_stop;
const bool wrote_beginning_of_tile = kbc0 % blocks_per_ne00 == 0;
const bool did_not_write_last = kbc0/blocks_per_ne00 == kbc0_stop/blocks_per_ne00 && kbc0_stop % blocks_per_ne00 != 0;
if (did_not_have_any_data || wrote_beginning_of_tile || did_not_write_last) {
return;
}
bool any_fixup = false;
const int bidx_start = ((blockIdx.y*nty + blockIdx.x) * block_num_mmq) / (gridDim.y*gridDim.x);
const int bidx_stop = ((blockIdx.y*nty + blockIdx.x + 1) * block_num_mmq + gridDim.y*gridDim.x - 1) / (gridDim.y*gridDim.x);
// Iterate over previous blocks and sum up partial sums written to fixup buffer.
// All CUDA blocks that get here must have a previous block that needs a fixup.
int64_t bidx = bidx0 - 1;
int64_t kbc_stop = kbc0;
while(true) {
int64_t kbc = bidx*nsamples_y*nchannels_y*ntx*nty*blocks_per_ne00 / gridDim.x;
kbc -= (kbc % blocks_per_ne00) % blocks_per_iter;
int64_t kbc_0;
int64_t kbc_stop_0 = (int64_t) bidx_start*blocks_per_ne00*ntx*nty / block_num_mmq;
for (int bidx = bidx_start; bidx < bidx_stop; ++bidx) {
kbc_0 = kbc_stop_0;
kbc_stop_0 = (int64_t) (bidx + 1)*blocks_per_ne00*ntx*nty / block_num_mmq;
const int64_t kbc = kbc_0 - (kbc_0 % blocks_per_ne00) % blocks_per_iter;
const int64_t kbc_stop = kbc_stop_0 - (kbc_stop_0 % blocks_per_ne00) % blocks_per_iter;
// Skip fixup tile if the MMQ CUDA block never wrote anything to it:
if (kbc == kbc_stop || kbc_stop % blocks_per_ne00 == 0) {
continue;
}
const int jt = kbc_stop / (blocks_per_ne00*nty);
const int it = (kbc_stop - jt*(blocks_per_ne00*nty)) / blocks_per_ne00;
// Skip fixup tile if it's unrelated to the output tile assigned to this CUDA block:
if ((unsigned)it != blockIdx.x || (unsigned)jt != blockIdx.y) {
if (kbc == kbc_stop) { // Did not have any data.
bidx--;
kbc_stop = kbc;
continue;
}
@@ -2733,16 +2892,71 @@ static __global__ void mul_mat_q_stream_k_fixup(
sum[(j0/nwarps) * (mmq_y/WARP_SIZE) + i0/WARP_SIZE] += tmp_last_tile[bidx*(mmq_x*mmq_y) + j*mmq_y + i];
}
}
// If this block started in a previous tile we are done and don't need to combine additional partial results.
if (kbc % blocks_per_ne00 == 0 || kbc/blocks_per_ne00 < kbc0/blocks_per_ne00) {
break;
}
bidx--;
kbc_stop = kbc;
}
if (!any_fixup) {
return;
}
dst += blockIdx.y*mmq_x*ne0 + blockIdx.x*mmq_y;
int tmp = kbc0;
const int it = tmp / (nsamples_y*nchannels_y*ntx*blocks_per_ne00);
tmp -= it * (nsamples_y*nchannels_y*ntx*blocks_per_ne00);
const int wt = tmp / (nchannels_y*ntx*blocks_per_ne00);
tmp -= wt * (nchannels_y*ntx*blocks_per_ne00);
const int zt = tmp / (ntx*blocks_per_ne00);
tmp -= zt * (ntx*blocks_per_ne00);
const int jt = tmp / blocks_per_ne00;
const int i_max = ne01 - blockIdx.x*mmq_y - 1;
const int j_max = ne11 - blockIdx.y*mmq_x - 1;
if (!ids_dst) {
const int offset_dst = wt*stride_sample_dst + zt*stride_channel_dst + jt*mmq_x*stride_col_dst + it*mmq_y;
dst += offset_dst;
const int i_max = nrows_x - it*mmq_y - 1;
const int j_max = ncols_y - jt*mmq_x - 1;
#pragma unroll
for (int j0 = 0; j0 < mmq_x; j0 += nwarps) {
const int j = j0 + threadIdx.y;
if (j > j_max) {
return;
}
#pragma unroll
for (int i0 = 0; i0 < mmq_y; i0 += WARP_SIZE) {
const int i = i0 + threadIdx.x;
if (need_check && i > i_max) {
continue;
}
dst[j*stride_col_dst + i] += sum[(j0/nwarps) * (mmq_y/WARP_SIZE) + i0/WARP_SIZE];
}
}
return;
}
__shared__ int ids_dst_shared[mmq_x];
const int col_low = expert_bounds[zt + 0];
const int col_high = expert_bounds[zt + 1];
const int col_diff = col_high - col_low;
for (int j = threadIdx.y*WARP_SIZE + threadIdx.x; j < mmq_x; j += nwarps*WARP_SIZE) {
ids_dst_shared[j] = ids_dst[col_low + j];
}
const int offset_dst = it*mmq_y;
dst += offset_dst;
const int i_max = nrows_x - it*mmq_y - 1;
const int j_max = col_diff - jt*mmq_x - 1;
#pragma unroll
for (int j0 = 0; j0 < mmq_x; j0 += nwarps) {
@@ -2760,26 +2974,27 @@ static __global__ void mul_mat_q_stream_k_fixup(
continue;
}
dst[j*ne0 + i] += sum[(j0/nwarps) * (mmq_y/WARP_SIZE) + i0/WARP_SIZE];
dst[ids_dst_shared[j]*stride_col_dst + i] += sum[(j0/nwarps) * (mmq_y/WARP_SIZE) + i0/WARP_SIZE];
}
}
}
struct mmq_args {
const char * x; const char * y; float * dst;
int64_t ne00; int64_t ne01; int64_t stride01;
int64_t ne10; int64_t ne11; int64_t stride11;
int64_t ne0;
const char * x; ggml_type type_x; const int * y; const int32_t * ids_dst; const int32_t * expert_bounds; float * dst;
int64_t ncols_x; int64_t nrows_x; int64_t ncols_y; int64_t stride_row_x; int64_t nrows_dst;
int64_t nchannels_x; int64_t nchannels_y; int64_t stride_channel_x; int64_t stride_channel_y; int64_t stride_channel_dst;
int64_t nsamples_x; int64_t nsamples_y; int64_t stride_sample_x; int64_t stride_sample_y; int64_t stride_sample_dst;
bool use_stream_k;
};
template<ggml_type type>
static int mmq_get_shmem(const int mmq_x, const int mmq_y, const int cc) {
static size_t mmq_get_nbytes_shared(const int mmq_x, const int mmq_y, const int cc) {
const tile_x_sizes txs = mmq_get_dp4a_tile_x_sizes(type, mmq_y);
const int mmq_tile_x_k = mmq_get_mma_tile_x_k(type);
const int shmem_x = new_mma_available(cc) ? mmq_y*mmq_tile_x_k*sizeof(int) : txs.qs*sizeof(int) + txs.dm*sizeof(half2) + txs.sc*sizeof(int);
const int shmem_y = mmq_x*sizeof(block_q8_1_mmq);
return shmem_x + GGML_PAD(shmem_y, MMQ_NWARPS*WARP_SIZE*sizeof(int));
const size_t nbs_ids = mmq_x*sizeof(int);
const size_t nbs_x = new_mma_available(cc) ? mmq_y*mmq_tile_x_k*sizeof(int) : txs.qs*sizeof(int) + txs.dm*sizeof(half2) + txs.sc*sizeof(int);
const size_t nbs_y = mmq_x*sizeof(block_q8_1_mmq);
return nbs_ids + nbs_x + GGML_PAD(nbs_y, MMQ_NWARPS*WARP_SIZE*sizeof(int));
}
template <ggml_type type, int mmq_x>
@@ -2791,86 +3006,114 @@ static void launch_mul_mat_q(ggml_backend_cuda_context & ctx, const mmq_args & a
const dim3 block_dims(WARP_SIZE, MMQ_NWARPS, 1);
const int shmem = mmq_get_shmem<type>(mmq_x, mmq_y, cc);
const int nbytes_shared = mmq_get_nbytes_shared<type>(mmq_x, mmq_y, cc);
#if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && !defined(GGML_USE_MUSA)
static bool shmem_limit_raised[GGML_CUDA_MAX_DEVICES] = {false};
if (!shmem_limit_raised[id]) {
CUDA_CHECK(cudaFuncSetAttribute(mul_mat_q<type, mmq_x, MMQ_NWARPS, false>, cudaFuncAttributeMaxDynamicSharedMemorySize, shmem));
CUDA_CHECK(cudaFuncSetAttribute(mul_mat_q<type, mmq_x, MMQ_NWARPS, true>, cudaFuncAttributeMaxDynamicSharedMemorySize, shmem));
shmem_limit_raised[id] = true;
static bool shared_memory_limit_raised[GGML_CUDA_MAX_DEVICES] = {false};
if (!shared_memory_limit_raised[id]) {
CUDA_CHECK(cudaFuncSetAttribute(mul_mat_q<type, mmq_x, MMQ_NWARPS, false>, cudaFuncAttributeMaxDynamicSharedMemorySize, nbytes_shared));
CUDA_CHECK(cudaFuncSetAttribute(mul_mat_q<type, mmq_x, MMQ_NWARPS, true>, cudaFuncAttributeMaxDynamicSharedMemorySize, nbytes_shared));
shared_memory_limit_raised[id] = true;
}
#endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && !defined(GGML_USE_MUSA)
const int nty = (args.ne01 + mmq_y - 1) / mmq_y;
const int ntx = (args.ne11 + mmq_x - 1) / mmq_x;
const dim3 block_nums_xy_tiling(nty, ntx, 1);
const int nty = (args.nrows_x + mmq_y - 1) / mmq_y;
const int ntx = (args.ncols_y + mmq_x - 1) / mmq_x;
const int ntzw = args.nchannels_y * args.nsamples_y;
const dim3 block_nums_xy_tiling(nty, ntx, ntzw);
GGML_ASSERT(args.nchannels_y % args.nchannels_x == 0);
GGML_ASSERT(args.nsamples_y % args.nsamples_x == 0);
const int channel_ratio = args.nchannels_y / args.nchannels_x;
const int sample_ratio = args.nsamples_y / args.nsamples_x;
if (!args.use_stream_k) {
if (args.ne01 % mmq_y == 0) {
if (args.nrows_x % mmq_y == 0) {
constexpr bool need_check = false;
mul_mat_q<type, mmq_x, MMQ_NWARPS, need_check><<<block_nums_xy_tiling, block_dims, shmem, stream>>>
(args.x, args.y, args.dst, nullptr, args.ne00, args.ne01, args.stride01, args.ne10, args.ne11, args.stride11, args.ne0);
mul_mat_q<type, mmq_x, MMQ_NWARPS, need_check><<<block_nums_xy_tiling, block_dims, nbytes_shared, stream>>>
(args.x, args.y, args.ids_dst, args.expert_bounds, args.dst, nullptr,
args.ncols_x, args.nrows_x, args.ncols_y, args.stride_row_x, args.nrows_dst,
channel_ratio, args.nchannels_y, args.stride_channel_x, args.stride_channel_y, args.stride_channel_dst,
sample_ratio, args.nsamples_y, args.stride_sample_x, args.stride_sample_y, args.stride_sample_dst);
} else {
constexpr bool need_check = true;
mul_mat_q<type, mmq_x, MMQ_NWARPS, need_check><<<block_nums_xy_tiling, block_dims, shmem, stream>>>
(args.x, args.y, args.dst, nullptr, args.ne00, args.ne01, args.stride01, args.ne10, args.ne11, args.stride11, args.ne0);
mul_mat_q<type, mmq_x, MMQ_NWARPS, need_check><<<block_nums_xy_tiling, block_dims, nbytes_shared, stream>>>
(args.x, args.y, args.ids_dst, args.expert_bounds, args.dst, nullptr,
args.ncols_x, args.nrows_x, args.ncols_y, args.stride_row_x, args.nrows_dst,
channel_ratio, args.nchannels_y, args.stride_channel_x, args.stride_channel_y, args.stride_channel_dst,
sample_ratio, args.nsamples_y, args.stride_sample_x, args.stride_sample_y, args.stride_sample_dst);
}
return;
}
const dim3 block_nums_mmq(nsm, 1, 1);
const dim3 block_nums_stream_k(nsm, 1, 1);
const bool fixup_needed = ntx*nty*ntzw % nsm != 0;
ggml_cuda_pool & pool = ctx.pool(id);
ggml_cuda_pool_alloc<float> tmp_fixup(pool, block_nums_mmq.x * mmq_x*mmq_y);
ggml_cuda_pool_alloc<float> tmp_fixup(pool);
if (fixup_needed) {
tmp_fixup.alloc(block_nums_stream_k.x * mmq_x*mmq_y);
}
if (args.ne01 % mmq_y == 0) {
if (args.nrows_x % mmq_y == 0) {
constexpr bool need_check = false;
mul_mat_q<type, mmq_x, MMQ_NWARPS, need_check><<<block_nums_mmq, block_dims, shmem, stream>>>
(args.x, args.y, args.dst, tmp_fixup.ptr, args.ne00, args.ne01, args.stride01, args.ne10, args.ne11, args.stride11, args.ne0);
mul_mat_q<type, mmq_x, MMQ_NWARPS, need_check><<<block_nums_stream_k, block_dims, nbytes_shared, stream>>>
(args.x, args.y, args.ids_dst, args.expert_bounds, args.dst, tmp_fixup.ptr,
args.ncols_x, args.nrows_x, args.ncols_y, args.stride_row_x, args.nrows_dst,
channel_ratio, args.nchannels_y, args.stride_channel_x, args.stride_channel_y, args.stride_channel_dst,
sample_ratio, args.nsamples_y, args.stride_sample_x, args.stride_sample_y, args.stride_sample_dst);
mul_mat_q_stream_k_fixup<type, mmq_x, MMQ_NWARPS, need_check><<<block_nums_xy_tiling, block_dims, 0, stream>>>
(args.dst, tmp_fixup.ptr, args.ne00, args.ne01, args.ne11, args.ne0, block_nums_mmq.x);
if (!fixup_needed) {
return;
}
mul_mat_q_stream_k_fixup<type, mmq_x, MMQ_NWARPS, need_check><<<block_nums_stream_k, block_dims, 0, stream>>>
(args.ids_dst, args.expert_bounds, args.dst, tmp_fixup.ptr, args.ncols_x, args.nrows_x, args.ncols_y,
args.nrows_dst, args.nchannels_y, args.stride_channel_dst, args.nsamples_y, args.stride_sample_dst);
} else {
constexpr bool need_check = true;
mul_mat_q<type, mmq_x, MMQ_NWARPS, need_check><<<block_nums_mmq, block_dims, shmem, stream>>>
(args.x, args.y, args.dst, tmp_fixup.ptr, args.ne00, args.ne01, args.stride01, args.ne10, args.ne11, args.stride11, args.ne0);
mul_mat_q<type, mmq_x, MMQ_NWARPS, need_check><<<block_nums_stream_k, block_dims, nbytes_shared, stream>>>
(args.x, args.y, args.ids_dst, args.expert_bounds, args.dst, tmp_fixup.ptr,
args.ncols_x, args.nrows_x, args.ncols_y, args.stride_row_x, args.nrows_dst,
channel_ratio, args.nchannels_y, args.stride_channel_x, args.stride_channel_y, args.stride_channel_dst,
sample_ratio, args.nsamples_y, args.stride_sample_x, args.stride_sample_y, args.stride_sample_dst);
mul_mat_q_stream_k_fixup<type, mmq_x, MMQ_NWARPS, need_check><<<block_nums_xy_tiling, block_dims, 0, stream>>>
(args.dst, tmp_fixup.ptr, args.ne00, args.ne01, args.ne11, args.ne0, block_nums_mmq.x);
if (!fixup_needed) {
return;
}
mul_mat_q_stream_k_fixup<type, mmq_x, MMQ_NWARPS, need_check><<<block_nums_stream_k, block_dims, 0, stream>>>
(args.ids_dst, args.expert_bounds, args.dst, tmp_fixup.ptr, args.ncols_x, args.nrows_x, args.ncols_y,
args.nrows_dst, args.nchannels_y, args.stride_channel_dst, args.nsamples_y, args.stride_sample_dst);
}
}
template <ggml_type type>
void mul_mat_q_case(ggml_backend_cuda_context & ctx, const mmq_args & args, cudaStream_t stream) {
const int id = ggml_cuda_get_device();
const int cc = ggml_cuda_info().devices[id].cc;
const int smpbo = ggml_cuda_info().devices[id].smpbo;
const int id = ggml_cuda_get_device();
const int cc = ggml_cuda_info().devices[id].cc;
const size_t smpbo = ggml_cuda_info().devices[id].smpbo;
const int mmq_x_max = get_mmq_x_max_host(cc);
const int mmq_y = get_mmq_y_host(cc);
const int block_num_y = (args.ne01 + mmq_y - 1) / mmq_y;
const bool use_stream_k = GGML_CUDA_CC_IS_NVIDIA(cc) && ggml_cuda_highest_compiled_arch(cc) >= GGML_CUDA_CC_VOLTA;
int mmq_x_best = 0;
int nparts_best = INT_MAX;
int ntiles_x_best = INT_MAX;
for (int mmq_x = 8; mmq_x <= mmq_x_max && nparts_best > 1; mmq_x += 8) {
for (int mmq_x = 8; mmq_x <= mmq_x_max && ntiles_x_best > 1; mmq_x += 8) {
const int granularity = mmq_get_granularity_host(mmq_x, cc);
if (mmq_x % granularity != 0 || mmq_get_shmem<type>(mmq_x, mmq_y, cc) > smpbo) {
if (mmq_x % granularity != 0 || mmq_get_nbytes_shared<type>(mmq_x, mmq_y, cc) > smpbo) {
continue;
}
const int ntiles_x = (args.ne11 + mmq_x - 1) / mmq_x;
const int nwaves_xy_tiling = ntiles_x*block_num_y;
const int nparts = use_stream_k ? ntiles_x : nwaves_xy_tiling;
const int ntiles_x = (args.ncols_y + mmq_x - 1) / mmq_x;
if (nparts < nparts_best) {
mmq_x_best = mmq_x;
nparts_best = nparts;
if (ntiles_x < ntiles_x_best) {
mmq_x_best = mmq_x;
ntiles_x_best = ntiles_x;
}
}
@@ -2954,6 +3197,9 @@ extern DECL_MMQ_CASE(GGML_TYPE_IQ4_XS);
// -------------------------------------------------------------------------------------------------------------------------
void ggml_cuda_mul_mat_q(
ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * ids, ggml_tensor * dst);
void ggml_cuda_op_mul_mat_q(
ggml_backend_cuda_context & ctx,
const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, const char * src0_dd_i, const float * src1_ddf_i,

View File

@@ -158,7 +158,7 @@ static __global__ void mul_mat_vec_q(
const int blocks_per_row_x = ncols_x / qk;
constexpr int blocks_per_iter = vdr * nwarps*warp_size / qi;
// The MUL_MAT_ID code path with ids != nullptr is only implemetned for ncols_dst == 1.
// The MUL_MAT_ID code path with ids != nullptr is only implemented for ncols_dst == 1.
const int channel_dst = blockIdx.y;
const int channel_x = ncols_dst == 1 && ids ? ids[channel_dst] : channel_dst / channel_ratio;
const int channel_y = ncols_dst == 1 && ids ? channel_dst % nchannels_y : channel_dst;
@@ -507,7 +507,7 @@ void ggml_cuda_mul_mat_vec_q(
GGML_ASSERT( nb0 == ts_dst);
GGML_ASSERT(!ids || ids->nb[0] == ggml_type_size(ids->type));
GGML_ASSERT(!ids || ne12 == 1); // Implementation is only correct for batch size 1.
GGML_ASSERT(!ids || ne12 == 1); // Implementation is only correct for batch size 1.
const float * src1_d = (const float *) src1->data;
const int32_t * ids_d = ids ? (const int32_t *) ids->data : nullptr;
@@ -519,7 +519,7 @@ void ggml_cuda_mul_mat_vec_q(
const int64_t s11 = src1->nb[1] / ts_src1;
const int64_t s12 = src1->nb[2] / ts_src1;
const int64_t s13 = src1->nb[3] / ts_src1;
quantize_row_q8_1_cuda(src1_d, src1_q8_1.get(), src0->type, ne10, s11, s12, s13, ne10_padded, ne11, ne12, ne13, stream);
quantize_row_q8_1_cuda(src1_d, nullptr, src1_q8_1.get(), src0->type, ne10, s11, s12, s13, ne10_padded, ne11, ne12, ne13, stream);
}
const int64_t s01 = src0->nb[1] / ts_src0;

View File

@@ -49,29 +49,38 @@ static __global__ void quantize_q8_1(
template <mmq_q8_1_ds_layout ds_layout>
static __global__ void quantize_mmq_q8_1(
const float * __restrict__ x, void * __restrict__ vy, const int64_t kx0, const int64_t kx1, const int64_t kx0_padded) {
const float * __restrict__ x, const int32_t * __restrict__ ids, void * __restrict__ vy,
const int64_t ne00, const int64_t s01, const int64_t s02, const int64_t s03,
const int64_t ne0, const int ne1, const int ne2) {
constexpr int vals_per_scale = ds_layout == MMQ_Q8_1_DS_LAYOUT_D2S6 ? 64 : 32;
constexpr int vals_per_sum = ds_layout == MMQ_Q8_1_DS_LAYOUT_D2S6 ? 16 : 32;
const int64_t ix0 = ((int64_t)blockDim.x*blockIdx.x + threadIdx.x)*4;
const int64_t i0 = ((int64_t)blockDim.x*blockIdx.x + threadIdx.x)*4;
if (ix0 >= kx0_padded) {
if (i0 >= ne0) {
return;
}
const float4 * x4 = (const float4 *) x;
const int64_t i1 = blockIdx.y;
const int64_t i2 = blockIdx.z % ne2;
const int64_t i3 = blockIdx.z / ne2;
const int64_t ix1 = kx1*blockIdx.z + blockIdx.y;
const int64_t i00 = i0;
const int64_t i01 = ids ? ids[i1] : i1;
const int64_t i02 = i2;
const int64_t i03 = i3;
const float4 * x4 = (const float4 *) x;
block_q8_1_mmq * y = (block_q8_1_mmq *) vy;
const int64_t ib0 = blockIdx.z*((int64_t)gridDim.y*gridDim.x*blockDim.x/QK8_1); // first block of channel
const int64_t ib = ib0 + (ix0 / (4*QK8_1))*kx1 + blockIdx.y; // block index in channel
const int64_t iqs = ix0 % (4*QK8_1); // quant index in block
const int64_t ib = ib0 + (i0 / (4*QK8_1))*ne1 + blockIdx.y; // block index in channel
const int64_t iqs = i0 % (4*QK8_1); // quant index in block
// Load 4 floats per thread and calculate max. abs. value between them:
const float4 xi = ix0 < kx0 ? x4[(ix1*kx0 + ix0)/4] : make_float4(0.0f, 0.0f, 0.0f, 0.0f);
const float4 xi = i0 < ne00 ? x4[(i03*s03 + i02*s02 + i01*s01 + i00)/4] : make_float4(0.0f, 0.0f, 0.0f, 0.0f);
float amax = fabsf(xi.x);
amax = fmaxf(amax, fabsf(xi.y));
amax = fmaxf(amax, fabsf(xi.z));
@@ -87,7 +96,7 @@ static __global__ void quantize_mmq_q8_1(
if (ds_layout != MMQ_Q8_1_DS_LAYOUT_D4) {
sum = xi.x + xi.y + xi.z + xi.w;
// Exchange calculate sum across vals_per_sum/4 threads.
// Calculate sums across vals_per_sum/4 threads.
#pragma unroll
for (int offset = vals_per_sum/8; offset > 0; offset >>= 1) {
sum += __shfl_xor_sync(0xFFFFFFFF, sum, offset, WARP_SIZE);
@@ -137,9 +146,10 @@ static __global__ void quantize_mmq_q8_1(
}
void quantize_row_q8_1_cuda(
const float * x, void * vy, const ggml_type type_src0, const int64_t ne00, const int64_t s01, const int64_t s02, const int64_t s03,
const int64_t ne0, const int64_t ne1, const int64_t ne2, const int64_t ne3, cudaStream_t stream) {
const float * x, const int32_t * ids, void * vy, const ggml_type type_src0,
const int64_t ne00, const int64_t s01, const int64_t s02, const int64_t s03,
const int64_t ne0, const int64_t ne1, const int64_t ne2, const int64_t ne3, cudaStream_t stream) {
GGML_ASSERT(!ids);
GGML_ASSERT(ne0 % QK8_1 == 0);
const int64_t block_num_x = (ne0 + CUDA_QUANTIZE_BLOCK_SIZE - 1) / CUDA_QUANTIZE_BLOCK_SIZE;
@@ -150,9 +160,9 @@ void quantize_row_q8_1_cuda(
}
void quantize_mmq_q8_1_cuda(
const float * x, void * vy, const ggml_type type_src0, const int64_t ne00, const int64_t s01, const int64_t s02, const int64_t s03,
const int64_t ne0, const int64_t ne1, const int64_t ne2, const int64_t ne3, cudaStream_t stream) {
const float * x, const int32_t * ids, void * vy, const ggml_type type_src0,
const int64_t ne00, const int64_t s01, const int64_t s02, const int64_t s03,
const int64_t ne0, const int64_t ne1, const int64_t ne2, const int64_t ne3, cudaStream_t stream) {
GGML_ASSERT(ne0 % (4*QK8_1) == 0);
const int64_t block_num_x = (ne0 + 4*CUDA_QUANTIZE_BLOCK_SIZE_MMQ - 1) / (4*CUDA_QUANTIZE_BLOCK_SIZE_MMQ);
@@ -161,21 +171,18 @@ void quantize_mmq_q8_1_cuda(
switch (mmq_get_q8_1_ds_layout(type_src0)) {
case MMQ_Q8_1_DS_LAYOUT_D4:
quantize_mmq_q8_1<MMQ_Q8_1_DS_LAYOUT_D4>
<<<num_blocks, block_size, 0, stream>>>(x, vy, ne00, ne1, ne0);
<<<num_blocks, block_size, 0, stream>>>(x, ids, vy, ne00, s01, s02, s03, ne0, ne1, ne2);
break;
case MMQ_Q8_1_DS_LAYOUT_DS4:
quantize_mmq_q8_1<MMQ_Q8_1_DS_LAYOUT_DS4>
<<<num_blocks, block_size, 0, stream>>>(x, vy, ne00, ne1, ne0);
<<<num_blocks, block_size, 0, stream>>>(x, ids, vy, ne00, s01, s02, s03, ne0, ne1, ne2);
break;
case MMQ_Q8_1_DS_LAYOUT_D2S6:
quantize_mmq_q8_1<MMQ_Q8_1_DS_LAYOUT_D2S6>
<<<num_blocks, block_size, 0, stream>>>(x, vy, ne00, ne1, ne0);
<<<num_blocks, block_size, 0, stream>>>(x, ids, vy, ne00, s01, s02, s03, ne0, ne1, ne2);
break;
default:
GGML_ABORT("fatal error");
break;
}
GGML_UNUSED(s01);
GGML_UNUSED(s02);
GGML_UNUSED(s03);
}

View File

@@ -12,13 +12,16 @@ static_assert(MATRIX_ROW_PADDING % CUDA_QUANTIZE_BLOCK_SIZE == 0, "Risk
static_assert(MATRIX_ROW_PADDING % (4*CUDA_QUANTIZE_BLOCK_SIZE_MMQ) == 0, "Risk of out-of-bounds access.");
typedef void (*quantize_cuda_t)(
const float * x, void * vy, const ggml_type type_src0, const int64_t ne00, const int64_t s01, const int64_t s02, const int64_t s03,
const int64_t ne0, const int64_t ne1, const int64_t ne2, const int64_t ne3, cudaStream_t stream);
const float * x, const int32_t * ids, void * vy,
ggml_type type_src0, int64_t ne00, int64_t s01, int64_t s02, int64_t s03,
int64_t ne0, int64_t ne1, int64_t ne2, int64_t ne3, cudaStream_t stream);
void quantize_row_q8_1_cuda(
const float * x, void * vy, const ggml_type type_src0, const int64_t ne00, const int64_t s01, const int64_t s02, const int64_t s03,
const int64_t ne0, const int64_t ne1, const int64_t ne2, const int64_t ne3, cudaStream_t stream);
const float * x, const int32_t * ids, void * vy,
ggml_type type_src0, int64_t ne00, int64_t s01, int64_t s02, int64_t s03,
int64_t ne0, int64_t ne1, int64_t ne2, int64_t ne3, cudaStream_t stream);
void quantize_mmq_q8_1_cuda(
const float * x, void * vy, const ggml_type type_src0, const int64_t ne00, const int64_t s01, const int64_t s02, const int64_t s03,
const int64_t ne0, const int64_t ne1, const int64_t ne2, const int64_t ne3, cudaStream_t stream);
const float * x, const int32_t * ids, void * vy,
ggml_type type_src0, int64_t ne00, int64_t s01, int64_t s02, int64_t s03,
int64_t ne0, int64_t ne1, int64_t ne2, int64_t ne3, cudaStream_t stream);

View File

@@ -482,7 +482,7 @@ float16_t dequantFuncIQ2_XXS(const in decodeBufIQ2_XXS bl, const in uint blockCo
const uint ib8 = (idx & 0x18) >> 3; // 0..3
const uint iqs = 8 * ib32 + ib8;
const uint8_t qs = bl.block.qs[iqs];
const uint qs = bl.block.qs[iqs];
const uint signscale = pack32(u16vec2(bl16.block.qs[4*ib32+2], bl16.block.qs[4*ib32+3]));
const float dscale = float(bl.block.d) * 0.25 * (0.5 + float(signscale >> 28));

View File

@@ -1232,6 +1232,7 @@ extern "C" {
"will be removed in the future (see https://github.com/ggml-org/llama.cpp/pull/9896#discussion_r1800920915)");
/// @details Top-K sampling described in academic paper "The Curious Case of Neural Text Degeneration" https://arxiv.org/abs/1904.09751
/// Setting k <= 0 makes this a noop
LLAMA_API struct llama_sampler * llama_sampler_init_top_k (int32_t k);
/// @details Nucleus sampling described in academic paper "The Curious Case of Neural Text Degeneration" https://arxiv.org/abs/1904.09751

View File

@@ -19,9 +19,9 @@ logger = logging.getLogger("compare-llama-bench")
# Properties by which to differentiate results per commit:
KEY_PROPERTIES = [
"cpu_info", "gpu_info", "backends", "n_gpu_layers", "model_filename", "model_type", "n_batch", "n_ubatch",
"embeddings", "cpu_mask", "cpu_strict", "poll", "n_threads", "type_k", "type_v", "use_mmap", "no_kv_offload",
"split_mode", "main_gpu", "tensor_split", "flash_attn", "n_prompt", "n_gen"
"cpu_info", "gpu_info", "backends", "n_gpu_layers", "tensor_buft_overrides", "model_filename", "model_type",
"n_batch", "n_ubatch", "embeddings", "cpu_mask", "cpu_strict", "poll", "n_threads", "type_k", "type_v",
"use_mmap", "no_kv_offload", "split_mode", "main_gpu", "tensor_split", "flash_attn", "n_prompt", "n_gen", "n_depth"
]
# Properties that are boolean and are converted to Yes/No for the table:
@@ -30,11 +30,11 @@ BOOL_PROPERTIES = ["embeddings", "cpu_strict", "use_mmap", "no_kv_offload", "fla
# Header names for the table:
PRETTY_NAMES = {
"cpu_info": "CPU", "gpu_info": "GPU", "backends": "Backends", "n_gpu_layers": "GPU layers",
"model_filename": "File", "model_type": "Model", "model_size": "Model size [GiB]",
"model_n_params": "Num. of par.", "n_batch": "Batch size", "n_ubatch": "Microbatch size",
"embeddings": "Embeddings", "cpu_mask": "CPU mask", "cpu_strict": "CPU strict", "poll": "Poll",
"n_threads": "Threads", "type_k": "K type", "type_v": "V type", "split_mode": "Split mode", "main_gpu": "Main GPU",
"no_kv_offload": "NKVO", "flash_attn": "FlashAttention", "tensor_split": "Tensor split", "use_mmap": "Use mmap",
"tensor_buft_overrides": "Tensor overrides", "model_filename": "File", "model_type": "Model", "model_size": "Model size [GiB]",
"model_n_params": "Num. of par.", "n_batch": "Batch size", "n_ubatch": "Microbatch size", "embeddings": "Embeddings",
"cpu_mask": "CPU mask", "cpu_strict": "CPU strict", "poll": "Poll", "n_threads": "Threads", "type_k": "K type", "type_v": "V type",
"use_mmap": "Use mmap", "no_kv_offload": "NKVO", "split_mode": "Split mode", "main_gpu": "Main GPU", "tensor_split": "Tensor split",
"flash_attn": "FlashAttention",
}
DEFAULT_SHOW = ["model_type"] # Always show these properties by default.
@@ -281,12 +281,12 @@ def get_rows(properties):
The returned rows are unique in terms of property combinations.
"""
select_string = ", ".join(
[f"tb.{p}" for p in properties] + ["tb.n_prompt", "tb.n_gen", "AVG(tb.avg_ts)", "AVG(tc.avg_ts)"])
[f"tb.{p}" for p in properties] + ["tb.n_prompt", "tb.n_gen", "tb.n_depth", "AVG(tb.avg_ts)", "AVG(tc.avg_ts)"])
equal_string = " AND ".join(
[f"tb.{p} = tc.{p}" for p in KEY_PROPERTIES] + [
f"tb.build_commit = '{hexsha8_baseline}'", f"tc.build_commit = '{hexsha8_compare}'"]
)
group_order_string = ", ".join([f"tb.{p}" for p in properties] + ["tb.n_gen", "tb.n_prompt"])
group_order_string = ", ".join([f"tb.{p}" for p in properties] + ["tb.n_gen", "tb.n_prompt", "tb.n_depth"])
query = (f"SELECT {select_string} FROM test tb JOIN test tc ON {equal_string} "
f"GROUP BY {group_order_string} ORDER BY {group_order_string};")
return cursor.execute(query).fetchall()
@@ -309,7 +309,7 @@ else:
rows_full = get_rows(KEY_PROPERTIES)
properties_different = []
for i, kp_i in enumerate(KEY_PROPERTIES):
if kp_i in DEFAULT_SHOW or kp_i == "n_prompt" or kp_i == "n_gen":
if kp_i in DEFAULT_SHOW or kp_i in ["n_prompt", "n_gen", "n_depth"]:
continue
for row_full in rows_full:
if row_full[i] != rows_full[0][i]:
@@ -340,17 +340,20 @@ else:
table = []
for row in rows_show:
n_prompt = int(row[-4])
n_gen = int(row[-3])
n_prompt = int(row[-5])
n_gen = int(row[-4])
n_depth = int(row[-3])
if n_prompt != 0 and n_gen == 0:
test_name = f"pp{n_prompt}"
elif n_prompt == 0 and n_gen != 0:
test_name = f"tg{n_gen}"
else:
test_name = f"pp{n_prompt}+tg{n_gen}"
if n_depth != 0:
test_name = f"{test_name}@d{n_depth}"
# Regular columns test name avg t/s values Speedup
# VVVVVVVVVVVVV VVVVVVVVV VVVVVVVVVVVVVV VVVVVVV
table.append(list(row[:-4]) + [test_name] + list(row[-2:]) + [float(row[-1]) / float(row[-2])])
table.append(list(row[:-5]) + [test_name] + list(row[-2:]) + [float(row[-1]) / float(row[-2])])
# Some a-posteriori fixes to make the table contents prettier:
for bool_property in BOOL_PROPERTIES:
@@ -376,7 +379,7 @@ if "gpu_info" in show:
for gns in GPU_NAME_STRIP:
row_table[ip] = row_table[ip].replace(gns, "")
gpu_names = row_table[ip].split("/")
gpu_names = row_table[ip].split(", ")
num_gpus = len(gpu_names)
all_names_the_same = len(set(gpu_names)) == 1
if len(gpu_names) >= 2 and all_names_the_same:

View File

@@ -1 +1 @@
13bcf9ce50651a8b4238ec6d136f46f2c1b23b6f
7734363ac74a14d7f2c8033c552d9d655c5dfc00

View File

@@ -114,7 +114,7 @@ llama_context::llama_context(
}
if (n_ctx_per_seq > hparams.n_ctx_train) {
LLAMA_LOG_WARN("%s: n_ctx_pre_seq (%u) > n_ctx_train (%u) -- possible training context overflow\n",
LLAMA_LOG_WARN("%s: n_ctx_per_seq (%u) > n_ctx_train (%u) -- possible training context overflow\n",
__func__, n_ctx_per_seq, hparams.n_ctx_train);
}

View File

@@ -55,13 +55,16 @@ void llm_graph_input_pos::set_input(const llama_ubatch * ubatch) {
if (ubatch->pos && pos) {
const int64_t n_tokens = ubatch->n_tokens;
if (ubatch->token && n_pos_per_embd > 1) {
if (ubatch->token && n_pos_per_embd == 4) {
// in case we're using M-RoPE with text tokens, convert the 1D positions to 4D
// the other dimensions are all 0, they are unused for text tokens
std::vector<llama_pos> pos_data(n_tokens*n_pos_per_embd, 0);
// the 3 first dims are the same, and 4th dim is all 0
std::vector<llama_pos> pos_data(n_tokens*n_pos_per_embd);
// copy the first dimension
for (int i = 0; i < n_tokens; ++i) {
pos_data[i] = ubatch->pos[i];
pos_data[ i] = ubatch->pos[i];
pos_data[ n_tokens + i] = ubatch->pos[i];
pos_data[2 * n_tokens + i] = ubatch->pos[i];
pos_data[3 * n_tokens + i] = 0; // 4th dim is 0
}
ggml_backend_tensor_set(pos, pos_data.data(), 0, pos_data.size()*ggml_element_size(pos));
} else {

View File

@@ -43,11 +43,13 @@ const char * llm_type_name(llm_type type) {
case LLM_TYPE_770M: return "770M";
case LLM_TYPE_780M: return "780M";
case LLM_TYPE_0_5B: return "0.5B";
case LLM_TYPE_0_6B: return "0.6B";
case LLM_TYPE_1B: return "1B";
case LLM_TYPE_1_3B: return "1.3B";
case LLM_TYPE_1_4B: return "1.4B";
case LLM_TYPE_1_5B: return "1.5B";
case LLM_TYPE_1_6B: return "1.6B";
case LLM_TYPE_1_7B: return "1.7B";
case LLM_TYPE_1_8B: return "1.8B";
case LLM_TYPE_2B: return "2B";
case LLM_TYPE_2_8B: return "2.8B";
@@ -66,6 +68,7 @@ const char * llm_type_name(llm_type type) {
case LLM_TYPE_15B: return "15B";
case LLM_TYPE_16B: return "16B";
case LLM_TYPE_20B: return "20B";
case LLM_TYPE_27B: return "27B";
case LLM_TYPE_30B: return "30B";
case LLM_TYPE_32B: return "32B";
case LLM_TYPE_34B: return "34B";
@@ -74,6 +77,7 @@ const char * llm_type_name(llm_type type) {
case LLM_TYPE_65B: return "65B";
case LLM_TYPE_70B: return "70B";
case LLM_TYPE_236B: return "236B";
case LLM_TYPE_290B: return "290B";
case LLM_TYPE_314B: return "314B";
case LLM_TYPE_671B: return "671B";
case LLM_TYPE_SMALL: return "0.1B";
@@ -88,10 +92,10 @@ const char * llm_type_name(llm_type type) {
case LLM_TYPE_16x3_8B: return "16x3.8B";
case LLM_TYPE_10B_128x3_66B: return "10B+128x3.66B";
case LLM_TYPE_57B_A14B: return "57B.A14B";
case LLM_TYPE_27B: return "27B";
case LLM_TYPE_290B: return "290B";
case LLM_TYPE_17B_16E: return "17Bx16E (Scout)";
case LLM_TYPE_17B_128E: return "17Bx128E (Maverick)";
case LLM_TYPE_30B_A3B: return "30B.A3B";
case LLM_TYPE_235B_A22B: return "235B.A22B";
default: return "?B";
}
}
@@ -793,6 +797,10 @@ void llama_model::load_hparams(llama_model_loader & ml) {
{
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
switch (hparams.n_layer) {
case 28: type = hparams.n_embd == 1024 ? LLM_TYPE_0_6B : LLM_TYPE_1_7B; break;
case 36: type = hparams.n_embd == 2560 ? LLM_TYPE_4B : LLM_TYPE_8B; break;
case 40: type = LLM_TYPE_14B; break;
case 64: type = LLM_TYPE_32B; break;
default: type = LLM_TYPE_UNKNOWN;
}
} break;
@@ -802,6 +810,8 @@ void llama_model::load_hparams(llama_model_loader & ml) {
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
switch (hparams.n_layer) {
case 48: type = LLM_TYPE_30B_A3B; break;
case 94: type = LLM_TYPE_235B_A22B; break;
default: type = LLM_TYPE_UNKNOWN;
}
} break;
@@ -10185,7 +10195,6 @@ struct llm_build_deepseek2 : public llm_graph_context {
// {n_embd_head_qk_nope, kv_lora_rank, n_head} x {n_embd_head_qk_nope, n_tokens, n_head}
ggml_tensor * q_nope_absorbed = ggml_mul_mat(ctx0, model.layers[il].wk_b, q_nope);
ggml_mul_mat_set_prec(q_nope_absorbed, GGML_PREC_F32);
cb(q_nope_absorbed, "q_nope_absorbed", il);
// {kv_lora_rank, n_head, n_tokens}

View File

@@ -39,11 +39,13 @@ enum llm_type {
LLM_TYPE_770M,
LLM_TYPE_780M,
LLM_TYPE_0_5B,
LLM_TYPE_0_6B,
LLM_TYPE_1B,
LLM_TYPE_1_3B,
LLM_TYPE_1_4B,
LLM_TYPE_1_5B,
LLM_TYPE_1_6B,
LLM_TYPE_1_7B,
LLM_TYPE_1_8B,
LLM_TYPE_2B,
LLM_TYPE_2_8B,
@@ -62,6 +64,7 @@ enum llm_type {
LLM_TYPE_15B,
LLM_TYPE_16B,
LLM_TYPE_20B,
LLM_TYPE_27B,
LLM_TYPE_30B,
LLM_TYPE_32B,
LLM_TYPE_34B,
@@ -70,6 +73,7 @@ enum llm_type {
LLM_TYPE_65B,
LLM_TYPE_70B,
LLM_TYPE_236B,
LLM_TYPE_290B,
LLM_TYPE_314B,
LLM_TYPE_671B,
LLM_TYPE_SMALL,
@@ -84,10 +88,10 @@ enum llm_type {
LLM_TYPE_16x3_8B,
LLM_TYPE_10B_128x3_66B,
LLM_TYPE_57B_A14B,
LLM_TYPE_27B,
LLM_TYPE_290B,
LLM_TYPE_17B_16E, // llama4 Scout
LLM_TYPE_17B_128E, // llama4 Maverick
LLM_TYPE_30B_A3B,
LLM_TYPE_235B_A22B,
};
struct llama_layer_posnet {

View File

@@ -232,7 +232,7 @@ static void llama_sampler_top_k_impl(llama_token_data_array * cur_p, int32_t k)
// }
if (k <= 0) {
k = cur_p->size;
return;
}
k = std::min(k, (int) cur_p->size);
@@ -298,6 +298,7 @@ static void llama_sampler_top_k_impl(llama_token_data_array * cur_p, int32_t k)
}
cur_p->sorted = true;
}
cur_p->size = k;
}

View File

@@ -4184,6 +4184,11 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_eval() {
test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 16, 256, {2, 3}, {1, 1}, {0, 2, 1, 3}));
test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 16, 256, {2, 3}, {1, 1}, {0, 1, 3, 2}));
test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 16, 256, {2, 3}, {1, 1}, {0, 3, 2, 1}));
// test cases with large ne00/ne10 to cover stream-k fixup
test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 1, 1024, {3, 2}, {1, 1}));
test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 8, 1024, {3, 2}, {1, 1}));
test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 16, 1024, {3, 2}, {1, 1}));
}
}
for (ggml_type type_a : other_types) {

View File

@@ -1,4 +1,5 @@
#include "ggml.h"
#include "ggml-cpu.h"
#include "llama.h"
#include "common.h"