Compare commits

...

12 Commits

Author SHA1 Message Date
Sigbjørn Skjæret
d3a2eb592d disable on windows 2025-05-31 23:17:18 +02:00
Sigbjørn Skjæret
7210ebe230 revert build changes 2025-05-31 23:16:56 +02:00
Sigbjørn Skjæret
05f94a0e90 add arch to matrix 2025-05-31 22:54:37 +02:00
Sigbjørn Skjæret
f9a27178e5 download in batches 2025-05-31 22:35:26 +02:00
Sigbjørn Skjæret
de8ec1348b Merge branch 'master' into cisc/test-tokenizers-remote 2025-05-31 21:25:34 +02:00
Sigbjørn Skjæret
8e1125a8db copy curl dll for tests 2025-05-31 21:22:37 +02:00
Sigbjørn Skjæret
4b4843adf3 windows builds adds build type to runtime output 2025-05-30 11:51:46 +02:00
Sigbjørn Skjæret
d97b9ade51 correct working directory for all builds
..and change cache file name as per suggestion.
2025-05-28 12:49:36 +02:00
Sigbjørn Skjæret
0fe7183ae4 fix prototype for non-curl builds 2025-05-28 11:11:02 +02:00
Sigbjørn Skjæret
ecbc92acd0 correct working directory 2025-05-28 10:16:34 +02:00
Sigbjørn Skjæret
42ff1867bc add test-tokenizers-remote 2025-05-28 09:51:44 +02:00
Sigbjørn Skjæret
2d2e059f4f make common_download_file_single/multiple public 2025-05-28 09:50:41 +02:00
4 changed files with 178 additions and 6 deletions

View File

@@ -244,7 +244,7 @@ static bool curl_perform_with_retry(const std::string & url, CURL * curl, int ma
}
// download one single file from remote URL to local path
static bool common_download_file_single(const std::string & url, const std::string & path, const std::string & bearer_token, bool offline) {
bool common_download_file_single(const std::string & url, const std::string & path, const std::string & bearer_token, bool offline) {
// Check if the file already exists locally
auto file_exists = std::filesystem::exists(path);
@@ -467,7 +467,7 @@ static bool common_download_file_single(const std::string & url, const std::stri
// download multiple files from remote URLs to local paths
// the input is a vector of pairs <url, path>
static bool common_download_file_multiple(const std::vector<std::pair<std::string, std::string>> & urls, const std::string & bearer_token, bool offline) {
bool common_download_file_multiple(const std::vector<std::pair<std::string, std::string>> & urls, const std::string & bearer_token, bool offline) {
// Prepare download in parallel
std::vector<std::future<bool>> futures_download;
for (auto const & item : urls) {
@@ -711,12 +711,12 @@ bool common_has_curl() {
return false;
}
static bool common_download_file_single(const std::string &, const std::string &, const std::string &, bool) {
bool common_download_file_single(const std::string &, const std::string &, const std::string &, bool) {
LOG_ERR("error: built without CURL, cannot download model from internet\n");
return false;
}
static bool common_download_file_multiple(const std::vector<std::pair<std::string, std::string>> &, const std::string &, bool) {
bool common_download_file_multiple(const std::vector<std::pair<std::string, std::string>> &, const std::string &, bool) {
LOG_ERR("error: built without CURL, cannot download model from the internet\n");
return false;
}

View File

@@ -87,3 +87,10 @@ struct common_remote_params {
};
// get remote file content, returns <http_code, raw_response_body>
std::pair<long, std::vector<char>> common_remote_get_content(const std::string & url, const common_remote_params & params);
// download one single file from remote URL to local path
bool common_download_file_single(const std::string & url, const std::string & path, const std::string & bearer_token, bool offline);
// download multiple files from remote URLs to local paths
// the input is a vector of pairs <url, path>
bool common_download_file_multiple(const std::vector<std::pair<std::string, std::string>> & urls, const std::string & bearer_token, bool offline);

View File

@@ -97,8 +97,9 @@ llama_test(test-tokenizer-0 NAME test-tokenizer-0-qwen2 ARGS ${CMAKE
llama_test(test-tokenizer-0 NAME test-tokenizer-0-refact ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-refact.gguf)
llama_test(test-tokenizer-0 NAME test-tokenizer-0-starcoder ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-starcoder.gguf)
# TODO: missing HF tokenizer for this model in convert_hf_to_gguf_update.py, see https://github.com/ggml-org/llama.cpp/pull/13847
# llama_test(test-tokenizer-0 NAME test-tokenizer-0-nomic-bert-moe ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-nomic-bert-moe.gguf)
# Remote tokenizer test needs libcurl for downloading vocab fixtures.
# NOTE(review): disabled on Windows — presumably due to curl DLL/runtime setup
# issues in CI (see "disable on windows" commit); confirm before re-enabling.
if (LLAMA_CURL AND NOT WIN32)
    llama_build_and_test(test-tokenizers-remote.cpp WORKING_DIRECTORY ${CMAKE_RUNTIME_OUTPUT_DIRECTORY})
endif()
if (LLAMA_LLGUIDANCE)
llama_build_and_test(test-grammar-llguidance.cpp ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-llama-bpe.gguf)

View File

@@ -0,0 +1,164 @@
#include "arg.h"
#include "common.h"
#include <string>
#include <fstream>
#include <vector>
#include <nlohmann/json.hpp>
using json = nlohmann::json;
#undef NDEBUG
#include <cassert>
// Hugging Face endpoint; must end with '/' since API/resolve paths are appended directly.
std::string endpoint = "https://huggingface.co/";
// repository holding the vocab test fixtures (*.gguf plus *.gguf.inp/*.gguf.out pairs)
std::string repo = "ggml-org/vocabs";
// Write `content` to `fname`, truncating any existing file.
// Failures to open or write are silently ignored (best-effort cache write);
// callers that need the data re-read the file and handle absence themselves.
static void write_file(const std::string & fname, const std::string & content) {
    std::ofstream file(fname);
    if (file) {
        // no explicit close(): the ofstream destructor flushes and closes (RAII)
        file << content;
    }
}
// Fetch the file listing of a Hugging Face repo via the tree API
// (GET {endpoint}api/models/{repo}/tree/{branch}[/{path}][?recursive=true]).
//
// hf_repo_with_branch: "owner/repo", optionally suffixed ":branch" (defaults to "main").
// recursive:           list subdirectories recursively.
// repo_path:           subdirectory to list; empty lists the repo root.
// bearer_token:        optional HF token for private/gated repos.
//
// On a 200 response the raw body is cached to disk; on any network/API error
// the function falls back to that cache. Returns an empty json object when
// both the request and the cache read fail.
static json get_hf_repo_dir(const std::string & hf_repo_with_branch, bool recursive, const std::string & repo_path, const std::string & bearer_token) {
    // NOTE(review): assumes hf_repo_with_branch is non-empty; parts[0] below
    // would be out of range if string_split returned an empty vector — confirm.
    auto parts = string_split<std::string>(hf_repo_with_branch, ':');
    std::string branch = parts.size() > 1 ? parts.back() : "main";
    std::string hf_repo = parts[0];
    std::string url = endpoint + "api/models/" + hf_repo + "/tree/" + branch;
    std::string path = repo_path;
    if (!path.empty()) {
        // FIXME: path should be properly url-encoded!
        // (only '/' is escaped here; other reserved characters would break the URL)
        string_replace_all(path, "/", "%2F");
        url += "/" + path;
    }
    if (recursive) {
        url += "?recursive=true";
    }
    // headers
    std::vector<std::string> headers;
    headers.push_back("Accept: application/json");
    if (!bearer_token.empty()) {
        headers.push_back("Authorization: Bearer " + bearer_token);
    }
    // we use "=" to avoid clashing with other component, while still being allowed on windows
    std::string cached_response_fname = "test_vocab=" + hf_repo + "/" + repo_path + "=" + branch + ".json";
    string_replace_all(cached_response_fname, "/", "_");
    std::string cached_response_path = fs_get_cache_file(cached_response_fname);
    // make the request
    common_remote_params params;
    params.headers = headers;
    json res_data;
    try {
        // TODO: For pagination links we need response headers, which is not provided by common_remote_get_content()
        auto res = common_remote_get_content(url, params);
        long res_code = res.first;
        std::string res_str = std::string(res.second.data(), res.second.size());
        if (res_code == 200) {
            // refresh the on-disk cache so offline runs can reuse this listing
            write_file(cached_response_path, res_str);
        } else if (res_code == 401) {
            throw std::runtime_error("error: model is private or does not exist; if you are accessing a gated model, please provide a valid HF token");
        } else {
            throw std::runtime_error(string_format("error from HF API, response code: %ld, data: %s", res_code, res_str.c_str()));
        }
    } catch (const std::exception & e) {
        // network/API failure is non-fatal here: fall through to the cache below
        fprintf(stderr, "error: failed to get repo tree: %s\n", e.what());
        fprintf(stderr, "try reading from cache\n");
    }
    // try to read from cache (also the read path after a successful refresh above)
    try {
        std::ifstream f(cached_response_path);
        res_data = json::parse(f);
    } catch (const std::exception & e) {
        // no cache and no network: res_data stays empty; caller checks tree.empty()
        fprintf(stderr, "error: failed to get repo tree (check your internet connection)\n");
    }
    return res_data;
}
// Integration test: list the ggml-org/vocabs repo, download every vocab fixture
// (*.gguf with its *.gguf.inp/*.gguf.out pair) into the local cache dir, then run
// ./test-tokenizer-0 on each vocab via std::system and assert it exits with 0.
// All failure modes short of a failing tokenizer test are reported and skipped,
// so the test degrades gracefully without curl or network access.
int main(void) {
    if (common_has_curl()) {
        json tree = get_hf_repo_dir(repo, true, {}, {});
        if (!tree.empty()) {
            // collect <url, local-path> pairs for all relevant files in the tree
            std::vector<std::pair<std::string, std::string>> files;
            for (const auto & item : tree) {
                if (item.at("type") == "file") {
                    std::string path = item.at("path");
                    if (string_ends_with(path, ".gguf") || string_ends_with(path, ".gguf.inp") || string_ends_with(path, ".gguf.out")) {
                        // this is to avoid different repo having same file name, or same file name in different subdirs
                        std::string filepath = repo + "_" + path;
                        // to make sure we don't have any slashes in the filename
                        string_replace_all(filepath, "/", "_");
                        // to make sure we don't have any quotes in the filename
                        string_replace_all(filepath, "'", "_");
                        filepath = fs_get_cache_file(filepath);
                        files.push_back({endpoint + repo + "/resolve/main/" + path, filepath});
                    }
                }
            }
            if (!files.empty()) {
                bool downloaded = false;
                // download in batches of 6 to bound the number of parallel transfers
                const size_t batch_size = 6;
                size_t batches = (files.size() + batch_size - 1) / batch_size;
                for (size_t i = 0; i < batches; i++) {
                    size_t batch_pos = (i * batch_size);
                    size_t batch_step = batch_pos + batch_size;
                    auto batch_begin = files.begin() + batch_pos;
                    // clamp the last batch to the end of the file list
                    auto batch_end = batch_step >= files.size() ? files.end() : files.begin() + batch_step;
                    std::vector<std::pair<std::string, std::string>> batch(batch_begin, batch_end);
                    // abort remaining batches on the first failed download
                    if (!(downloaded = common_download_file_multiple(batch, {}, false))) {
                        break;
                    }
                }
                if (downloaded) {
                    std::string dir_sep(1, DIRECTORY_SEPARATOR);
                    for (auto const & item : files) {
                        std::string filepath = item.second;
                        if (string_ends_with(filepath, ".gguf")) {
                            // a vocab is only testable if both its .inp and .out companions were listed
                            std::string vocab_inp = filepath + ".inp";
                            std::string vocab_out = filepath + ".out";
                            auto matching_inp = std::find_if(files.begin(), files.end(), [&vocab_inp](const auto & p) {
                                return p.second == vocab_inp;
                            });
                            auto matching_out = std::find_if(files.begin(), files.end(), [&vocab_out](const auto & p) {
                                return p.second == vocab_out;
                            });
                            if (matching_inp != files.end() && matching_out != files.end()) {
                                // single-quote the path for the shell; quotes in the name were
                                // replaced with '_' above, so this cannot be broken out of.
                                // NOTE(review): single quotes are POSIX-shell syntax — this relies
                                // on the test being disabled on Windows (see CMakeLists).
                                std::string test_command = "." + dir_sep + "test-tokenizer-0 '" + filepath + "'";
                                // NDEBUG is #undef'd above, so this assert is active in release builds too
                                assert(std::system(test_command.c_str()) == 0);
                            } else {
                                printf("test-tokenizers-remote: %s found without .inp/out vocab files, skipping...\n", filepath.c_str());
                            }
                        }
                    }
                } else {
                    printf("test-tokenizers-remote: failed to download files, unable to perform tests...\n");
                }
            }
        } else {
            printf("test-tokenizers-remote: failed to retrieve repository info, unable to perform tests...\n");
        }
    } else {
        printf("test-tokenizers-remote: no curl, unable to perform tests...\n");
    }
}