mirror of
https://github.com/ggerganov/llama.cpp.git
synced 2026-02-05 13:53:23 +02:00
* common : implement parser combinators to simplify chat parsing * add virtual destructor to parser_base * fix memory leak from circular references of rules * implement gbnf grammar building * remove unused private variable * create a base visitor and implement id assignment as a visitor * fix const ref for grammar builder * clean up types, friend classes, and class declarations * remove builder usage from until_parser * Use a counter class to help assign rule ids * cache everything * add short description for each parser * create a type for the root parser * implement repetition parser * Make optional, one_or_more, and zero_or_more subclasses of repetition * improve context constructor * improve until parsing and add benchmarks * remove cached() pattern, cache in parser_base with specialized parsing functions for each parser * improve json parsing performance to better match legacy parsing * fix const auto * it for windows * move id assignment to classes instead of using a visitor * create named rules in the command r7b example * use '.' 
for any in GBNF * fix parens around choices in gbnf grammar * add convenience operators to turn strings to literals * add free-form operators for const char * to simplify defining literals * simplify test case parser * implement semantic actions * remove groups in favor of actions and a scratchpad * add built in actions for common operations * add actions to command r7b example * use std::default_searcher for platforms that don't have bm * improve parser_type handling and add cast helper * add partial result type to better control when to run actions * fix bug in until() * run actions on partial results by default * use common_chat_msg for result * add qwen3 example wip * trash partial idea and simplify * move action arguments to a struct * implement aho-corasick matcher for until_parser and to build exclusion grammars * use std::string for input, since std::string_view is incompatible with std::regex * Refactor tests * improve qwen3 example * implement sax-style parsing and refactor * fix json string in test * rename classes to use common_chat_ prefix * remove is_ suffix from functions * rename from id_counter to just counter * Final refactored tests * Fix executable name and editorconfig-checker * Third time's the charm... 
* add trigger parser to begin lazy grammar rule generation * working lazy grammar * refactor json rules now that we check for reachability * reduce pointer usage * print out grammars in example * rename to chat-peg-parser* and common_chat_peg_parser* * Revert unrelated changes * New macros for CMakeLists to enable multi-file compilations * starting unicode support * add unicode support to char_parser * use unparsed args as additional sources * Refactor tests to new harness * Fix CMakeLists * fix rate calculation * add unicode tests * fix trailing whitespace and line endings skip-checks: true * Helpers + rewrite qwen3 with helpers * Fix whitespace * extract unicode functions to separate file * refactor parse unicode function * fix compiler error * improve construction of sequence/choice parsers * be less clever * add make_parser helper function * expand usage of make_parser, alias common_chat_msg_peg_parser_builder to builder in source * lower bench iterations * add unicode support to until_parser * add unicode support to json_string_parser * clean up unicode tests * reduce unicode details to match src/unicode.cpp * simplify even further * remove unused functions * fix type * reformat char class parsing * clean up json string parser * clean up + fix diagnostics * reorder includes * compact builder functions * replace action_parser with capture_parser, rename env to semantics * rename env to semantics * clean up common_chat_parse_context * move type() to below constant * use default constructor for common_chat_peg_parser * make all operators functions for consistency * fix compilation errors in test-optional.cpp * simplify result values * rename json_string_unquoted to json_string_content * Move helper to separate class, add separate explicit and helper classes * Whitespace * Change + to append() * Reformat * Add extra helpers, tests and Minimax example * Add some extra optional debugging prints + real example of how to use them * fix bug in repetitions when 
min_count = 0 reports failures * dump rule in debug * fix token accumulation and assert parsing never fails * indent debug by depth * use LOG_* in tests so logs sync up with test logs * - Add selective testing - Refactor all messaging to use LOG_ERR - Fix lack of argument / tool name capturing - Temporary fix for double event capture * refactor rule() and introduce ref() * clean up visitor * clean up indirection in root parser w.r.t rules * store shared ptr directly in parser classes * replace aho-corasick automation with a simple trie * Reset prev for qwen3 helper example variant * refactor to use value semantics with std::variant/std::visit * simplify trie_matcher result * fix linting issues * add annotations to rules * revert test workaround * implement serializing the parser * remove redundant parsers * remove tests * gbnf generation fixes * remove LOG_* use in tests * update gbnf tests to test entire grammar * clean up gbnf generation and fix a few bugs * fix typo in test output * remove implicit conversion rules * improve test output * rename trie_matcher to trie * simplify trie to just know if a node is the end of a word * remove common_chat_ prefix and ensure a common_peg_ prefix to all types * rename chat-peg-parser -> peg-parser * promote chat-peg-parser-helper to chat-peg-parser * checkpoint * use a static_assert to ensure we handle every branch * inline trivial peg parser builders * use json strings for now * implement basic and native chat peg parser builders/extractors * resolve refs to their rules * remove packrat caching (for now) * update tests * compare parsers with incremental input * benchmark both complete and incremental parsing * add raw string generation from json schema * add support for string schemas in gbnf generation * fix qwen example to include \n * tidy up example * rename extractor to mapper * rename ast_arena to ast * place basic tests into one * use gbnf_format_literal from json-schema-to-grammar * integrate parser with 
common/chat and server * clean up schema and serialization * add json-schema raw string tests * clean up json creation and remove capture parser * trim spaces from reasoning and content * clean up redundant rules and comments * rename input_is_complete to is_partial to match rest of project * simplify json rules * remove extraneous file * remove comment * implement += and |= operators * add comments to qwen3 implementation * reorder arguments to common_chat_peg_parse * remove commented outdated tests * add explicit copy constructor * fix operators and constness * wip: update test-chat for qwen3-coder * bring json parser closer to json-schema-to-grammar rules * trim trailing space for most things * fix qwen3 coder rules w.r.t. trailing spaces * group rules * do not trim trailing space from string args * tweak spacing of qwen3 grammar * update qwen3-coder tests * qwen3-coder small fixes * place parser in common_chat_syntax to simplify invocation * use std::set to collect rules to keep order predictable for tests * initialize parser to make certain platforms happy * revert back to std::unordered_set, sort rule names at the end instead * uncomment rest of chat tests * define explicit default constructor * improve arena init and server integration * fix chat test * add json_member() * add a comprehensive native example * clean up example qwen test and add response_format example to native test * make build_peg_parser accept std::function instead of template * change peg parser parameters into const ref * push tool call on tool open for constructed parser * add parsing documentation * clean up some comments * add json schema support to qwen3-coder * add id initializer in tests * remove grammar debug line from qwen3-coder * refactor qwen3-coder to use sequence over operators * only call common_chat_peg_parse if appropriate format * simplify qwen3-coder space handling * revert qwen3-coder implementation * revert json-schema-to-grammar changes * remove unnecessary forward 
declaration * small adjustment to until_parser * rename C/C++ files to use dashes * codeowners : add aldehir to peg-parser and related files --------- Co-authored-by: Piotr Wilkin <piotr.wilkin@syndatis.com>
450 lines
20 KiB
C++
450 lines
20 KiB
C++
#include "tests.h"
|
|
|
|
#include "peg-parser.h"
|
|
|
|
#include <string>
|
|
#include <sstream>
|
|
#include <iomanip>
|
|
#include <cctype>
|
|
|
|
// Checks that two parse result types agree. Both values are first mapped to
// their names via common_peg_parse_result_type_name(), and the names are then
// handed to t.assert_equal() as (expected, actual).
static void assert_result_equal(testing & t, common_peg_parse_result_type expected, common_peg_parse_result_type actual) {
    const auto expected_name = common_peg_parse_result_type_name(expected);
    const auto actual_name   = common_peg_parse_result_type_name(actual);

    t.assert_equal(expected_name, actual_name);
}
|
|
|
|
// Produces a readable rendering of `str`, used to label test cases.
//
// Bytes accepted by std::isprint() are copied through unchanged. Every other
// byte is written as a backslash, an 'x', and exactly two lowercase hex digits
// (e.g. byte 0xC3 becomes the four characters \xc3).
static std::string hex_dump(const std::string& str) {
    const char * const nibble_digits = "0123456789abcdef";

    std::string rendered;
    rendered.reserve(str.size());

    for (std::string::size_type pos = 0; pos < str.size(); ++pos) {
        // Go through unsigned char: std::isprint() requires a value that is
        // representable as unsigned char, and the shifts below need it too.
        const unsigned char byte = static_cast<unsigned char>(str[pos]);

        if (std::isprint(byte) != 0) {
            rendered.push_back(static_cast<char>(byte));
            continue;
        }

        rendered += "\\x";
        rendered.push_back(nibble_digits[byte >> 4]);
        rendered.push_back(nibble_digits[byte & 0x0F]);
    }

    return rendered;
}
|
|
|
|
void test_unicode(testing &t) {
|
|
struct test_case {
|
|
std::string input;
|
|
std::string expected_text;
|
|
common_peg_parse_result_type expected_result;
|
|
};
|
|
|
|
t.test("any", [](testing &t) {
|
|
std::vector<test_case> test_cases {
|
|
// Valid UTF-8 sequences
|
|
{"Hello", "Hello", COMMON_PEG_PARSE_RESULT_SUCCESS},
|
|
{std::string("Caf\xC3\xA9"), std::string("Caf\xC3\xA9"), COMMON_PEG_PARSE_RESULT_SUCCESS},
|
|
{std::string("\xE4\xBD\xA0\xE5\xA5\xBD"), std::string("\xE4\xBD\xA0\xE5\xA5\xBD"), COMMON_PEG_PARSE_RESULT_SUCCESS},
|
|
{std::string("\xF0\x9F\x9A\x80"), std::string("\xF0\x9F\x9A\x80"), COMMON_PEG_PARSE_RESULT_SUCCESS},
|
|
|
|
// Incomplete UTF-8 sequences (partial bytes at end)
|
|
{std::string("Caf\xC3"), "Caf", COMMON_PEG_PARSE_RESULT_NEED_MORE_INPUT},
|
|
{std::string("\xE4\xBD"), "", COMMON_PEG_PARSE_RESULT_NEED_MORE_INPUT},
|
|
{std::string("\xF0\x9F\x9A"), "", COMMON_PEG_PARSE_RESULT_NEED_MORE_INPUT},
|
|
|
|
// Invalid/malformed UTF-8 sequences
|
|
{std::string("\xFF\xFE"), "", COMMON_PEG_PARSE_RESULT_FAIL},
|
|
{std::string("Hello\x80World"), "Hello", COMMON_PEG_PARSE_RESULT_FAIL},
|
|
{std::string("\xC3\x28"), "", COMMON_PEG_PARSE_RESULT_FAIL},
|
|
};
|
|
|
|
auto parser = build_peg_parser([](common_peg_parser_builder& p) {
|
|
return p.sequence({p.one_or_more(p.any()), p.end()});
|
|
});
|
|
|
|
for (size_t i = 0; i < test_cases.size(); i++) {
|
|
const auto & tc = test_cases[i];
|
|
std::string test_name = "case " + std::to_string(i) + ": " + hex_dump(tc.input);
|
|
|
|
t.test(test_name, [&](testing &t) {
|
|
common_peg_parse_context ctx(tc.input, true);
|
|
auto result = parser.parse(ctx);
|
|
|
|
// Assert result type matches
|
|
assert_result_equal(t, tc.expected_result, result.type);
|
|
|
|
// Assert matched text if success or need_more_input
|
|
if (result.success() || result.need_more_input()) {
|
|
std::string matched = tc.input.substr(result.start, result.end - result.start);
|
|
t.assert_equal(tc.expected_text, matched);
|
|
}
|
|
});
|
|
}
|
|
});
|
|
|
|
t.test("char classes", [](testing &t) {
|
|
t.test("unicode range U+4E00-U+9FFF (CJK)", [](testing &t) {
|
|
std::vector<test_case> test_cases {
|
|
// Within range - CJK Unified Ideographs
|
|
{std::string("\xE4\xB8\x80"), std::string("\xE4\xB8\x80"), COMMON_PEG_PARSE_RESULT_SUCCESS}, // U+4E00
|
|
{std::string("\xE4\xBD\xA0"), std::string("\xE4\xBD\xA0"), COMMON_PEG_PARSE_RESULT_SUCCESS}, // U+4F60
|
|
{std::string("\xE5\xA5\xBD"), std::string("\xE5\xA5\xBD"), COMMON_PEG_PARSE_RESULT_SUCCESS}, // U+597D
|
|
{std::string("\xE9\xBF\xBF"), std::string("\xE9\xBF\xBF"), COMMON_PEG_PARSE_RESULT_SUCCESS}, // U+9FFF
|
|
|
|
// Outside range - should fail
|
|
{"a", "", COMMON_PEG_PARSE_RESULT_FAIL}, // ASCII
|
|
{std::string("\xE4\xB7\xBF"), "", COMMON_PEG_PARSE_RESULT_FAIL}, // U+4DFF (before range)
|
|
{std::string("\xEA\x80\x80"), "", COMMON_PEG_PARSE_RESULT_FAIL}, // U+A000 (after range)
|
|
|
|
// Incomplete sequences in range
|
|
{std::string("\xE4\xB8"), "", COMMON_PEG_PARSE_RESULT_NEED_MORE_INPUT}, // Incomplete U+4E00
|
|
{std::string("\xE5\xA5"), "", COMMON_PEG_PARSE_RESULT_NEED_MORE_INPUT}, // Incomplete U+597D
|
|
};
|
|
|
|
auto parser = build_peg_parser([](common_peg_parser_builder& p) {
|
|
return p.sequence({p.chars(R"([\u4E00-\u9FFF])"), p.end()});
|
|
});
|
|
|
|
for (size_t i = 0; i < test_cases.size(); i++) {
|
|
const auto & tc = test_cases[i];
|
|
std::string test_name = "case " + std::to_string(i) + ": " + hex_dump(tc.input);
|
|
|
|
t.test(test_name, [&](testing &t) {
|
|
common_peg_parse_context ctx(tc.input, true);
|
|
auto result = parser.parse(ctx);
|
|
|
|
// Assert result type matches
|
|
assert_result_equal(t, tc.expected_result, result.type);
|
|
|
|
// Assert matched text if success or need_more_input
|
|
if (result.success() || result.need_more_input()) {
|
|
std::string matched = tc.input.substr(result.start, result.end - result.start);
|
|
t.assert_equal(tc.expected_text, matched);
|
|
}
|
|
});
|
|
}
|
|
});
|
|
|
|
t.test("unicode range U+1F600-U+1F64F (emoticons)", [](testing &t) {
|
|
std::vector<test_case> test_cases {
|
|
// Within range - Emoticons (all 4-byte UTF-8)
|
|
{std::string("\xF0\x9F\x98\x80"), std::string("\xF0\x9F\x98\x80"), COMMON_PEG_PARSE_RESULT_SUCCESS}, // U+1F600
|
|
{std::string("\xF0\x9F\x98\x81"), std::string("\xF0\x9F\x98\x81"), COMMON_PEG_PARSE_RESULT_SUCCESS}, // U+1F601
|
|
{std::string("\xF0\x9F\x99\x8F"), std::string("\xF0\x9F\x99\x8F"), COMMON_PEG_PARSE_RESULT_SUCCESS}, // U+1F64F
|
|
|
|
// Outside range
|
|
{std::string("\xF0\x9F\x97\xBF"), "", COMMON_PEG_PARSE_RESULT_FAIL}, // U+1F5FF (before range)
|
|
{std::string("\xF0\x9F\x99\x90"), "", COMMON_PEG_PARSE_RESULT_FAIL}, // U+1F650 (after range)
|
|
{std::string("\xF0\x9F\x9A\x80"), "", COMMON_PEG_PARSE_RESULT_FAIL}, // U+1F680 (outside range)
|
|
|
|
// Incomplete sequences
|
|
{std::string("\xF0\x9F\x98"), "", COMMON_PEG_PARSE_RESULT_NEED_MORE_INPUT}, // Incomplete emoji
|
|
{std::string("\xF0\x9F"), "", COMMON_PEG_PARSE_RESULT_NEED_MORE_INPUT}, // Very incomplete
|
|
};
|
|
|
|
auto parser = build_peg_parser([](common_peg_parser_builder& p) {
|
|
return p.sequence({p.chars(R"([\U0001F600-\U0001F64F])"), p.end()});
|
|
});
|
|
|
|
for (size_t i = 0; i < test_cases.size(); i++) {
|
|
const auto & tc = test_cases[i];
|
|
std::string test_name = "case " + std::to_string(i) + ": " + hex_dump(tc.input);
|
|
|
|
t.test(test_name, [&](testing &t) {
|
|
common_peg_parse_context ctx(tc.input, true);
|
|
auto result = parser.parse(ctx);
|
|
|
|
// Assert result type matches
|
|
assert_result_equal(t, tc.expected_result, result.type);
|
|
|
|
// Assert matched text if success or need_more_input
|
|
if (result.success() || result.need_more_input()) {
|
|
std::string matched = tc.input.substr(result.start, result.end - result.start);
|
|
t.assert_equal(tc.expected_text, matched);
|
|
}
|
|
});
|
|
}
|
|
});
|
|
|
|
t.test("mixed unicode ranges", [](testing &t) {
|
|
std::vector<test_case> test_cases {
|
|
// Match CJK
|
|
{std::string("\xE4\xB8\x80"), std::string("\xE4\xB8\x80"), COMMON_PEG_PARSE_RESULT_SUCCESS}, // U+4E00
|
|
{std::string("\xE4\xBD\xA0"), std::string("\xE4\xBD\xA0"), COMMON_PEG_PARSE_RESULT_SUCCESS}, // U+4F60
|
|
|
|
// Match emoticons
|
|
{std::string("\xF0\x9F\x98\x80"), std::string("\xF0\x9F\x98\x80"), COMMON_PEG_PARSE_RESULT_SUCCESS}, // U+1F600
|
|
|
|
// Match ASCII digits
|
|
{"5", "5", COMMON_PEG_PARSE_RESULT_SUCCESS},
|
|
|
|
// Don't match outside any range
|
|
{"a", "", COMMON_PEG_PARSE_RESULT_FAIL},
|
|
{std::string("\xF0\x9F\x9A\x80"), "", COMMON_PEG_PARSE_RESULT_FAIL}, // U+1F680
|
|
|
|
// Incomplete
|
|
{std::string("\xE4\xB8"), "", COMMON_PEG_PARSE_RESULT_NEED_MORE_INPUT},
|
|
{std::string("\xF0\x9F\x98"), "", COMMON_PEG_PARSE_RESULT_NEED_MORE_INPUT},
|
|
};
|
|
|
|
auto parser = build_peg_parser([](common_peg_parser_builder& p) {
|
|
return p.sequence({p.chars(R"([\u4E00-\u9FFF\U0001F600-\U0001F64F0-9])"), p.end()});
|
|
});
|
|
|
|
for (size_t i = 0; i < test_cases.size(); i++) {
|
|
const auto & tc = test_cases[i];
|
|
std::string test_name = "case " + std::to_string(i) + ": " + hex_dump(tc.input);
|
|
|
|
t.test(test_name, [&](testing &t) {
|
|
common_peg_parse_context ctx(tc.input, true);
|
|
auto result = parser.parse(ctx);
|
|
|
|
// Assert result type matches
|
|
assert_result_equal(t, tc.expected_result, result.type);
|
|
|
|
// Assert matched text if success or need_more_input
|
|
if (result.success() || result.need_more_input()) {
|
|
std::string matched = tc.input.substr(result.start, result.end - result.start);
|
|
t.assert_equal(tc.expected_text, matched);
|
|
}
|
|
});
|
|
}
|
|
});
|
|
});
|
|
|
|
t.test("until parser", [](testing &t) {
|
|
t.test("ASCII delimiter with Unicode content", [](testing &t) {
|
|
std::vector<test_case> test_cases {
|
|
// CJK characters before delimiter
|
|
{std::string("\xE4\xBD\xA0\xE5\xA5\xBD</tag>"), std::string("\xE4\xBD\xA0\xE5\xA5\xBD"), COMMON_PEG_PARSE_RESULT_SUCCESS},
|
|
|
|
// Emoji before delimiter
|
|
{std::string("\xF0\x9F\x98\x80</tag>"), std::string("\xF0\x9F\x98\x80"), COMMON_PEG_PARSE_RESULT_SUCCESS},
|
|
|
|
// Mixed content
|
|
{std::string("Hello \xE4\xB8\x96\xE7\x95\x8C!</tag>"), std::string("Hello \xE4\xB8\x96\xE7\x95\x8C!"), COMMON_PEG_PARSE_RESULT_SUCCESS},
|
|
};
|
|
|
|
auto parser = build_peg_parser([](common_peg_parser_builder& p) {
|
|
return p.until("</tag>");
|
|
});
|
|
|
|
for (size_t i = 0; i < test_cases.size(); i++) {
|
|
const auto & tc = test_cases[i];
|
|
std::string test_name = "case " + std::to_string(i) + ": " + hex_dump(tc.input);
|
|
|
|
t.test(test_name, [&](testing &t) {
|
|
common_peg_parse_context ctx(tc.input, false);
|
|
auto result = parser.parse(ctx);
|
|
|
|
assert_result_equal(t, tc.expected_result, result.type);
|
|
|
|
if (result.success()) {
|
|
std::string matched = tc.input.substr(result.start, result.end - result.start);
|
|
t.assert_equal(tc.expected_text, matched);
|
|
}
|
|
});
|
|
}
|
|
});
|
|
|
|
t.test("incomplete UTF-8 at end", [](testing &t) {
|
|
std::vector<test_case> test_cases {
|
|
// Incomplete emoji at end, no delimiter
|
|
{std::string("content\xF0\x9F\x98"), std::string("content"), COMMON_PEG_PARSE_RESULT_NEED_MORE_INPUT},
|
|
|
|
// Incomplete CJK at end, no delimiter
|
|
{std::string("hello\xE4\xB8"), std::string("hello"), COMMON_PEG_PARSE_RESULT_NEED_MORE_INPUT},
|
|
|
|
// Complete content, no delimiter (should consume all valid UTF-8)
|
|
{std::string("\xE4\xBD\xA0\xE5\xA5\xBD"), std::string("\xE4\xBD\xA0\xE5\xA5\xBD"), COMMON_PEG_PARSE_RESULT_NEED_MORE_INPUT},
|
|
};
|
|
|
|
auto parser = build_peg_parser([](common_peg_parser_builder& p) {
|
|
return p.until("</tag>");
|
|
});
|
|
|
|
for (size_t i = 0; i < test_cases.size(); i++) {
|
|
const auto & tc = test_cases[i];
|
|
std::string test_name = "case " + std::to_string(i) + ": " + hex_dump(tc.input);
|
|
|
|
t.test(test_name, [&](testing &t) {
|
|
common_peg_parse_context ctx(tc.input, true);
|
|
auto result = parser.parse(ctx);
|
|
|
|
assert_result_equal(t, tc.expected_result, result.type);
|
|
|
|
if (result.success() || result.need_more_input()) {
|
|
std::string matched = tc.input.substr(result.start, result.end - result.start);
|
|
t.assert_equal(tc.expected_text, matched);
|
|
}
|
|
});
|
|
}
|
|
});
|
|
|
|
t.test("malformed UTF-8", [](testing &t) {
|
|
std::vector<test_case> test_cases {
|
|
// Invalid UTF-8 bytes
|
|
{std::string("Hello\xFF\xFE"), "", COMMON_PEG_PARSE_RESULT_FAIL},
|
|
|
|
// Continuation byte without lead byte
|
|
{std::string("Hello\x80World"), "", COMMON_PEG_PARSE_RESULT_FAIL},
|
|
|
|
// Invalid continuation byte
|
|
{std::string("\xC3\x28"), "", COMMON_PEG_PARSE_RESULT_FAIL},
|
|
};
|
|
|
|
auto parser = build_peg_parser([](common_peg_parser_builder& p) {
|
|
return p.until("</tag>");
|
|
});
|
|
|
|
for (size_t i = 0; i < test_cases.size(); i++) {
|
|
const auto & tc = test_cases[i];
|
|
std::string test_name = "case " + std::to_string(i) + ": " + hex_dump(tc.input);
|
|
|
|
t.test(test_name, [&](testing &t) {
|
|
common_peg_parse_context ctx(tc.input, false);
|
|
auto result = parser.parse(ctx);
|
|
|
|
assert_result_equal(t, tc.expected_result, result.type);
|
|
});
|
|
}
|
|
});
|
|
});
|
|
|
|
t.test("json_string parser", [](testing &t) {
|
|
t.test("valid UTF-8 characters", [](testing &t) {
|
|
std::vector<test_case> test_cases {
|
|
// ASCII only
|
|
{"Hello World\"", "Hello World", COMMON_PEG_PARSE_RESULT_SUCCESS},
|
|
|
|
// 2-byte UTF-8 (accented characters)
|
|
{std::string("Caf\xC3\xA9\""), std::string("Caf\xC3\xA9"), COMMON_PEG_PARSE_RESULT_SUCCESS},
|
|
|
|
// 3-byte UTF-8 (CJK)
|
|
{std::string("\xE4\xBD\xA0\xE5\xA5\xBD\""), std::string("\xE4\xBD\xA0\xE5\xA5\xBD"), COMMON_PEG_PARSE_RESULT_SUCCESS},
|
|
|
|
// 4-byte UTF-8 (emoji)
|
|
{std::string("\xF0\x9F\x98\x80\""), std::string("\xF0\x9F\x98\x80"), COMMON_PEG_PARSE_RESULT_SUCCESS},
|
|
|
|
// Mixed content
|
|
{std::string("Hello \xE4\xB8\x96\xE7\x95\x8C!\""), std::string("Hello \xE4\xB8\x96\xE7\x95\x8C!"), COMMON_PEG_PARSE_RESULT_SUCCESS},
|
|
};
|
|
|
|
for (size_t i = 0; i < test_cases.size(); i++) {
|
|
const auto & tc = test_cases[i];
|
|
std::string test_name = "case " + std::to_string(i) + ": " + hex_dump(tc.input);
|
|
|
|
t.test(test_name, [&](testing &t) {
|
|
auto parser = build_peg_parser([](common_peg_parser_builder& p) {
|
|
return p.sequence({p.json_string_content(), p.literal("\"")});
|
|
});
|
|
|
|
common_peg_parse_context ctx(tc.input, false);
|
|
auto result = parser.parse(ctx);
|
|
|
|
assert_result_equal(t, tc.expected_result, result.type);
|
|
|
|
if (result.success()) {
|
|
std::string matched = tc.input.substr(result.start, result.end - result.start - 1); // -1 to exclude closing quote
|
|
t.assert_equal(tc.expected_text, matched);
|
|
}
|
|
});
|
|
}
|
|
});
|
|
|
|
t.test("incomplete UTF-8", [](testing &t) {
|
|
std::vector<test_case> test_cases {
|
|
// Incomplete 2-byte sequence
|
|
{std::string("Caf\xC3"), std::string("Caf"), COMMON_PEG_PARSE_RESULT_NEED_MORE_INPUT},
|
|
|
|
// Incomplete 3-byte sequence
|
|
{std::string("Hello\xE4\xB8"), std::string("Hello"), COMMON_PEG_PARSE_RESULT_NEED_MORE_INPUT},
|
|
|
|
// Incomplete 4-byte sequence
|
|
{std::string("Text\xF0\x9F\x98"), std::string("Text"), COMMON_PEG_PARSE_RESULT_NEED_MORE_INPUT},
|
|
|
|
// Incomplete at very start
|
|
{std::string("\xE4\xBD"), std::string(""), COMMON_PEG_PARSE_RESULT_NEED_MORE_INPUT},
|
|
};
|
|
|
|
for (size_t i = 0; i < test_cases.size(); i++) {
|
|
const auto & tc = test_cases[i];
|
|
std::string test_name = "case " + std::to_string(i) + ": " + hex_dump(tc.input);
|
|
|
|
t.test(test_name, [&](testing &t) {
|
|
auto parser = build_peg_parser([](common_peg_parser_builder& p) {
|
|
return p.json_string_content();
|
|
});
|
|
|
|
common_peg_parse_context ctx(tc.input, true);
|
|
auto result = parser.parse(ctx);
|
|
|
|
assert_result_equal(t, tc.expected_result, result.type);
|
|
|
|
if (result.need_more_input()) {
|
|
std::string matched = tc.input.substr(result.start, result.end - result.start);
|
|
t.assert_equal(tc.expected_text, matched);
|
|
}
|
|
});
|
|
}
|
|
});
|
|
|
|
t.test("malformed UTF-8", [](testing &t) {
|
|
std::vector<test_case> test_cases {
|
|
// Invalid UTF-8 bytes
|
|
{std::string("Hello\xFF\xFE"), "", COMMON_PEG_PARSE_RESULT_FAIL},
|
|
|
|
// Continuation byte without lead byte
|
|
{std::string("Hello\x80World"), "", COMMON_PEG_PARSE_RESULT_FAIL},
|
|
|
|
// Invalid continuation byte
|
|
{std::string("\xC3\x28"), "", COMMON_PEG_PARSE_RESULT_FAIL},
|
|
|
|
// Overlong encoding (security issue)
|
|
{std::string("\xC0\x80"), "", COMMON_PEG_PARSE_RESULT_FAIL},
|
|
};
|
|
|
|
for (size_t i = 0; i < test_cases.size(); i++) {
|
|
const auto & tc = test_cases[i];
|
|
std::string test_name = "case " + std::to_string(i) + ": " + hex_dump(tc.input);
|
|
|
|
t.test(test_name, [&](testing &t) {
|
|
auto parser = build_peg_parser([](common_peg_parser_builder& p) {
|
|
return p.json_string_content();
|
|
});
|
|
|
|
common_peg_parse_context ctx(tc.input, false);
|
|
auto result = parser.parse(ctx);
|
|
|
|
assert_result_equal(t, tc.expected_result, result.type);
|
|
});
|
|
}
|
|
});
|
|
|
|
t.test("escape sequences with UTF-8", [](testing &t) {
|
|
std::vector<test_case> test_cases {
|
|
// Unicode escape sequence
|
|
{"Hello\\u0041\"", "Hello\\u0041", COMMON_PEG_PARSE_RESULT_SUCCESS},
|
|
|
|
// Mix of UTF-8 and escape sequences
|
|
{std::string("\xE4\xBD\xA0\\n\xE5\xA5\xBD\""), std::string("\xE4\xBD\xA0\\n\xE5\xA5\xBD"), COMMON_PEG_PARSE_RESULT_SUCCESS},
|
|
|
|
// Escaped quote in UTF-8 string
|
|
{std::string("\xE4\xBD\xA0\\\"\xE5\xA5\xBD\""), std::string("\xE4\xBD\xA0\\\"\xE5\xA5\xBD"), COMMON_PEG_PARSE_RESULT_SUCCESS},
|
|
};
|
|
|
|
for (size_t i = 0; i < test_cases.size(); i++) {
|
|
const auto & tc = test_cases[i];
|
|
std::string test_name = "case " + std::to_string(i) + ": " + hex_dump(tc.input);
|
|
|
|
t.test(test_name, [&](testing &t) {
|
|
auto parser = build_peg_parser([](common_peg_parser_builder& p) {
|
|
return p.sequence({p.json_string_content(), p.literal("\"")});
|
|
});
|
|
|
|
common_peg_parse_context ctx(tc.input, false);
|
|
auto result = parser.parse(ctx);
|
|
|
|
assert_result_equal(t, tc.expected_result, result.type);
|
|
|
|
if (result.success()) {
|
|
std::string matched = tc.input.substr(result.start, result.end - result.start - 1); // -1 to exclude closing quote
|
|
t.assert_equal(tc.expected_text, matched);
|
|
}
|
|
});
|
|
}
|
|
});
|
|
});
|
|
}
|