|
#ifdef NDEBUG |
|
#undef NDEBUG |
|
#endif |
|
|
|
#include "llama.h" |
|
#include "llama-grammar.h" |
|
|
|
#include <cassert> |
|
|
|
static const char * type_str(llama_gretype type) { |
|
switch (type) { |
|
case LLAMA_GRETYPE_CHAR: return "LLAMA_GRETYPE_CHAR"; |
|
case LLAMA_GRETYPE_CHAR_NOT: return "LLAMA_GRETYPE_CHAR_NOT"; |
|
case LLAMA_GRETYPE_CHAR_ALT: return "LLAMA_GRETYPE_CHAR_ALT"; |
|
case LLAMA_GRETYPE_CHAR_RNG_UPPER: return "LLAMA_GRETYPE_CHAR_RNG_UPPER"; |
|
case LLAMA_GRETYPE_RULE_REF: return "LLAMA_GRETYPE_RULE_REF"; |
|
case LLAMA_GRETYPE_ALT: return "LLAMA_GRETYPE_ALT"; |
|
case LLAMA_GRETYPE_END: return "LLAMA_GRETYPE_END"; |
|
default: return "?"; |
|
} |
|
} |
|
|
|
static void verify_parsing(const char *grammar_bytes, const std::vector<std::pair<std::string, uint32_t>> expected, const std::vector<llama_grammar_element> &expected_rules) { |
|
uint32_t index = 0; |
|
llama_grammar_parser parsed_grammar; |
|
parsed_grammar.parse(grammar_bytes); |
|
|
|
std::map<uint32_t, std::string> symbol_names; |
|
for (auto it = parsed_grammar.symbol_ids.begin(); it != parsed_grammar.symbol_ids.end(); ++it) { |
|
symbol_names[it->second] = it->first; |
|
} |
|
|
|
auto print_all = [&]() { |
|
fprintf(stderr, " verify_parsing(R\"\"\"(%s)\"\"\", {\n", grammar_bytes); |
|
for (auto it = parsed_grammar.symbol_ids.begin(); it != parsed_grammar.symbol_ids.end(); ++it) { |
|
fprintf(stderr, " {\"%s\", %u},\n", it->first.c_str(), it->second); |
|
} |
|
fprintf(stderr, " }, {\n"); |
|
for (size_t i_rule = 0; i_rule < parsed_grammar.rules.size(); i_rule++) { |
|
fprintf(stderr, " // %s (index %zu)\n", symbol_names[i_rule].c_str(), i_rule); |
|
auto & rule = parsed_grammar.rules[i_rule]; |
|
for (uint32_t i = 0; i < rule.size(); i++) { |
|
std::string rule_str; |
|
fprintf(stderr, " {%s, ", type_str(rule[i].type)); |
|
if (rule[i].type == LLAMA_GRETYPE_CHAR || rule[i].type == LLAMA_GRETYPE_CHAR_ALT || |
|
rule[i].type == LLAMA_GRETYPE_CHAR_NOT || rule[i].type == LLAMA_GRETYPE_CHAR_RNG_UPPER) { |
|
char c = rule[i].value; |
|
if (c == '\n') { |
|
fprintf(stderr, "'\\n'"); |
|
} else if (c == '\t') { |
|
fprintf(stderr, "'\\t'"); |
|
} else if (c == '\r') { |
|
fprintf(stderr, "'\\r'"); |
|
} else if (c == '\0') { |
|
fprintf(stderr, "'\\0'"); |
|
} else { |
|
fprintf(stderr, "'%c'", c); |
|
} |
|
} else if (rule[i].type == LLAMA_GRETYPE_RULE_REF) { |
|
fprintf(stderr, "/* %s */ %u", symbol_names[rule[i].value].c_str(), rule[i].value); |
|
} else { |
|
fprintf(stderr, "%u", rule[i].value); |
|
} |
|
fprintf(stderr, "},\n"); |
|
} |
|
} |
|
fprintf(stderr, " });\n"); |
|
}; |
|
|
|
if (getenv("TEST_GRAMMAR_PARSER_PRINT_ALL")) { |
|
print_all(); |
|
fprintf(stderr, "\n"); |
|
return; |
|
} |
|
|
|
fprintf(stderr, "Testing grammar:%s\n", grammar_bytes); |
|
|
|
if (parsed_grammar.symbol_ids.size() != expected.size()) { |
|
fprintf(stderr, "Code to update expectation (set TEST_GRAMMAR_PARSER_PRINT_ALL=1 to print all):\n"); |
|
print_all(); |
|
assert(parsed_grammar.symbol_ids.size() == expected.size()); |
|
} |
|
|
|
for (auto it = parsed_grammar.symbol_ids.begin(); it != parsed_grammar.symbol_ids.end(); ++it) |
|
{ |
|
std::string key = it->first; |
|
uint32_t value = it->second; |
|
std::pair<std::string, uint32_t> expected_pair = expected[index]; |
|
|
|
|
|
if (expected_pair.first != key || expected_pair.second != value) |
|
{ |
|
fprintf(stderr, "index: %u\n", index); |
|
fprintf(stderr, "expected_pair: %s, %u\n", expected_pair.first.c_str(), expected_pair.second); |
|
fprintf(stderr, "actual_pair: %s, %u\n", key.c_str(), value); |
|
fprintf(stderr, "expected_pair != actual_pair\n"); |
|
fprintf(stderr, "Code to update expectation (set TEST_GRAMMAR_PARSER_PRINT_ALL=1 to print all):\n"); |
|
print_all(); |
|
} |
|
|
|
assert(expected_pair.first == key && expected_pair.second == value); |
|
|
|
index++; |
|
} |
|
|
|
index = 0; |
|
for (auto rule : parsed_grammar.rules) |
|
{ |
|
|
|
for (uint32_t i = 0; i < rule.size(); i++) |
|
{ |
|
llama_grammar_element element = rule[i]; |
|
llama_grammar_element expected_element = expected_rules[index]; |
|
|
|
|
|
if (expected_element.type != element.type || expected_element.value != element.value) |
|
{ |
|
fprintf(stderr, "index: %u\n", index); |
|
fprintf(stderr, "expected_element: %s, %u\n", type_str(expected_element.type), expected_element.value); |
|
fprintf(stderr, "actual_element: %s, %u\n", type_str(element.type), element.value); |
|
fprintf(stderr, "expected_element != actual_element\n"); |
|
fprintf(stderr, "all elements:\n"); |
|
fprintf(stderr, "Code to update expectation (set TEST_GRAMMAR_PARSER_PRINT_ALL=1 to print all):\n"); |
|
print_all(); |
|
} |
|
|
|
assert(expected_element.type == element.type && expected_element.value == element.value); |
|
index++; |
|
} |
|
} |
|
} |
|
|
|
static void verify_failure(const char * grammar_bytes) { |
|
fprintf(stderr, "Testing expected failure:%s\n", grammar_bytes); |
|
llama_grammar_parser result; |
|
result.parse(grammar_bytes); |
|
assert(result.rules.empty() && "should have failed"); |
|
} |
|
|
|
int main() |
|
{ |
|
verify_failure(R"""( |
|
root ::= "a"{,}" |
|
)"""); |
|
|
|
verify_failure(R"""( |
|
root ::= "a"{,10}" |
|
)"""); |
|
|
|
verify_parsing(R"""( |
|
root ::= "a" |
|
)""", { |
|
{"root", 0}, |
|
}, { |
|
|
|
{LLAMA_GRETYPE_CHAR, 'a'}, |
|
{LLAMA_GRETYPE_END, 0}, |
|
}); |
|
|
|
verify_parsing(R"""( |
|
root ::= "a" | [bdx-z] | [^1-3] |
|
)""", { |
|
{"root", 0}, |
|
}, { |
|
|
|
{LLAMA_GRETYPE_CHAR, 'a'}, |
|
{LLAMA_GRETYPE_ALT, 0}, |
|
{LLAMA_GRETYPE_CHAR, 'b'}, |
|
{LLAMA_GRETYPE_CHAR_ALT, 'd'}, |
|
{LLAMA_GRETYPE_CHAR_ALT, 'x'}, |
|
{LLAMA_GRETYPE_CHAR_RNG_UPPER, 'z'}, |
|
{LLAMA_GRETYPE_ALT, 0}, |
|
{LLAMA_GRETYPE_CHAR_NOT, '1'}, |
|
{LLAMA_GRETYPE_CHAR_RNG_UPPER, '3'}, |
|
{LLAMA_GRETYPE_END, 0}, |
|
}); |
|
|
|
verify_parsing(R"""( |
|
root ::= a+ |
|
a ::= "a" |
|
)""", { |
|
{"a", 1}, |
|
{"root", 0}, |
|
{"root_2", 2}, |
|
}, { |
|
|
|
{LLAMA_GRETYPE_RULE_REF, 1}, |
|
{LLAMA_GRETYPE_RULE_REF, 2}, |
|
{LLAMA_GRETYPE_END, 0}, |
|
|
|
{LLAMA_GRETYPE_CHAR, 'a'}, |
|
{LLAMA_GRETYPE_END, 0}, |
|
|
|
{LLAMA_GRETYPE_RULE_REF, 1}, |
|
{LLAMA_GRETYPE_RULE_REF, 2}, |
|
{LLAMA_GRETYPE_ALT, 0}, |
|
{LLAMA_GRETYPE_END, 0}, |
|
}); |
|
|
|
verify_parsing(R"""( |
|
root ::= "a"+ |
|
)""", { |
|
{"root", 0}, |
|
{"root_1", 1}, |
|
}, { |
|
|
|
{LLAMA_GRETYPE_CHAR, 'a'}, |
|
{LLAMA_GRETYPE_RULE_REF, 1}, |
|
{LLAMA_GRETYPE_END, 0}, |
|
|
|
{LLAMA_GRETYPE_CHAR, 'a'}, |
|
{LLAMA_GRETYPE_RULE_REF, 1}, |
|
{LLAMA_GRETYPE_ALT, 0}, |
|
{LLAMA_GRETYPE_END, 0}, |
|
}); |
|
|
|
verify_parsing(R"""( |
|
root ::= a? |
|
a ::= "a" |
|
)""", { |
|
{"a", 1}, |
|
{"root", 0}, |
|
{"root_2", 2}, |
|
}, { |
|
|
|
{LLAMA_GRETYPE_RULE_REF, 2}, |
|
{LLAMA_GRETYPE_END, 0}, |
|
|
|
{LLAMA_GRETYPE_CHAR, 'a'}, |
|
{LLAMA_GRETYPE_END, 0}, |
|
|
|
{LLAMA_GRETYPE_RULE_REF, 1}, |
|
{LLAMA_GRETYPE_ALT, 0}, |
|
{LLAMA_GRETYPE_END, 0}, |
|
}); |
|
|
|
verify_parsing(R"""( |
|
root ::= "a"? |
|
)""", { |
|
{"root", 0}, |
|
{"root_1", 1}, |
|
}, { |
|
|
|
{LLAMA_GRETYPE_RULE_REF, 1}, |
|
{LLAMA_GRETYPE_END, 0}, |
|
|
|
{LLAMA_GRETYPE_CHAR, 'a'}, |
|
{LLAMA_GRETYPE_ALT, 0}, |
|
{LLAMA_GRETYPE_END, 0}, |
|
}); |
|
|
|
verify_parsing(R"""( |
|
root ::= a* |
|
a ::= "a" |
|
)""", { |
|
{"a", 1}, |
|
{"root", 0}, |
|
{"root_2", 2}, |
|
}, { |
|
|
|
{LLAMA_GRETYPE_RULE_REF, 2}, |
|
{LLAMA_GRETYPE_END, 0}, |
|
|
|
{LLAMA_GRETYPE_CHAR, 'a'}, |
|
{LLAMA_GRETYPE_END, 0}, |
|
|
|
{LLAMA_GRETYPE_RULE_REF, 1}, |
|
{LLAMA_GRETYPE_RULE_REF, 2}, |
|
{LLAMA_GRETYPE_ALT, 0}, |
|
{LLAMA_GRETYPE_END, 0}, |
|
}); |
|
|
|
verify_parsing(R"""( |
|
root ::= "a"* |
|
)""", { |
|
{"root", 0}, |
|
{"root_1", 1}, |
|
}, { |
|
|
|
{LLAMA_GRETYPE_RULE_REF, 1}, |
|
{LLAMA_GRETYPE_END, 0}, |
|
|
|
{LLAMA_GRETYPE_CHAR, 'a'}, |
|
{LLAMA_GRETYPE_RULE_REF, 1}, |
|
{LLAMA_GRETYPE_ALT, 0}, |
|
{LLAMA_GRETYPE_END, 0}, |
|
}); |
|
|
|
verify_parsing(R"""( |
|
root ::= "a"{2} |
|
)""", { |
|
{"root", 0}, |
|
}, { |
|
|
|
{LLAMA_GRETYPE_CHAR, 'a'}, |
|
{LLAMA_GRETYPE_CHAR, 'a'}, |
|
{LLAMA_GRETYPE_END, 0}, |
|
}); |
|
|
|
verify_parsing(R"""( |
|
root ::= "a"{2,} |
|
)""", { |
|
{"root", 0}, |
|
{"root_1", 1}, |
|
}, { |
|
|
|
{LLAMA_GRETYPE_CHAR, 'a'}, |
|
{LLAMA_GRETYPE_CHAR, 'a'}, |
|
{LLAMA_GRETYPE_RULE_REF, 1}, |
|
{LLAMA_GRETYPE_END, 0}, |
|
|
|
{LLAMA_GRETYPE_CHAR, 'a'}, |
|
{LLAMA_GRETYPE_RULE_REF, 1}, |
|
{LLAMA_GRETYPE_ALT, 0}, |
|
{LLAMA_GRETYPE_END, 0}, |
|
}); |
|
|
|
verify_parsing(R"""( |
|
root ::= "a"{ 4} |
|
)""", { |
|
{"root", 0}, |
|
}, { |
|
|
|
{LLAMA_GRETYPE_CHAR, 'a'}, |
|
{LLAMA_GRETYPE_CHAR, 'a'}, |
|
{LLAMA_GRETYPE_CHAR, 'a'}, |
|
{LLAMA_GRETYPE_CHAR, 'a'}, |
|
{LLAMA_GRETYPE_END, 0}, |
|
}); |
|
|
|
verify_parsing(R"""( |
|
root ::= "a"{2,4} |
|
)""", { |
|
{"root", 0}, |
|
{"root_1", 1}, |
|
{"root_2", 2}, |
|
}, { |
|
|
|
{LLAMA_GRETYPE_CHAR, 'a'}, |
|
{LLAMA_GRETYPE_CHAR, 'a'}, |
|
{LLAMA_GRETYPE_RULE_REF, 2}, |
|
{LLAMA_GRETYPE_END, 0}, |
|
|
|
{LLAMA_GRETYPE_CHAR, 'a'}, |
|
{LLAMA_GRETYPE_ALT, 0}, |
|
{LLAMA_GRETYPE_END, 0}, |
|
|
|
{LLAMA_GRETYPE_CHAR, 'a'}, |
|
{LLAMA_GRETYPE_RULE_REF, 1}, |
|
{LLAMA_GRETYPE_ALT, 0}, |
|
{LLAMA_GRETYPE_END, 0}, |
|
}); |
|
|
|
verify_parsing(R"""( |
|
root ::= (expr "=" term "\n")+ |
|
expr ::= term ([-+*/] term)* |
|
term ::= [0-9]+ |
|
)""", { |
|
{"expr", 2}, |
|
{"expr_5", 5}, |
|
{"expr_6", 6}, |
|
{"root", 0}, |
|
{"root_1", 1}, |
|
{"root_4", 4}, |
|
{"term", 3}, |
|
{"term_7", 7}, |
|
}, { |
|
|
|
{LLAMA_GRETYPE_RULE_REF, 1}, |
|
{LLAMA_GRETYPE_RULE_REF, 4}, |
|
{LLAMA_GRETYPE_END, 0}, |
|
|
|
{LLAMA_GRETYPE_RULE_REF, 2}, |
|
{LLAMA_GRETYPE_CHAR, '='}, |
|
{LLAMA_GRETYPE_RULE_REF, 3}, |
|
{LLAMA_GRETYPE_CHAR, '\n'}, |
|
{LLAMA_GRETYPE_END, 0}, |
|
|
|
{LLAMA_GRETYPE_RULE_REF, 3}, |
|
{LLAMA_GRETYPE_RULE_REF, 6}, |
|
{LLAMA_GRETYPE_END, 0}, |
|
|
|
{LLAMA_GRETYPE_CHAR, '0'}, |
|
{LLAMA_GRETYPE_CHAR_RNG_UPPER, '9'}, |
|
{LLAMA_GRETYPE_RULE_REF, 7}, |
|
{LLAMA_GRETYPE_END, 0}, |
|
|
|
{LLAMA_GRETYPE_RULE_REF, 1}, |
|
{LLAMA_GRETYPE_RULE_REF, 4}, |
|
{LLAMA_GRETYPE_ALT, 0}, |
|
{LLAMA_GRETYPE_END, 0}, |
|
|
|
{LLAMA_GRETYPE_CHAR, '-'}, |
|
{LLAMA_GRETYPE_CHAR_ALT, '+'}, |
|
{LLAMA_GRETYPE_CHAR_ALT, '*'}, |
|
{LLAMA_GRETYPE_CHAR_ALT, '/'}, |
|
{LLAMA_GRETYPE_RULE_REF, 3}, |
|
{LLAMA_GRETYPE_END, 0}, |
|
|
|
{LLAMA_GRETYPE_RULE_REF, 5}, |
|
{LLAMA_GRETYPE_RULE_REF, 6}, |
|
{LLAMA_GRETYPE_ALT, 0}, |
|
{LLAMA_GRETYPE_END, 0}, |
|
|
|
{LLAMA_GRETYPE_CHAR, '0'}, |
|
{LLAMA_GRETYPE_CHAR_RNG_UPPER, '9'}, |
|
{LLAMA_GRETYPE_RULE_REF, 7}, |
|
{LLAMA_GRETYPE_ALT, 0}, |
|
{LLAMA_GRETYPE_END, 0}, |
|
}); |
|
|
|
verify_parsing(R"""( |
|
root ::= (expr "=" ws term "\n")+ |
|
expr ::= term ([-+*/] term)* |
|
term ::= ident | num | "(" ws expr ")" ws |
|
ident ::= [a-z] [a-z0-9_]* ws |
|
num ::= [0-9]+ ws |
|
ws ::= [ \t\n]* |
|
)""", { |
|
{"expr", 2}, |
|
{"expr_6", 6}, |
|
{"expr_7", 7}, |
|
{"ident", 8}, |
|
{"ident_10", 10}, |
|
{"num", 9}, |
|
{"num_11", 11}, |
|
{"root", 0}, |
|
{"root_1", 1}, |
|
{"root_5", 5}, |
|
{"term", 4}, |
|
{"ws", 3}, |
|
{"ws_12", 12}, |
|
}, { |
|
|
|
{LLAMA_GRETYPE_RULE_REF, 1}, |
|
{LLAMA_GRETYPE_RULE_REF, 5}, |
|
{LLAMA_GRETYPE_END, 0}, |
|
|
|
{LLAMA_GRETYPE_RULE_REF, 2}, |
|
{LLAMA_GRETYPE_CHAR, '='}, |
|
{LLAMA_GRETYPE_RULE_REF, 3}, |
|
{LLAMA_GRETYPE_RULE_REF, 4}, |
|
{LLAMA_GRETYPE_CHAR, '\n'}, |
|
{LLAMA_GRETYPE_END, 0}, |
|
|
|
{LLAMA_GRETYPE_RULE_REF, 4}, |
|
{LLAMA_GRETYPE_RULE_REF, 7}, |
|
{LLAMA_GRETYPE_END, 0}, |
|
|
|
{LLAMA_GRETYPE_RULE_REF, 12}, |
|
{LLAMA_GRETYPE_END, 0}, |
|
|
|
{LLAMA_GRETYPE_RULE_REF, 8}, |
|
{LLAMA_GRETYPE_ALT, 0}, |
|
{LLAMA_GRETYPE_RULE_REF, 9}, |
|
{LLAMA_GRETYPE_ALT, 0}, |
|
{LLAMA_GRETYPE_CHAR, '('}, |
|
{LLAMA_GRETYPE_RULE_REF, 3}, |
|
{LLAMA_GRETYPE_RULE_REF, 2}, |
|
{LLAMA_GRETYPE_CHAR, ')'}, |
|
{LLAMA_GRETYPE_RULE_REF, 3}, |
|
{LLAMA_GRETYPE_END, 0}, |
|
|
|
{LLAMA_GRETYPE_RULE_REF, 1}, |
|
{LLAMA_GRETYPE_RULE_REF, 5}, |
|
{LLAMA_GRETYPE_ALT, 0}, |
|
{LLAMA_GRETYPE_END, 0}, |
|
|
|
{LLAMA_GRETYPE_CHAR, '-'}, |
|
{LLAMA_GRETYPE_CHAR_ALT, '+'}, |
|
{LLAMA_GRETYPE_CHAR_ALT, '*'}, |
|
{LLAMA_GRETYPE_CHAR_ALT, '/'}, |
|
{LLAMA_GRETYPE_RULE_REF, 4}, |
|
{LLAMA_GRETYPE_END, 0}, |
|
|
|
{LLAMA_GRETYPE_RULE_REF, 6}, |
|
{LLAMA_GRETYPE_RULE_REF, 7}, |
|
{LLAMA_GRETYPE_ALT, 0}, |
|
{LLAMA_GRETYPE_END, 0}, |
|
|
|
{LLAMA_GRETYPE_CHAR, 'a'}, |
|
{LLAMA_GRETYPE_CHAR_RNG_UPPER, 'z'}, |
|
{LLAMA_GRETYPE_RULE_REF, 10}, |
|
{LLAMA_GRETYPE_RULE_REF, 3}, |
|
{LLAMA_GRETYPE_END, 0}, |
|
|
|
{LLAMA_GRETYPE_CHAR, '0'}, |
|
{LLAMA_GRETYPE_CHAR_RNG_UPPER, '9'}, |
|
{LLAMA_GRETYPE_RULE_REF, 11}, |
|
{LLAMA_GRETYPE_RULE_REF, 3}, |
|
{LLAMA_GRETYPE_END, 0}, |
|
|
|
{LLAMA_GRETYPE_CHAR, 'a'}, |
|
{LLAMA_GRETYPE_CHAR_RNG_UPPER, 'z'}, |
|
{LLAMA_GRETYPE_CHAR_ALT, '0'}, |
|
{LLAMA_GRETYPE_CHAR_RNG_UPPER, '9'}, |
|
{LLAMA_GRETYPE_CHAR_ALT, '_'}, |
|
{LLAMA_GRETYPE_RULE_REF, 10}, |
|
{LLAMA_GRETYPE_ALT, 0}, |
|
{LLAMA_GRETYPE_END, 0}, |
|
|
|
{LLAMA_GRETYPE_CHAR, '0'}, |
|
{LLAMA_GRETYPE_CHAR_RNG_UPPER, '9'}, |
|
{LLAMA_GRETYPE_RULE_REF, 11}, |
|
{LLAMA_GRETYPE_ALT, 0}, |
|
{LLAMA_GRETYPE_END, 0}, |
|
|
|
{LLAMA_GRETYPE_CHAR, ' '}, |
|
{LLAMA_GRETYPE_CHAR_ALT, '\t'}, |
|
{LLAMA_GRETYPE_CHAR_ALT, '\n'}, |
|
{LLAMA_GRETYPE_RULE_REF, 12}, |
|
{LLAMA_GRETYPE_ALT, 0}, |
|
{LLAMA_GRETYPE_END, 0}, |
|
}); |
|
|
|
return 0; |
|
} |
|
|