19#ifndef DESCENT_XML_LEX
20#define DESCENT_XML_LEX
61inline ssize_t _descent_xml_lex_mbrtowc(
63 struct libadt_const_lptr
string,
67 if (
string.length <= 0) {
71 return (ssize_t)mbrtowc(
74 (
size_t)
string.length,
82 struct libadt_const_lptr script;
83} _descent_xml_lex_read_t;
85inline bool _descent_xml_lex_read_error(_descent_xml_lex_read_t read)
87 return read.amount < 0
91inline _descent_xml_lex_read_t _descent_xml_lex_read(
92 struct libadt_const_lptr script,
97 mbstate_t mbs = { 0 };
98 _descent_xml_lex_read_t result = { 0 };
99 result.amount = _descent_xml_lex_mbrtowc(&c, script, &mbs);
100 if (_descent_xml_lex_read_error(result))
105 result.script = libadt_const_lptr_index(script, (ssize_t)result.amount);
122 struct libadt_const_lptr
script
132inline bool _descent_xml_lex_startswith(
133 struct libadt_const_lptr
string,
134 struct libadt_const_lptr start
137 if (
string.size != start.size)
139 if (
string.length < start.length)
142 return libadt_const_lptr_equal(
143 libadt_const_lptr_truncate(
string, (
size_t)start.length),
148inline ssize_t _descent_xml_lex_count_spaces(
149 struct libadt_const_lptr next
154 mbstate_t mbstate = { 0 };
156 ssize_t current = _descent_xml_lex_mbrtowc(&c, next, &mbstate);
158 next = libadt_const_lptr_index(next, current),
159 current = _descent_xml_lex_mbrtowc(&c, next, &mbstate)
161 const bool unexpected = c == L
'\0'
172inline struct libadt_const_lptr _descent_xml_lex_remainder(
176 return libadt_const_lptr_after(token.script, token.value);
183 _descent_xml_lex_section *section
195 token.type = result.
type;
203 _descent_xml_lex_section *left,
204 _descent_xml_lex_section *right
209 result = descent_xml_lex_then(token, right);
215 _descent_xml_lex_section *section
219 = descent_xml_lex_then(token, section);
229 struct libadt_const_lptr remainder
230 = _descent_xml_lex_remainder(token);
231 ssize_t spaces = _descent_xml_lex_count_spaces(remainder);
236 token.value.length += spaces;
244 struct libadt_const_lptr remainder
245 = _descent_xml_lex_remainder(token);
246 _descent_xml_lex_read_t read
247 = _descent_xml_lex_read(remainder, descent_xml_classifier_element);
250 token.type = read.type;
255 while (read.type == descent_xml_classifier_element_name) {
256 if (_descent_xml_lex_read_error(read)) {
261 total += read.amount;
262 read = _descent_xml_lex_read(read.script, read.type);
265 token.value.length += total;
273 struct libadt_const_lptr remainder
274 = _descent_xml_lex_remainder(token);
276 if (*(
char*)remainder.buffer !=
'=') {
279 token.value.length++;
288 struct libadt_const_lptr remainder
289 = _descent_xml_lex_remainder(token);
290 _descent_xml_lex_read_t read
291 = _descent_xml_lex_read(
293 descent_xml_classifier_attribute_assign
298 = read.type == descent_xml_classifier_attribute_value_single_quote_start
299 || read.type == descent_xml_classifier_attribute_value_double_quote_start;
300 if (_descent_xml_lex_read_error(read) || !quote) {
308 = read.type == descent_xml_classifier_attribute_value_single_quote_end
309 || read.type == descent_xml_classifier_attribute_value_double_quote_end;
314 if (_descent_xml_lex_read_error(read) || error) {
319 total += read.amount;
321 read = _descent_xml_lex_read(read.script, read.type);
323 = read.type == descent_xml_classifier_attribute_value_single_quote_end
324 || read.type == descent_xml_classifier_attribute_value_double_quote_end;
332 token.value.length += total;
340 struct libadt_const_lptr remainder
341 = _descent_xml_lex_remainder(token);
342 const struct libadt_const_lptr
343 doctypedecl = libadt_str_literal(
"!DOCTYPE");
345 if (!_descent_xml_lex_startswith(remainder, doctypedecl)) {
350 token.value.length += doctypedecl.length;
358 struct libadt_const_lptr remainder
359 = _descent_xml_lex_remainder(token);
360 const struct libadt_const_lptr
361 doctypedecl = libadt_str_literal(
"?xml");
363 if (!_descent_xml_lex_startswith(remainder, doctypedecl)) {
368 token.value.length += doctypedecl.length;
377 struct libadt_const_lptr remainder
378 = _descent_xml_lex_remainder(token);
379 const struct libadt_const_lptr
380 systemid = libadt_str_literal(
"SYSTEM");
382 if (!_descent_xml_lex_startswith(remainder, systemid)) {
387 token.value.length += systemid.length;
388 token = descent_xml_lex_then(token, _descent_xml_lex_space);
389 token = descent_xml_lex_then(token, _descent_xml_lex_quote_string);
397 struct libadt_const_lptr remainder
398 = _descent_xml_lex_remainder(token);
399 const struct libadt_const_lptr
400 publicid = libadt_str_literal(
"PUBLIC");
402 if (!_descent_xml_lex_startswith(remainder, publicid)) {
407 token.value.length += publicid.length;
408 token = descent_xml_lex_then(token, _descent_xml_lex_space);
409 token = descent_xml_lex_then(token, _descent_xml_lex_quote_string);
410 token = descent_xml_lex_then(token, _descent_xml_lex_space);
411 token = descent_xml_lex_then(token, _descent_xml_lex_quote_string);
419 token = descent_xml_lex_then(
421 _descent_xml_lex_space
426 token = descent_xml_lex_or(
428 _descent_xml_lex_doctype_system,
429 _descent_xml_lex_doctype_public
438 token = descent_xml_lex_then(token, _descent_xml_lex_doctype_str);
439 token = descent_xml_lex_then(token, _descent_xml_lex_space);
440 token = descent_xml_lex_then(token, _descent_xml_lex_name);
441 token = descent_xml_lex_optional(
443 _descent_xml_lex_doctype_extrawurst
445 token = descent_xml_lex_optional(
447 _descent_xml_lex_space
453 token.value = libadt_const_lptr_index(token.value, 1);
454 token.type = descent_xml_lex_doctype;
462 token = descent_xml_lex_then(token, _descent_xml_lex_space);
463 token = descent_xml_lex_then(token, _descent_xml_lex_name);
464 token = descent_xml_lex_optional(token, _descent_xml_lex_space);
465 token = descent_xml_lex_then(token, _descent_xml_lex_assign);
466 token = descent_xml_lex_optional(token, _descent_xml_lex_space);
467 token = descent_xml_lex_then(token, _descent_xml_lex_quote_string);
475 token = descent_xml_lex_then(token, _descent_xml_lex_xmldecl_str);
476 token = descent_xml_lex_then(token, _descent_xml_lex_attribute_value);
480 (next = descent_xml_lex_then(
482 _descent_xml_lex_attribute_value
487 token = descent_xml_lex_optional(token, _descent_xml_lex_space);
492 struct libadt_const_lptr remainder = _descent_xml_lex_remainder(token);
494 if (*(
char*)remainder.buffer !=
'?') {
498 token.value.length++;
501 token.value = libadt_const_lptr_index(token.value, 1);
502 token.type = descent_xml_lex_xmldecl;
510 return descent_xml_lex_or(
512 descent_xml_lex_handle_xmldecl,
513 descent_xml_lex_handle_doctype
521 struct libadt_const_lptr remainder
522 = _descent_xml_lex_remainder(token);
523 const struct libadt_const_lptr cdata
524 = libadt_str_literal(
"![CDATA[");
526 if (!_descent_xml_lex_startswith(remainder, cdata)) {
531 total += cdata.length;
532 remainder = libadt_const_lptr_index(remainder, cdata.length);
534 const struct libadt_const_lptr cdata_end
535 = libadt_str_literal(
"]]");
537 while (!_descent_xml_lex_startswith(remainder, cdata_end)) {
538 if (remainder.length <= 0) {
543 remainder = libadt_const_lptr_index(remainder, 1);
546 total += cdata_end.length;
547 token.type = descent_xml_lex_cdata;
548 token.value = libadt_const_lptr_index(token.value, 1);
549 token.value.length += total;
557 struct libadt_const_lptr remainder
558 = _descent_xml_lex_remainder(token);
559 const struct libadt_const_lptr comment
560 = libadt_str_literal(
"!--");
562 if (!_descent_xml_lex_startswith(remainder, comment)) {
567 total += comment.length;
568 remainder = libadt_const_lptr_index(remainder, comment.length);
570 const struct libadt_const_lptr comment_end
571 = libadt_str_literal(
"--");
573 while (!_descent_xml_lex_startswith(remainder, comment_end)) {
574 if (remainder.length <= 0) {
579 remainder = libadt_const_lptr_index(remainder, 1);
582 total += comment_end.length;
583 token.type = descent_xml_lex_comment;
584 token.value = libadt_const_lptr_index(token.value, 1);
585 token.value.length += total;
593 return descent_xml_lex_or(
595 descent_xml_lex_handle_comment,
596 descent_xml_lex_handle_cdata
612 struct libadt_const_lptr next = _descent_xml_lex_remainder(token);
614 if (token.type == descent_xml_classifier_element) {
619 _descent_xml_lex_handle_prolog,
620 _descent_xml_lex_handle_unmarkdown
626 _descent_xml_lex_read_t
627 read = _descent_xml_lex_read(next, token.type),
628 previous_read = read;
630 if (_descent_xml_lex_read_error(read))
632 .script = token.script,
634 .value = libadt_const_lptr_truncate(next, 0),
639 .script = token.script,
641 .value = libadt_const_lptr_truncate(next, (
size_t)read.amount)
645 ssize_t value_length = read.amount;
647 read = _descent_xml_lex_read(read.script, read.type);
648 !_descent_xml_lex_read_error(read);
649 read = _descent_xml_lex_read(read.script, read.type)
651 if (read.type != previous_read.type)
654 previous_read = read;
655 value_length += read.amount;
659 .script = token.script,
660 .type = previous_read.type,
661 .value = libadt_const_lptr_truncate(next, (
size_t)value_length),
descent_xml_classifier_void_fn * descent_xml_classifier_start(wchar_t input)
void descent_xml_classifier_void_fn(void)
Definition classifier.h:59
descent_xml_classifier_fn *const descent_xml_classifier_unexpected
descent_xml_classifier_void_fn * descent_xml_classifier_fn(wchar_t input)
Definition classifier.h:67
descent_xml_classifier_fn *const descent_xml_classifier_eof
struct descent_xml_lex descent_xml_lex_next_raw(struct descent_xml_lex token)
Returns the next raw token in the script referred to by token.
Definition lex.h:608
struct descent_xml_lex descent_xml_lex_init(struct libadt_const_lptr script)
Initializes a token object for use in descent_xml_lex_next().
Definition lex.h:121
Represents a single token.
Definition lex.h:42
struct libadt_const_lptr script
A pointer to the full script.
Definition lex.h:51
struct libadt_const_lptr value
A pointer to the classified value.
Definition lex.h:58
descent_xml_classifier_fn * type
Represents the type of token classified.
Definition lex.h:46