Descent XML
An XML Parser Helper Library
Loading...
Searching...
No Matches
lex.h
Go to the documentation of this file.
1/*
2 * XMLTree - An XML Parser-Helper Library
3 * Copyright (C) 2025 Marcus Harrison
4 *
5 * This program is free software: you can redistribute it and/or modify
6 * it under the terms of the GNU General Public License as published by
7 * the Free Software Foundation, either version 3 of the License, or
8 * (at your option) any later version.
9 *
10 * This program is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 * GNU General Public License for more details.
14 *
15 * You should have received a copy of the GNU General Public License
16 * along with this program. If not, see <https://www.gnu.org/licenses/>.
17 */
18
19#ifndef DESCENT_XML_LEX
20#define DESCENT_XML_LEX
21
22#ifdef __cplusplus
23extern "C" {
24#endif
25
26
27
28#include <wchar.h>
29#include <wctype.h>
30
31#include <libadt.h>
32
33#include "classifier.h"
34
47
51 struct libadt_const_lptr script;
52
58 struct libadt_const_lptr value;
59};
60
61inline ssize_t _descent_xml_lex_mbrtowc(
62 wchar_t *result,
63 struct libadt_const_lptr string,
64 mbstate_t *_mbstate
65)
66{
67 if (string.length <= 0) {
68 *result = L'\0';
69 return 0;
70 }
71 return (ssize_t)mbrtowc(
72 result,
73 string.buffer,
74 (size_t)string.length,
75 _mbstate
76 );
77}
78
79typedef struct {
80 ssize_t amount;
82 struct libadt_const_lptr script;
83} _descent_xml_lex_read_t;
84
85inline bool _descent_xml_lex_read_error(_descent_xml_lex_read_t read)
86{
87 return read.amount < 0
89}
90
91inline _descent_xml_lex_read_t _descent_xml_lex_read(
92 struct libadt_const_lptr script,
93 descent_xml_classifier_fn *const previous
94)
95{
96 wchar_t c = 0;
97 mbstate_t mbs = { 0 };
98 _descent_xml_lex_read_t result = { 0 };
99 result.amount = _descent_xml_lex_mbrtowc(&c, script, &mbs);
100 if (_descent_xml_lex_read_error(result))
102 else
103 result.type = (descent_xml_classifier_fn*)previous(c);
104
105 result.script = libadt_const_lptr_index(script, (ssize_t)result.amount);
106 return result;
107}
108
/*
 * Classifier-shaped functions used as token-type tags by this header:
 * the descent_xml_lex_handle_doctype/xmldecl/cdata/comment functions
 * store the matching function in a token's `type` once the whole
 * construct has been recognised. Only the declarations are visible
 * here; the definitions live elsewhere.
 */
descent_xml_classifier_void_fn *descent_xml_lex_doctype(wchar_t input);
descent_xml_classifier_void_fn *descent_xml_lex_xmldecl(wchar_t input);
descent_xml_classifier_void_fn *descent_xml_lex_cdata(wchar_t input);
descent_xml_classifier_void_fn *descent_xml_lex_comment(wchar_t input);
113
122 struct libadt_const_lptr script
123)
124{
125 return (struct descent_xml_lex) {
127 .script = script,
128 .value = libadt_const_lptr_truncate(script, 0),
129 };
130}
131
132inline bool _descent_xml_lex_startswith(
133 struct libadt_const_lptr string,
134 struct libadt_const_lptr start
135)
136{
137 if (string.size != start.size)
138 return false;
139 if (string.length < start.length)
140 return false;
141
142 return libadt_const_lptr_equal(
143 libadt_const_lptr_truncate(string, (size_t)start.length),
144 start
145 );
146}
147
148inline ssize_t _descent_xml_lex_count_spaces(
149 struct libadt_const_lptr next
150)
151{
152 ssize_t spaces = 0;
153 wchar_t c = 0;
154 mbstate_t mbstate = { 0 };
155 for (
156 ssize_t current = _descent_xml_lex_mbrtowc(&c, next, &mbstate);
157 iswspace((wint_t)c);
158 next = libadt_const_lptr_index(next, current),
159 current = _descent_xml_lex_mbrtowc(&c, next, &mbstate)
160 ) {
161 const bool unexpected = c == L'\0'
162 || current < 0;
163 if (unexpected)
164 return -1;
165
166 spaces += current;
167 }
168
169 return spaces;
170}
171
172inline struct libadt_const_lptr _descent_xml_lex_remainder(
173 struct descent_xml_lex token
174)
175{
176 return libadt_const_lptr_after(token.script, token.value);
177}
178
/*
 * Function type of a lexing step: takes a token by value and returns
 * a token. descent_xml_lex_then(), descent_xml_lex_or() and
 * descent_xml_lex_optional() accept pointers to functions of this type.
 */
typedef struct descent_xml_lex _descent_xml_lex_section(struct descent_xml_lex);
180
181inline struct descent_xml_lex descent_xml_lex_then(
182 struct descent_xml_lex token,
183 _descent_xml_lex_section *section
184)
185{
186 if (
188 || token.type == descent_xml_classifier_eof
189 )
190 return token;
191
192 struct descent_xml_lex result = section(token);
193
195 token.type = result.type;
196 return token;
197 }
198 return result;
199}
200
201inline struct descent_xml_lex descent_xml_lex_or(
202 struct descent_xml_lex token,
203 _descent_xml_lex_section *left,
204 _descent_xml_lex_section *right
205)
206{
207 struct descent_xml_lex result = descent_xml_lex_then(token, left);
209 result = descent_xml_lex_then(token, right);
210 return result;
211}
212
213inline struct descent_xml_lex descent_xml_lex_optional(
214 struct descent_xml_lex token,
215 _descent_xml_lex_section *section
216)
217{
218 struct descent_xml_lex result
219 = descent_xml_lex_then(token, section);
221 return token;
222 return result;
223}
224
225inline struct descent_xml_lex _descent_xml_lex_space(
226 struct descent_xml_lex token
227)
228{
229 struct libadt_const_lptr remainder
230 = _descent_xml_lex_remainder(token);
231 ssize_t spaces = _descent_xml_lex_count_spaces(remainder);
232 if (spaces <= 0) {
234 return token;
235 }
236 token.value.length += spaces;
237 return token;
238}
239
240inline struct descent_xml_lex _descent_xml_lex_name(
241 struct descent_xml_lex token
242)
243{
244 struct libadt_const_lptr remainder
245 = _descent_xml_lex_remainder(token);
246 _descent_xml_lex_read_t read
247 = _descent_xml_lex_read(remainder, descent_xml_classifier_element);
248
249 if (read.type == descent_xml_classifier_unexpected) {
250 token.type = read.type;
251 return token;
252 }
253
254 ssize_t total = 0;
255 while (read.type == descent_xml_classifier_element_name) {
256 if (_descent_xml_lex_read_error(read)) {
258 return token;
259 }
260
261 total += read.amount;
262 read = _descent_xml_lex_read(read.script, read.type);
263 }
264
265 token.value.length += total;
266 return token;
267}
268
269inline struct descent_xml_lex _descent_xml_lex_assign(
270 struct descent_xml_lex token
271)
272{
273 struct libadt_const_lptr remainder
274 = _descent_xml_lex_remainder(token);
275 // TODO: do this properly
276 if (*(char*)remainder.buffer != '=') {
278 } else {
279 token.value.length++;
280 }
281 return token;
282}
283
284inline struct descent_xml_lex _descent_xml_lex_quote_string(
285 struct descent_xml_lex token
286)
287{
288 struct libadt_const_lptr remainder
289 = _descent_xml_lex_remainder(token);
290 _descent_xml_lex_read_t read
291 = _descent_xml_lex_read(
292 remainder,
293 descent_xml_classifier_attribute_assign
294 );
295
296 // these names are too fucking long
297 const bool quote
298 = read.type == descent_xml_classifier_attribute_value_single_quote_start
299 || read.type == descent_xml_classifier_attribute_value_double_quote_start;
300 if (_descent_xml_lex_read_error(read) || !quote) {
302 return token;
303 }
304
305 ssize_t total = 0;
306
307 bool end_quote
308 = read.type == descent_xml_classifier_attribute_value_single_quote_end
309 || read.type == descent_xml_classifier_attribute_value_double_quote_end;
310 bool error
312 || read.type == descent_xml_classifier_eof;
313 while (!end_quote) {
314 if (_descent_xml_lex_read_error(read) || error) {
316 return token;
317 }
318
319 total += read.amount;
320
321 read = _descent_xml_lex_read(read.script, read.type);
322 end_quote
323 = read.type == descent_xml_classifier_attribute_value_single_quote_end
324 || read.type == descent_xml_classifier_attribute_value_double_quote_end;
325 error
327 || read.type == descent_xml_classifier_eof;
328 }
329
330 total++;
331
332 token.value.length += total;
333 return token;
334}
335
336inline struct descent_xml_lex _descent_xml_lex_doctype_str(
337 struct descent_xml_lex token
338)
339{
340 struct libadt_const_lptr remainder
341 = _descent_xml_lex_remainder(token);
342 const struct libadt_const_lptr
343 doctypedecl = libadt_str_literal("!DOCTYPE");
344
345 if (!_descent_xml_lex_startswith(remainder, doctypedecl)) {
347 return token;
348 }
349
350 token.value.length += doctypedecl.length;
351 return token;
352}
353
354inline struct descent_xml_lex _descent_xml_lex_xmldecl_str(
355 struct descent_xml_lex token
356)
357{
358 struct libadt_const_lptr remainder
359 = _descent_xml_lex_remainder(token);
360 const struct libadt_const_lptr
361 doctypedecl = libadt_str_literal("?xml");
362
363 if (!_descent_xml_lex_startswith(remainder, doctypedecl)) {
365 return token;
366 }
367
368 token.value.length += doctypedecl.length;
369 return token;
370}
371
372
373inline struct descent_xml_lex _descent_xml_lex_doctype_system(
374 struct descent_xml_lex token
375)
376{
377 struct libadt_const_lptr remainder
378 = _descent_xml_lex_remainder(token);
379 const struct libadt_const_lptr
380 systemid = libadt_str_literal("SYSTEM");
381
382 if (!_descent_xml_lex_startswith(remainder, systemid)) {
384 return token;
385 }
386
387 token.value.length += systemid.length;
388 token = descent_xml_lex_then(token, _descent_xml_lex_space);
389 token = descent_xml_lex_then(token, _descent_xml_lex_quote_string);
390 return token;
391}
392
393inline struct descent_xml_lex _descent_xml_lex_doctype_public(
394 struct descent_xml_lex token
395)
396{
397 struct libadt_const_lptr remainder
398 = _descent_xml_lex_remainder(token);
399 const struct libadt_const_lptr
400 publicid = libadt_str_literal("PUBLIC");
401
402 if (!_descent_xml_lex_startswith(remainder, publicid)) {
404 return token;
405 }
406
407 token.value.length += publicid.length;
408 token = descent_xml_lex_then(token, _descent_xml_lex_space);
409 token = descent_xml_lex_then(token, _descent_xml_lex_quote_string);
410 token = descent_xml_lex_then(token, _descent_xml_lex_space);
411 token = descent_xml_lex_then(token, _descent_xml_lex_quote_string);
412 return token;
413}
414
415inline struct descent_xml_lex _descent_xml_lex_doctype_extrawurst(
416 struct descent_xml_lex token
417)
418{
419 token = descent_xml_lex_then(
420 token,
421 _descent_xml_lex_space
422 );
423 if (token.type == descent_xml_classifier_unexpected)
424 return token;
425
426 token = descent_xml_lex_or(
427 token,
428 _descent_xml_lex_doctype_system,
429 _descent_xml_lex_doctype_public
430 );
431 return token;
432}
433
434inline struct descent_xml_lex descent_xml_lex_handle_doctype(
435 struct descent_xml_lex token
436)
437{
438 token = descent_xml_lex_then(token, _descent_xml_lex_doctype_str);
439 token = descent_xml_lex_then(token, _descent_xml_lex_space);
440 token = descent_xml_lex_then(token, _descent_xml_lex_name);
441 token = descent_xml_lex_optional(
442 token,
443 _descent_xml_lex_doctype_extrawurst
444 );
445 token = descent_xml_lex_optional(
446 token,
447 _descent_xml_lex_space
448 );
449
450 if (token.type == descent_xml_classifier_unexpected)
451 return token;
452
453 token.value = libadt_const_lptr_index(token.value, 1);
454 token.type = descent_xml_lex_doctype;
455 return token;
456}
457
458inline struct descent_xml_lex _descent_xml_lex_attribute_value(
459 struct descent_xml_lex token
460)
461{
462 token = descent_xml_lex_then(token, _descent_xml_lex_space);
463 token = descent_xml_lex_then(token, _descent_xml_lex_name);
464 token = descent_xml_lex_optional(token, _descent_xml_lex_space);
465 token = descent_xml_lex_then(token, _descent_xml_lex_assign);
466 token = descent_xml_lex_optional(token, _descent_xml_lex_space);
467 token = descent_xml_lex_then(token, _descent_xml_lex_quote_string);
468 return token;
469}
470
471inline struct descent_xml_lex descent_xml_lex_handle_xmldecl(
472 struct descent_xml_lex token
473)
474{
475 token = descent_xml_lex_then(token, _descent_xml_lex_xmldecl_str);
476 token = descent_xml_lex_then(token, _descent_xml_lex_attribute_value);
477
478 struct descent_xml_lex next = token;
479 while (
480 (next = descent_xml_lex_then(
481 next,
482 _descent_xml_lex_attribute_value
484 ) {
485 token = next;
486 }
487 token = descent_xml_lex_optional(token, _descent_xml_lex_space);
488
489 if (token.type == descent_xml_classifier_unexpected)
490 return token;
491
492 struct libadt_const_lptr remainder = _descent_xml_lex_remainder(token);
493 // TODO: do this properly sometime
494 if (*(char*)remainder.buffer != '?') {
496 return token;
497 } else {
498 token.value.length++;
499 }
500
501 token.value = libadt_const_lptr_index(token.value, 1);
502 token.type = descent_xml_lex_xmldecl;
503 return token;
504}
505
506inline struct descent_xml_lex _descent_xml_lex_handle_prolog(
507 struct descent_xml_lex token
508)
509{
510 return descent_xml_lex_or(
511 token,
512 descent_xml_lex_handle_xmldecl,
513 descent_xml_lex_handle_doctype
514 );
515}
516
517inline struct descent_xml_lex descent_xml_lex_handle_cdata(
518 struct descent_xml_lex token
519)
520{
521 struct libadt_const_lptr remainder
522 = _descent_xml_lex_remainder(token);
523 const struct libadt_const_lptr cdata
524 = libadt_str_literal("![CDATA[");
525 ssize_t total = 0;
526 if (!_descent_xml_lex_startswith(remainder, cdata)) {
528 return token;
529 }
530
531 total += cdata.length;
532 remainder = libadt_const_lptr_index(remainder, cdata.length);
533
534 const struct libadt_const_lptr cdata_end
535 = libadt_str_literal("]]");
536
537 while (!_descent_xml_lex_startswith(remainder, cdata_end)) {
538 if (remainder.length <= 0) {
540 return token;
541 }
542 total++;
543 remainder = libadt_const_lptr_index(remainder, 1);
544 }
545
546 total += cdata_end.length;
547 token.type = descent_xml_lex_cdata;
548 token.value = libadt_const_lptr_index(token.value, 1);
549 token.value.length += total;
550 return token;
551}
552
553inline struct descent_xml_lex descent_xml_lex_handle_comment(
554 struct descent_xml_lex token
555)
556{
557 struct libadt_const_lptr remainder
558 = _descent_xml_lex_remainder(token);
559 const struct libadt_const_lptr comment
560 = libadt_str_literal("!--");
561 ssize_t total = 0;
562 if (!_descent_xml_lex_startswith(remainder, comment)) {
564 return token;
565 }
566
567 total += comment.length;
568 remainder = libadt_const_lptr_index(remainder, comment.length);
569
570 const struct libadt_const_lptr comment_end
571 = libadt_str_literal("--");
572
573 while (!_descent_xml_lex_startswith(remainder, comment_end)) {
574 if (remainder.length <= 0) {
576 return token;
577 }
578 total++;
579 remainder = libadt_const_lptr_index(remainder, 1);
580 }
581
582 total += comment_end.length;
583 token.type = descent_xml_lex_comment;
584 token.value = libadt_const_lptr_index(token.value, 1);
585 token.value.length += total;
586 return token;
587}
588
589inline struct descent_xml_lex _descent_xml_lex_handle_unmarkdown(
590 struct descent_xml_lex token
591)
592{
593 return descent_xml_lex_or(
594 token,
595 descent_xml_lex_handle_comment,
596 descent_xml_lex_handle_cdata
597 );
598}
599
609 struct descent_xml_lex token
610)
611{
612 struct libadt_const_lptr next = _descent_xml_lex_remainder(token);
613
614 if (token.type == descent_xml_classifier_element) {
615 // all this bizarre XML syntax pisses me off so
616 // I'm just beating it into submission
617 struct descent_xml_lex test = descent_xml_lex_or(
618 token,
619 _descent_xml_lex_handle_prolog,
620 _descent_xml_lex_handle_unmarkdown
621 );
623 return test;
624 }
625
626 _descent_xml_lex_read_t
627 read = _descent_xml_lex_read(next, token.type),
628 previous_read = read;
629
630 if (_descent_xml_lex_read_error(read))
631 return (struct descent_xml_lex) {
632 .script = token.script,
634 .value = libadt_const_lptr_truncate(next, 0),
635 };
636
637 if (read.type == descent_xml_classifier_eof) {
638 return (struct descent_xml_lex) {
639 .script = token.script,
640 .type = read.type,
641 .value = libadt_const_lptr_truncate(next, (size_t)read.amount)
642 };
643 }
644
645 ssize_t value_length = read.amount;
646 for (
647 read = _descent_xml_lex_read(read.script, read.type);
648 !_descent_xml_lex_read_error(read);
649 read = _descent_xml_lex_read(read.script, read.type)
650 ) {
651 if (read.type != previous_read.type)
652 break;
653
654 previous_read = read;
655 value_length += read.amount;
656 }
657
658 return (struct descent_xml_lex) {
659 .script = token.script,
660 .type = previous_read.type,
661 .value = libadt_const_lptr_truncate(next, (size_t)value_length),
662 };
663}
664
665#ifdef __cplusplus
666} // extern "C"
667#endif
668
669#endif // DESCENT_XML_LEX
descent_xml_classifier_void_fn * descent_xml_classifier_start(wchar_t input)
void descent_xml_classifier_void_fn(void)
Definition classifier.h:59
descent_xml_classifier_fn *const descent_xml_classifier_unexpected
descent_xml_classifier_void_fn * descent_xml_classifier_fn(wchar_t input)
Definition classifier.h:67
descent_xml_classifier_fn *const descent_xml_classifier_eof
struct descent_xml_lex descent_xml_lex_next_raw(struct descent_xml_lex token)
Returns the next raw token in the script referred to by token.
Definition lex.h:608
struct descent_xml_lex descent_xml_lex_init(struct libadt_const_lptr script)
Initializes a token object for use in descent_xml_lex_next().
Definition lex.h:121
Represents a single token.
Definition lex.h:42
struct libadt_const_lptr script
A pointer to the full script.
Definition lex.h:51
struct libadt_const_lptr value
A pointer to the classified value.
Definition lex.h:58
descent_xml_classifier_fn * type
Represents the type of token classified.
Definition lex.h:46