1
/*  bytelex.h - XML lexical recognizers for pseudo-ASCII or
    EBCDIC-family byte streams
    Copyright (c) 2008, Robert D. Cameron.
    Licensed to the public under the Open Software License 3.0.
    Licensed to International Characters, Inc., under the Academic
    [the remainder of this license notice is missing from this copy]
*/
12
#include "multiliteral.h"
14
template<CodeUnit_Base C, unsigned char c>
15
inline bool AtChar(unsigned char x8data[]) {
16
return x8data[0] == Ord<C, c>::value;
19
template<CodeUnit_Base C>
20
inline bool AtQuote(unsigned char x8data[]) {
21
return (x8data[0] == Ord<C, '"'>::value) | (x8data[0] == Ord<C, '\''>::value);
24
/* In both ASCII and EBCDIC, digits are in a contiguous range
   from 0 through 9. */
26
template<CodeUnit_Base C>
27
inline bool at_Digit(unsigned char x8data[]) {
28
return (x8data[0] >= Ord<C, '0'>::value) & (x8data[0] <= Ord<C, '9'>::value);
31
template<CodeUnit_Base C>
32
inline unsigned int DigitVal(unsigned char d) {
33
return d - Ord<C, '0'>::value;
36
/* In both ASCII and EBCDIC, digits are in a contiguous range
   from 0 through 9.  Similarly the hex characters A through F
   and a through f are also in contiguous ranges that differ in
   only one bit position (Ord<C, 'a'>::value ^ Ord<C, 'A'>::value). */
40
template<CodeUnit_Base C>
41
inline bool at_HexDigit(unsigned char x8data[]) {
42
const unsigned char case_bit = Ord<C, 'a'>::value ^ Ord<C, 'A'>::value;
43
const unsigned char canon_A = Ord<C, 'a'>::value | case_bit;
44
const unsigned char canon_F = Ord<C, 'f'>::value | case_bit;
45
unsigned char ch = x8data[0];
46
unsigned char canon_ch = ch | case_bit;
47
return ((ch >= Ord<C, '0'>::value) & (ch <= Ord<C, '9'>::value)) |
48
((canon_ch >= canon_A) & (canon_ch <= canon_F));
51
template<CodeUnit_Base C>
52
inline unsigned int HexVal(unsigned char ch) {
53
const unsigned char case_bit = Ord<C, 'a'>::value ^ Ord<C, 'A'>::value;
54
const unsigned char canon_A = Ord<C, 'a'>::value | case_bit;
55
unsigned char canon_ch = ch | case_bit;
56
if ((ch >= Ord<C, '0'>::value) & (ch <= Ord<C, '9'>::value)) return ch - Ord<C, '0'>::value;
57
else return (ch | case_bit) - canon_A + 10;
60
// Whitespace recognition.  This varies between XML 1.0 and
// XML 1.1, but only the XML 1.0 version is needed.
63
template<CodeUnit_Base C>
64
inline bool at_WhiteSpace_10(unsigned char x8data[]) {
65
unsigned char ch = x8data[0];
66
return (ch == Ord<C, ' '>::value) ||
67
(ch == CR<C>::value) || (ch == LF<C>::value) || (ch == HT<C>::value);
72
template<CodeUnit_Base C>
73
inline bool at_EndTag_Start(unsigned char x8data[]) {
74
return s2int16(x8data) == c2int16<C, '<', '/'>::value;
77
template<CodeUnit_Base C>
78
inline bool at_Comment_Start(unsigned char x8data[]) {
79
return s4int32(x8data) == c4int32<C, '<', '!', '-', '-'>::value;
82
template<CodeUnit_Base C>
83
inline bool at_DoubleHyphen(unsigned char x8data[]) {
84
return s2int16(x8data) == c2int16<C, '-', '-'>::value;
87
template<CodeUnit_Base C>
88
inline bool at_Comment_End(unsigned char x8data[]) {
89
return s3int32(x8data) == c3int32<C, '-', '-', '>'>::value;
92
template<CodeUnit_Base C>
93
inline bool at_CDATA_Start(unsigned char x8data[]) {
94
return s8int64(x8data) ==
95
c8int64<C, '<', '!', '[', 'C', 'D', 'A', 'T', 'A'>::value;
98
template<CodeUnit_Base C>
99
inline bool at_CDATA_End(unsigned char x8data[]) {
100
return s3int32(x8data) == c3int32<C, ']', ']', '>'>::value;
103
template<CodeUnit_Base C>
104
inline bool at_PI_Start(unsigned char x8data[]) {
105
return s2int16(x8data) == c2int16<C, '<', '?'>::value;
108
template<CodeUnit_Base C>
109
inline bool at_PI_End(unsigned char x8data[]) {
110
return s2int16(x8data) == c2int16<C, '?', '>'>::value;
113
template<CodeUnit_Base C>
114
inline bool at_CharRef_Start(unsigned char x8data[]) {
115
return s2int16(x8data) == c2int16<C, '&', '#'>::value;
119
template<CodeUnit_Base C>
120
inline bool at_EqualsQuote(unsigned char x8data[]) {
121
uint16_t EQ = s2int16(x8data);
122
return (EQ == c2int16<C, '=', '"'>::value) | (EQ == c2int16<C, '=', '\''>::value);
125
template<CodeUnit_Base C>
126
inline bool at_xmlns(unsigned char x8data[]) {
127
return s5int64(x8data) == c5int64<C, 'x', 'm', 'l', 'n', 's'>::value;
130
template<CodeUnit_Base C>
131
inline bool at_EmptyElementDelim(unsigned char x8data[]) {
132
return s2int16(x8data) == c2int16<C, '/', '>'>::value;
135
template<CodeUnit_Base C>
136
inline bool at_XmlDecl_start(unsigned char x8data[]) {
137
return (s5int64(x8data) == c5int64<C, '<', '?', 'x', 'm', 'l'>::value) &&
138
at_WhiteSpace_10<C>(&x8data[5]);
141
template<CodeUnit_Base C>
142
inline bool at_version(unsigned char x8data[]) {
143
return s7int64(x8data) == c7int64<C, 'v', 'e', 'r', 's', 'i', 'o', 'n'>::value;
146
template<CodeUnit_Base C>
147
inline bool at_1_0(unsigned char x8data[]) {
148
return (s5int64(x8data) == c5int64<C, '"', '1', '.', '0', '"'>::value) ||
149
(s5int64(x8data) == c5int64<C, '\'', '1', '.', '0', '\''>::value);
152
template<CodeUnit_Base C>
153
inline bool at_1_1(unsigned char x8data[]) {
154
return (s5int64(x8data) == c5int64<C, '"', '1', '.', '1', '"'>::value) ||
155
(s5int64(x8data) == c5int64<C, '\'', '1', '.', '1', '\''>::value);
158
template<CodeUnit_Base C>
159
inline bool at_encoding(unsigned char x8data[]) {
160
return s8int64(x8data) == c8int64<C, 'e', 'n', 'c', 'o', 'd', 'i', 'n', 'g'>::value;
163
template<CodeUnit_Base C>
164
inline bool at_standalone(unsigned char x8data[]) {
165
return (s8int64(x8data) == c8int64<C, 's', 't', 'a', 'n', 'd', 'a', 'l', 'o'>::value) &
166
(s2int16(&x8data[8]) == c2int16<C, 'n', 'e'>::value);
169
template<CodeUnit_Base C>
170
inline bool at_yes(unsigned char x8data[]) {
171
return (s5int64(x8data) == c5int64<C, '"', 'y', 'e', 's', '"'>::value) |
172
(s5int64(x8data) == c5int64<C, '\'', 'y', 'e', 's', '\''>::value);
175
template<CodeUnit_Base C>
176
inline bool at_no(unsigned char x8data[]) {
177
return (s4int32(x8data) == c4int32<C, '"', 'n', 'o', '"'>::value) |
178
(s4int32(x8data) == c4int32<C, '\'', 'n', 'o', '\''>::value);
181
template<CodeUnit_Base C>
182
inline bool at_XxMmLll(unsigned char x8data[]) {
183
return caseless_comp<C, 'x', 'm', 'l'>(x8data);
186
/* The at_ElementTag_Start recognizer rules out '<!', '<?', '</'
   combinations while returning true for '<' followed by any NameStrt
   character (or, indeed, any other byte). */
190
template<CodeUnit_Base C>
191
inline bool at_ElementTag_Start(unsigned char x8data[]) {
192
return (x8data[0] == Ord<C, '<'>::value) & (x8data[1] != Ord<C, '!'>::value) &
193
(x8data[1] != Ord<C, '?'>::value) & (x8data[1] != Ord<C, '/'>::value);
196
/* The following ugly hack optimizes for ASCII. */
198
/* ASCII specialization of at_ElementTag_Start.
   In ASCII, '!' is 0x21, '/' is 0x2F and '?' is 0x3F; masking with 0xE1
   maps all three to 0x21, so one masked comparison replaces the three
   inequality tests of the generic version.  The mask also rejects every
   other byte of the form 001xxxx1 (e.g. '#', '-', odd digits, ';', '=');
   none of those is an ASCII letter, '_' or ':'.
   An explicit specialization must be introduced with template<>. */
template<>
inline bool at_ElementTag_Start<ASCII>(unsigned char x8data[]) {
  return (x8data[0] == Ord<ASCII, '<'>::value) &
         ((x8data[1] & 0xE1) != 0x21);
}
204
inline bool at_UTF_8(unsigned char x8data[]) {
205
return caseless_comp<ASCII, 'u', 't', 'f', '-', '8'>(x8data);
208
inline bool at_UCS_2(unsigned char x8data[]) {
209
return caseless_comp<ASCII, 'u', 'c', 's', '-', '2'>(x8data);
212
inline bool at_UCS_4(unsigned char x8data[]) {
213
return caseless_comp<ASCII, 'u', 'c', 's', '-', '4'>(x8data);
216
inline bool at_UCS_2LE(unsigned char x8data[]) {
217
return caseless_comp<ASCII, 'u', 'c', 's', '-', '2', 'l', 'e'>(x8data);
220
inline bool at_UCS_2BE(unsigned char x8data[]) {
221
return caseless_comp<ASCII, 'u', 'c', 's', '-', '2', 'b', 'e'>(x8data);
224
inline bool at_UCS_4LE(unsigned char x8data[]) {
225
return caseless_comp<ASCII, 'u', 'c', 's', '-', '4', 'l', 'e'>(x8data);
228
inline bool at_UCS_4BE(unsigned char x8data[]) {
229
return caseless_comp<ASCII, 'u', 'c', 's', '-', '4', 'b', 'e'>(x8data);
232
inline bool at_UTF_16(unsigned char x8data[]) {
233
return caseless_comp<ASCII, 'u', 't', 'f', '-', '1', '6'>(x8data);
236
inline bool at_UTF_32(unsigned char x8data[]) {
237
return caseless_comp<ASCII, 'u', 't', 'f', '-', '3', '2'>(x8data);
240
inline bool at_UTF_16LE(unsigned char x8data[]) {
241
return caseless_comp<ASCII, 'u', 't', 'f', '-', '1', '6', 'l', 'e'>(x8data);
244
inline bool at_UTF_32LE(unsigned char x8data[]) {
245
return caseless_comp<ASCII, 'u', 't', 'f', '-', '3', '2', 'l', 'e'>(x8data);
248
inline bool at_UTF_16BE(unsigned char x8data[]) {
249
return caseless_comp<ASCII, 'u', 't', 'f', '-', '1', '6', 'b', 'e'>(x8data);
252
inline bool at_UTF_32BE(unsigned char x8data[]) {
253
return caseless_comp<ASCII, 'u', 't', 'f', '-', '3', '2', 'b', 'e'>(x8data);
256
inline bool at_ASCII(unsigned char x8data[]) {
257
return caseless_comp<ASCII, 'a', 's', 'c', 'i', 'i'>(x8data);
260
inline bool at_Latin1(unsigned char x8data[]) {
261
return caseless_comp<ASCII, 'l', 'a', 't', 'i', 'n', '1'>(x8data);
264
inline bool at_EBCDIC(unsigned char x8data[]) {
265
return caseless_comp<EBCDIC, 'e', 'b', 'c', 'd', 'i', 'c'>(x8data);
268
template<CodeUnit_Base C>
269
inline bool at_DOCTYPE_start(unsigned char x8data[]) {
270
return s8int64(x8data) == c8int64<C, '<', '!','D', 'O', 'C', 'T', 'Y', 'P'>::value & AtChar<C,'E'>(&x8data[8]);
273
template<CodeUnit_Base C>
274
inline bool at_SYSTEM(unsigned char x8data[]) {
275
return s6int64(x8data) == c6int64<C, 'S', 'Y', 'S', 'T', 'E', 'M'>::value;
278
template<CodeUnit_Base C>
279
inline bool at_PUBLIC(unsigned char x8data[]) {
280
return s6int64(x8data) == c6int64<C, 'P', 'U', 'B', 'L', 'I', 'C'>::value;
283
template<CodeUnit_Base C>
284
inline bool at_ELEMENT(unsigned char x8data[]) {
285
return s7int64(x8data) == c7int64<C, 'E', 'L', 'E', 'M', 'E', 'N', 'T'>::value;
288
template<CodeUnit_Base C>
289
inline bool at_ATTLIST(unsigned char x8data[]) {
290
return s7int64(x8data) == c7int64<C, 'A', 'T', 'T', 'L', 'I', 'S', 'T'>::value;
293
template<CodeUnit_Base C>
294
inline bool at_ENTITY(unsigned char x8data[]) {
295
return s6int64(x8data) == c6int64<C, 'E', 'N', 'T', 'I', 'T', 'Y'>::value;
298
template<CodeUnit_Base C>
299
inline bool at_NOTATION(unsigned char x8data[]) {
300
return s8int64(x8data) == c8int64<C, 'N', 'O', 'T', 'A', 'T', 'I', 'O', 'N'>::value;
303
template<CodeUnit_Base C>
304
inline bool at_EMPTY(unsigned char x8data[]) {
305
return s5int64(x8data) == c5int64<C, 'E', 'M', 'P', 'T', 'Y'>::value;
308
template<CodeUnit_Base C>
309
inline bool at_PCDATA(unsigned char x8data[]) {
310
return s7int64(x8data) == c7int64<C, '#', 'P', 'C', 'D', 'A', 'T', 'A'>::value;
313
template<CodeUnit_Base C>
314
inline bool at_Para_star(unsigned char x8data[]) {
315
return s2int16(x8data) == c2int16<C, ')', '*'>::value;
318
template<CodeUnit_Base C>
319
inline bool at_CDATA(unsigned char x8data[]) {
320
return s5int64(x8data) == c5int64<C, 'C', 'D', 'A', 'T', 'A'>::value;
323
template<CodeUnit_Base C>
324
inline bool at_ID(unsigned char x8data[]) {
325
return s2int16(x8data) == c2int16<C, 'I', 'D'>::value;
328
template<CodeUnit_Base C>
329
inline bool at_IDREF(unsigned char x8data[]) {
330
return s5int64(x8data) == c5int64<C, 'I', 'D', 'R', 'E', 'F'>::value;
333
template<CodeUnit_Base C>
334
inline bool at_NDATA(unsigned char x8data[]) {
335
return s5int64(x8data) == c5int64<C, 'N', 'D', 'A', 'T', 'A'>::value;
338
template<CodeUnit_Base C>
339
inline bool at_IDREFS(unsigned char x8data[]) {
340
return s6int64(x8data) == c6int64<C, 'I', 'D', 'R', 'E', 'F', 'S'>::value;
343
template<CodeUnit_Base C>
344
inline bool at_ENTITIES(unsigned char x8data[]) {
345
return s8int64(x8data) == c8int64<C, 'E', 'N', 'T', 'I', 'T', 'I', 'E', 'S'>::value;
348
template<CodeUnit_Base C>
349
inline bool at_NMTOKEN(unsigned char x8data[]) {
350
return s7int64(x8data) == c7int64<C, 'N', 'M', 'T', 'O', 'K', 'E', 'N'>::value;
353
template<CodeUnit_Base C>
354
inline bool at_NMTOKENS(unsigned char x8data[]) {
355
return s8int64(x8data) == c8int64<C, 'N', 'M', 'T', 'O', 'K', 'E', 'N', 'S'>::value;
358
template<CodeUnit_Base C>
359
inline bool at_REQUIRED(unsigned char x8data[]) {
360
return s8int64(x8data) == c8int64<C, '#', 'R', 'E', 'Q', 'U', 'I', 'R', 'E'>::value
361
& AtChar<C,'D'>(&x8data[8]);
364
template<CodeUnit_Base C>
365
inline bool at_IMPLIED(unsigned char x8data[]) {
366
return s8int64(x8data) == c8int64<C, '#', 'I', 'M', 'P', 'L', 'I', 'E', 'D'>::value;
369
template<CodeUnit_Base C>
370
inline bool at_FIXED(unsigned char x8data[]) {
371
return s6int64(x8data) == c6int64<C, '#', 'F', 'I', 'X', 'E', 'D'>::value;
374
template<CodeUnit_Base C>
375
inline bool at_ANY(unsigned char x8data[]) {
376
return s3int32(x8data) == c3int32<C, 'A', 'N', 'Y'>::value;
379
template<CodeUnit_Base C>
380
inline bool at_INCLUDE(unsigned char x8data[]) {
381
return s7int64(x8data) == c7int64<C, 'I', 'N', 'C', 'L', 'U', 'D', 'E'>::value;
384
template<CodeUnit_Base C>
385
inline bool at_IGNORE(unsigned char x8data[]) {
386
return s6int64(x8data) == c6int64<C, 'I', 'G', 'N', 'O', 'R', 'E'>::value;
389
template<CodeUnit_Base C>
390
inline bool at_condSect_start(unsigned char x8data[]) {
391
return s3int32(x8data) == c3int32<C, '<', '!', '['>::value;
394
template<CodeUnit_Base C>
395
inline bool at_xml(unsigned char x8data[]) {
396
return (s4int32(x8data) == c4int32<C, '?', 'x', 'm', 'l'>::value);
399
template<CodeUnit_Base C>
400
inline bool at_PubidChar(unsigned char x8data[]) {
402
case Ord<C, '0'>::value: case Ord<C, '1'>::value:
403
case Ord<C, '2'>::value: case Ord<C, '3'>::value:
404
case Ord<C, '4'>::value: case Ord<C, '5'>::value:
405
case Ord<C, '6'>::value: case Ord<C, '7'>::value:
406
case Ord<C, '8'>::value: case Ord<C, '9'>::value:
407
case Ord<C, 'A'>::value: case Ord<C, 'a'>::value:
408
case Ord<C, 'B'>::value: case Ord<C, 'b'>::value:
409
case Ord<C, 'C'>::value: case Ord<C, 'c'>::value:
410
case Ord<C, 'D'>::value: case Ord<C, 'd'>::value:
411
case Ord<C, 'E'>::value: case Ord<C, 'e'>::value:
412
case Ord<C, 'F'>::value: case Ord<C, 'f'>::value:
413
case Ord<C, 'G'>::value: case Ord<C, 'g'>::value:
414
case Ord<C, 'H'>::value: case Ord<C, 'h'>::value:
415
case Ord<C, 'I'>::value: case Ord<C, 'i'>::value:
416
case Ord<C, 'J'>::value: case Ord<C, 'j'>::value:
417
case Ord<C, 'K'>::value: case Ord<C, 'k'>::value:
418
case Ord<C, 'L'>::value: case Ord<C, 'l'>::value:
419
case Ord<C, 'M'>::value: case Ord<C, 'm'>::value:
420
case Ord<C, 'N'>::value: case Ord<C, 'n'>::value:
421
case Ord<C, 'O'>::value: case Ord<C, 'o'>::value:
422
case Ord<C, 'P'>::value: case Ord<C, 'p'>::value:
423
case Ord<C, 'Q'>::value: case Ord<C, 'q'>::value:
424
case Ord<C, 'R'>::value: case Ord<C, 'r'>::value:
425
case Ord<C, 'S'>::value: case Ord<C, 's'>::value:
426
case Ord<C, 'T'>::value: case Ord<C, 't'>::value:
427
case Ord<C, 'U'>::value: case Ord<C, 'u'>::value:
428
case Ord<C, 'V'>::value: case Ord<C, 'v'>::value:
429
case Ord<C, 'W'>::value: case Ord<C, 'w'>::value:
430
case Ord<C, 'X'>::value: case Ord<C, 'x'>::value:
431
case Ord<C, 'Y'>::value: case Ord<C, 'y'>::value:
432
case Ord<C, 'Z'>::value: case Ord<C, 'z'>::value:
433
case Ord<C, '-'>::value: case Ord<C, '\''>::value:
434
case Ord<C, '('>::value: case Ord<C, ')'>::value:
435
case Ord<C, '+'>::value: case Ord<C, ','>::value:
436
case Ord<C, '.'>::value: case Ord<C, '/'>::value:
437
case Ord<C, ':'>::value: case Ord<C, '='>::value:
438
case Ord<C, '?'>::value: case Ord<C, ';'>::value:
439
case Ord<C, '!'>::value: case Ord<C, '*'>::value:
440
case Ord<C, '#'>::value: case Ord<C, '@'>::value:
441
case Ord<C, '$'>::value: case Ord<C, '_'>::value:
442
case Ord<C, '%'>::value: case Ord<C, ' '>::value:
443
case CR<C>::value: case LF<C>::value:
445
default: return false;