4
/* Test a single bit of a packed bit map.
   Bits are numbered most-significant first within each byte, so index 0 is
   bit 7 of bit_Map[0] and index 15 is bit 0 of bit_Map[1].
   Returns true when the bit selected by `codepoint` is set.
   No bounds check is performed; the caller must pass an index that the
   table actually covers. */
inline bool bit_test(unsigned char * bit_Map, int codepoint) {
	const int byte_index = codepoint / 8;
	const int shift = 7 - codepoint % 8;
	return (bit_Map[byte_index] >> shift) & 1;
}
9
bool is_XML10_NameStrt_codepoint(int codepoint) {
10
switch (codepoint >> 12) {
11
case 0: return bit_test(NameStrt_XML10_0000_11FF, codepoint);
12
case 1: if (codepoint <= 0x11FF)
13
return bit_test(NameStrt_XML10_0000_11FF, codepoint);
14
else if (codepoint < 0x1E00) return false;
15
else return bit_test(NameStrt_XML10_1E00_1FFF, codepoint & 0x1FF);
16
case 2: if (codepoint > 0x2182) return false;
17
else return bit_test(NameStrt_XML10_2000_21FF, codepoint & 0x1FF);
18
case 3: if (codepoint > 0x312C) return false;
19
else return bit_test(NameStrt_XML10_3000_31FF, codepoint & 0x1FF);
20
case 4: return codepoint >= 0x4E00;
21
case 5: case 6: case 7: case 8: return true;
22
case 9: return codepoint <= 0x9FA5;
23
case 0xA: return codepoint >= 0xAC00;
24
case 0xB: case 0xC: return true;
25
case 0xD: return codepoint <= 0xD7A3;
26
default: return false;
30
bool is_XML10_NameChar_codepoint(int codepoint) {
31
switch (codepoint >> 12) {
32
case 0: return bit_test(NameChar_XML10_0000_11FF, codepoint);
33
case 1: if (codepoint <= 0x11FF)
34
return bit_test(NameChar_XML10_0000_11FF, codepoint);
35
else if (codepoint < 0x1E00) return false;
36
else return bit_test(NameStrt_XML10_1E00_1FFF, codepoint & 0x1FF);
37
case 2: if (codepoint > 0x2182) return false;
38
else return bit_test(NameChar_XML10_2000_21FF, codepoint & 0x1FF);
39
case 3: if (codepoint > 0x312C) return false;
40
else return bit_test(NameChar_XML10_3000_31FF, codepoint & 0x1FF);
41
case 4: return codepoint >= 0x4E00;
42
case 5: case 6: case 7: case 8: return true;
43
case 9: return codepoint <= 0x9FA5;
44
case 0xA: return codepoint >= 0xAC00;
45
case 0xB: case 0xC: return true;
46
case 0xD: return codepoint <= 0xD7A3;
47
default: return false;
51
bool is_XML11_NameStrt_codepoint(int codepoint) {
52
if (likely(codepoint) <= 0x03FF) return bit_test(NameStrt_XML11_0000_03FF, codepoint);
53
else switch (codepoint >> 12) {
54
case 0: case 1: return true;
55
case 2: if (codepoint >= 0x2070)
56
if (codepoint <= 0x218F) return true;
57
else return (codepoint >= 0x2C00) & (codepoint <= 0x2FEF);
58
else return (codepoint >= 0x200C) & (codepoint <= 0x200D);
59
case 3: return codepoint >= 0x3001;
60
case 4: case 5: case 6: case 7: case 8: case 9: case 0xA: case 0xB: case 0xC: return true;
61
case 0xD: return codepoint <= 0xD7FF;
62
case 0xE: return false;
63
case 0xF: if (codepoint <= 0xFDCF) return codepoint >= 0xF900;
64
else return (codepoint >= 0xFDF0) & (codepoint <= 0xFFFD);
65
default: return codepoint <= 0xEFFFF;
69
bool is_XML11_NameChar_codepoint(int codepoint) {
70
if (likely(codepoint) <= 0x03FF) return bit_test(NameChar_XML11_0000_03FF, codepoint);
71
else switch (codepoint >> 12) {
72
case 0: case 1: return true;
73
case 2: if (codepoint >= 0x2070)
74
if (codepoint <= 0x218F) return true;
75
else return (codepoint >= 0x2C00) & (codepoint <= 0x2FEF);
76
else if (codepoint <= 0x200D) return codepoint >= 0x200C;
77
else return (codepoint == 0x203F) | (codepoint == 0x2040);
78
case 3: return codepoint >= 0x3001;
79
case 4: case 5: case 6: case 7: case 8: case 9: case 0xA: case 0xB: case 0xC: return true;
80
case 0xD: return codepoint <= 0xD7FF;
81
case 0xE: return false;
82
case 0xF: if (codepoint <= 0xFDCF) return codepoint >= 0xF900;
83
else return (codepoint >= 0xFDF0) & (codepoint <= 0xFFFD);
84
default: return codepoint <= 0xEFFFF;
88
inline int XML_10_UTF8_NameStrt_bytes (unsigned char bytes[]) {
89
if (bytes[0] <= 0x7F) {
90
if (bit_test(NameStrt_XML10_0000_11FF, (int) bytes[0])) return 1;
93
else if (bytes[0] <= 0xDF) {
94
int codepoint = ((bytes[0] & 0x3F) << 6) | (bytes[1] & 0x3F);
95
if (bit_test(NameStrt_XML10_0000_11FF, codepoint)) return 2;
98
else if (bytes[0] <= 0xEF) {
99
int codepoint = ((bytes[0] & 0x0F) << 12)| ((bytes[1] & 0x3F) << 6) | (bytes[2] & 0x3F);
100
return is_XML10_NameStrt_codepoint(codepoint) ? 3 : 0;
105
inline int XML_10_UTF8_NameChar_bytes (unsigned char bytes[]) {
106
if (bytes[0] <= 0x7F) {
107
if (bit_test(NameChar_XML10_0000_11FF, (int) bytes[0])) return 1;
110
else if (bytes[0] <= 0xDF) {
111
int codepoint = ((bytes[0] & 0x3F) << 6) | (bytes[1] & 0x3F);
112
if (bit_test(NameChar_XML10_0000_11FF, codepoint)) return 2;
115
else if (bytes[0] <= 0xEF) {
116
int codepoint = ((bytes[0] & 0x0F) << 12)| ((bytes[1] & 0x3F) << 6) | (bytes[2] & 0x3F);
117
return is_XML10_NameChar_codepoint(codepoint) ? 3 : 0;
122
inline int XML_11_UTF8_NameStrt_bytes (unsigned char bytes[]) {
123
if (bytes[0] <= 0x7F) {
124
if (bit_test(NameStrt_XML11_0000_03FF, (int) bytes[0])) return 1;
127
else if (bytes[0] <= 0xDF) {
128
int codepoint = ((bytes[0] & 0x3F) << 6) | (bytes[1] & 0x3F);
129
return is_XML11_NameStrt_codepoint(codepoint) ? 2 : 0;
131
else if (bytes[0] <= 0xEF) {
132
int codepoint = ((bytes[0] & 0x0F) << 12)| ((bytes[1] & 0x3F) << 6) | (bytes[2] & 0x3F);
133
return is_XML11_NameStrt_codepoint(codepoint) ? 3 : 0;
136
int codepoint = ((bytes[0] & 0x0F) << 18)| ((bytes[1] & 0x3F) << 12) |
137
((bytes[2] & 0x3F) << 6) | (bytes[3] & 0x3F);
138
return is_XML11_NameStrt_codepoint(codepoint) ? 4 : 0;
142
inline int XML_11_UTF8_NameChar_bytes (unsigned char bytes[]) {
143
if (bytes[0] <= 0x7F) {
144
if (bit_test(NameChar_XML11_0000_03FF, (int) bytes[0])) return 1;
147
else if (bytes[0] <= 0xDF) {
148
int codepoint = ((bytes[0] & 0x3F) << 6) | (bytes[1] & 0x3F);
149
return is_XML11_NameChar_codepoint(codepoint) ? 2 : 0;
151
else if (bytes[0] <= 0xEF) {
152
int codepoint = ((bytes[0] & 0x0F) << 12)| ((bytes[1] & 0x3F) << 6) | (bytes[2] & 0x3F);
153
return is_XML11_NameChar_codepoint(codepoint) ? 3 : 0;
156
int codepoint = ((bytes[0] & 0x0F) << 18)| ((bytes[1] & 0x3F) << 12) |
157
((bytes[2] & 0x3F) << 6) | (bytes[3] & 0x3F);
158
return is_XML11_NameChar_codepoint(codepoint) ? 4 : 0;
162
bool is_XML10_UTF8_Name(char protoname[], int lgth) {
163
int valid_bytes = XML_10_UTF8_NameStrt_bytes((unsigned char *) protoname);
164
int pos = valid_bytes;
165
while ((valid_bytes > 0) & (pos < lgth)) {
166
valid_bytes = XML_10_UTF8_NameChar_bytes((unsigned char *) &protoname[pos]);
170
/* Success requires that every byte sequence processed be valid
171
and that the total lgth processed be exactly that provided on
174
return (valid_bytes > 0) & (pos == lgth);
177
bool is_XML11_UTF8_Name(char protoname[], int lgth) {
178
int valid_bytes = XML_11_UTF8_NameStrt_bytes((unsigned char *) protoname);
179
int pos = valid_bytes;
180
while ((valid_bytes > 0) & (pos < lgth)) {
181
valid_bytes = XML_11_UTF8_NameChar_bytes((unsigned char *) &protoname[pos]);
184
/* Success requires that every byte sequence processed be valid
185
and that the total lgth processed be exactly that provided on
187
return (valid_bytes > 0) & (pos == lgth);
190
bool is_XML10_UTF8_Nmtoken(char prototoken[], int lgth) {
191
int valid_bytes = XML_10_UTF8_NameChar_bytes((unsigned char *) prototoken);
192
int pos = valid_bytes;
193
while ((valid_bytes > 0) & (pos < lgth)) {
194
valid_bytes = XML_10_UTF8_NameChar_bytes((unsigned char *) &prototoken[pos]);
198
/* Success requires that every byte sequence processed be valid
199
and that the total lgth processed be exactly that provided on
202
return (valid_bytes > 0) & (pos == lgth);
205
bool is_XML11_UTF8_Nmtoken(char prototoken[], int lgth) {
206
int valid_bytes = XML_11_UTF8_NameChar_bytes((unsigned char *) prototoken);
207
int pos = valid_bytes;
208
while ((valid_bytes > 0) & (pos < lgth)) {
209
valid_bytes = XML_11_UTF8_NameChar_bytes((unsigned char *) &prototoken[pos]);
212
/* Success requires that every byte sequence processed be valid
213
and that the total lgth processed be exactly that provided on
215
return (valid_bytes > 0) & (pos == lgth);
218
int Symbol_Table::Insert_Name(const char * name, int lgth) {
219
// char * s = copy_name(name,lgth);
220
char * s = pool.Insert(name,lgth);
221
UTF8NameMap[s]=++(globalNameCount);
223
name_data.name_string = s;
224
name_data.lgth = lgth;
225
UTF8NameTable.push_back(name_data);
226
return globalNameCount;
230
inline bool Verify_ASCII(char * name_ptr, int name_lgth) {
231
/* To verify that a name is ASCII, ensure that the high bit
232
of each byte is 0. A SIMD compare can verify this for
233
up to sizeof(BytePack) bytes. For less than 16 bytes,
234
first shift out bytes beyond the name length. For more
235
than 16 bytes, form the logical "or" of the successive byte
236
packs together so that a high 1 bit in any byte is preserved
237
for the final SIMD test. */
238
BytePack b = sisd_load_unaligned((BytePack *) name_ptr);
239
if (name_lgth <= sizeof(BytePack)) {
240
/* Clear bytes beyond the length of the name. */
241
b = sisd_sfl(b, sisd_from_int(8 * (sizeof(BytePack) - name_lgth)));
244
int offset = name_lgth % sizeof(BytePack);
245
for (int i = offset; i < name_lgth; i += sizeof(BytePack)) {
246
b = simd_or(sisd_load_unaligned((BytePack *) &name_ptr[i]),b);
249
#ifdef TEMPLATED_SIMD_LIB
250
return !simd_any_sign_bit<8>(b);
252
#ifndef TEMPLATED_SIMD_LIB
253
return !simd_any_sign_bit_8(b);
258
/* ASCII_LookupOrInsert determines the nameID for any ASCII name
259
from the global name table, inserting the name and allocating
260
a nameID if necessary. If the name is non-ASCII, 0 is returned. */
262
inline int Symbol_Table::ASCII_Lookup_or_Insert_Name(char * name_ptr, int name_lgth) {
264
if (Verify_ASCII(name_ptr, name_lgth)) {
265
return UTF8_Lookup_or_Insert_Name(name_ptr, name_lgth);
272
// Look `name` (lgth bytes) up in UTF8NameMap.  The visible code validates the
// name against the XML 1.0 / XML 1.1 Name rules and registers a new name:
// it is stored through pool.Insert, mapped to the next globalNameCount value
// and recorded in UTF8NameTable.
// NOTE(review): this view of the function is missing several short lines
// (closing braces, and presumably the not-found test, the final return and
// the conditional-compilation lines around the two validation variants) --
// confirm against the full file before editing.
int Symbol_Table::UTF8_Lookup_or_Insert_Name(char * name, int lgth) {
274
// Remember the byte just past the name.
// NOTE(review): the Nmtoken variant of this function overwrites that byte
// with '\0' for the lookup and restores it afterwards; the matching lines
// are not visible here -- presumably they were dropped from this view.
char delim = name[lgth];
276
// NOTE(review): if UTF8NameMap is a std-style map, operator[] inserts a
// default entry on a miss, keyed on this caller-owned pointer -- verify the
// container type.
int nameID = UTF8NameMap[name];
280
// Validation is compiled out when OMISSION is defined as NAME_VALIDATION.
#if (not defined(OMISSION)) or (OMISSION != NAME_VALIDATION)
282
// First validation variant: XML 1.1 rules only.
if (!is_XML11_UTF8_Name(name,lgth)) {
283
ShowSyntaxError(NT_Name);
288
// Second validation variant: try XML 1.0 rules first; when they fail and
// the document version is XML 1.1, retry with the XML 1.1 rules.
if (!is_XML10_UTF8_Name(name,lgth)) {
289
if (version == XML_1_1) {
290
if (!is_XML11_UTF8_Name(name,lgth)) {
291
ShowSyntaxError(NT_Name);
296
ShowSyntaxError(NT_Name);
302
// char * s = copy_name(name,lgth);
303
// Store the name through the pool and key the map on the returned pointer.
char * s = pool.Insert(name,lgth);
304
UTF8NameMap[s]=++(globalNameCount);
305
nameID = globalNameCount;
307
// Append the (string, length) record for the new ID.
name_data.name_string = s;
308
name_data.lgth = lgth;
309
UTF8NameTable.push_back(name_data);
310
// UTF8NameTable.push_back(s);
315
//char * Symbol_Table::Get_UTF8_name(int nameID) {
316
// return UTF8NameTable[nameID];
319
char * Symbol_Table::Get_UTF8_name(int nameID) {
320
return UTF8NameTable[nameID].name_string;
323
int Symbol_Table::Get_UTF8_lgth(int nameID) {
324
return UTF8NameTable[nameID].lgth;
327
inline int Symbol_Table::ASCII_Lookup_or_Insert_Nmtoken(char * nmtoken_ptr, int name_lgth) {
329
if (Verify_ASCII(nmtoken_ptr, name_lgth)) {
330
return UTF8_Lookup_or_Insert_Nmtoken(nmtoken_ptr, name_lgth);
336
// Look `nmtoken` (lgth bytes) up in UTF8NmtokenMap.  The visible code
// validates the token against the XML 1.0 / XML 1.1 Nmtoken rules and
// registers a new token: it is stored through pool.Insert, mapped to the
// next globalNmtokenCount value and recorded in UTF8NmtokenTable.
// NOTE(review): this view of the function is missing several short lines
// (closing braces, and presumably the not-found test, the final return and
// the conditional-compilation lines around the two validation variants) --
// confirm against the full file before editing.
int Symbol_Table::UTF8_Lookup_or_Insert_Nmtoken(char * nmtoken, int lgth) {
338
// Temporarily NUL-terminate the token in place for the map lookup, then
// restore the byte that was overwritten.
char delim = nmtoken[lgth];
339
nmtoken[lgth] = '\0';
340
// NOTE(review): if UTF8NmtokenMap is a std-style map, operator[] inserts a
// default entry on a miss, keyed on this caller-owned pointer -- verify the
// container type.
int nmtokenID = UTF8NmtokenMap[nmtoken];
341
nmtoken[lgth] = delim;
344
// Validation is compiled out when OMISSION is defined as NAME_VALIDATION.
#if (not defined(OMISSION)) or (OMISSION != NAME_VALIDATION)
346
// First validation variant: XML 1.1 rules only.
if (!is_XML11_UTF8_Nmtoken(nmtoken,lgth)) {
347
ShowSyntaxError(NT_Nmtoken);
352
// Second validation variant: try XML 1.0 rules first; when they fail and
// the document version is XML 1.1, retry with the XML 1.1 rules.
if (!is_XML10_UTF8_Nmtoken(nmtoken,lgth)) {
353
if (version == XML_1_1) {
354
if (!is_XML11_UTF8_Nmtoken(nmtoken,lgth)) {
355
ShowSyntaxError(NT_Nmtoken);
360
ShowSyntaxError(NT_Nmtoken);
366
// char * s = copy_name(name,lgth);
367
// Store the token through the pool and key the map on the returned pointer.
char * s = pool.Insert(nmtoken,lgth);
368
UTF8NmtokenMap[s]=++(globalNmtokenCount);
369
nmtokenID = globalNmtokenCount;
370
// Append the (string, length) record for the new ID.
Name_Data nmtoken_data;
371
nmtoken_data.name_string = s;
372
nmtoken_data.lgth = lgth;
373
UTF8NmtokenTable.push_back(nmtoken_data);
374
// UTF8NameTable.push_back(s);
382
char * Symbol_Table::Get_UTF8_nmtoken(int nmtokenID) {
383
return UTF8NmtokenTable[nmtokenID].name_string;
386
int Symbol_Table::Get_UTF8_nmtoken_lgth(int nmtokenID) {
387
return UTF8NmtokenTable[nmtokenID].lgth;
390
char * Symbol_Table::ReserveSymbolSpace(int u8_lgth) {
391
reserved = new char[u8_lgth+1];
392
reserved_lgth = u8_lgth;
396
// Look the buffer held in `reserved` up in UTF8NameMap.  The visible code
// validates it as an XML 1.0 Name and registers it as a new name: the map
// and the UTF8NameTable record both keep the `reserved` pointer itself (no
// copy is made).
// NOTE(review): this view of the function is missing several short lines
// (closing braces, and presumably the not-found test, the handling of an
// already-known name and the final return) -- confirm against the full file.
int Symbol_Table::LookupOrInsertReserved(){
397
// NOTE(review): if UTF8NameMap is a std-style map, operator[] inserts a
// default entry on a miss -- verify the container type.
int nameID = UTF8NameMap[reserved];
399
// Validation is compiled out when OMISSION is defined as NAME_VALIDATION.
#if (not defined(OMISSION)) or (OMISSION != NAME_VALIDATION)
400
// NOTE(review): only the XML 1.0 rules are checked here; the
// UTF8_Lookup_or_Insert_Name path also consults the XML 1.1 rules --
// confirm this difference is intended.
if (!is_XML10_UTF8_Name(reserved,reserved_lgth)) {
401
ShowSyntaxError(NT_Name);
405
// Assign the next name ID to the reserved buffer.
UTF8NameMap[reserved]=++(globalNameCount);
406
nameID = globalNameCount;
408
// Append the (string, length) record for the new ID.
name_data.name_string = reserved;
409
name_data.lgth = reserved_lgth;
410
UTF8NameTable.push_back(name_data);
411
// UTF8NameTable.push_back(s);
419
// Look the buffer held in `reserved` up in UTF8NmtokenMap.  The visible code
// validates it as an XML 1.0 Nmtoken and registers it as a new token: the
// map and the UTF8NmtokenTable record both keep the `reserved` pointer
// itself (no copy is made).
// NOTE(review): this view of the function is missing several short lines
// (closing braces, and presumably the not-found test, the handling of an
// already-known token and the final return) -- confirm against the full file.
int Symbol_Table::LookupOrInsertReserved_nmtoken(){
420
// NOTE(review): if UTF8NmtokenMap is a std-style map, operator[] inserts a
// default entry on a miss -- verify the container type.
int nmtokenID = UTF8NmtokenMap[reserved];
422
// Validation is compiled out when OMISSION is defined as NAME_VALIDATION.
#if (not defined(OMISSION)) or (OMISSION != NAME_VALIDATION)
423
// NOTE(review): only the XML 1.0 rules are checked here; the
// UTF8_Lookup_or_Insert_Nmtoken path also consults the XML 1.1 rules --
// confirm this difference is intended.
if (!is_XML10_UTF8_Nmtoken(reserved,reserved_lgth)) {
424
ShowSyntaxError(NT_Nmtoken);
428
// Assign the next nmtoken ID to the reserved buffer.
UTF8NmtokenMap[reserved]=++(globalNmtokenCount);
429
nmtokenID = globalNmtokenCount;
430
// Append the (string, length) record for the new ID.
Name_Data nmtoken_data;
431
nmtoken_data.name_string = reserved;
432
nmtoken_data.lgth = reserved_lgth;
433
UTF8NmtokenTable.push_back(nmtoken_data);
441
Symbol_Table::Symbol_Table(){
444
name_data.name_string = NULL;
446
UTF8NameTable.push_back(name_data);
447
// UTF8NameTable.push_back(NULL);
448
/* for (int i = 0; i < 5; i++) {
449
UTF8NameMap[predefined[i]] = ++(globalNameCount);
451
name_data.name_string = predefined[i];
452
name_data.lgth = strlen(predefined[i]);
453
UTF8NameTable.push_back(name_data);
454
printf("predefined name: %s, global name count: %d\n",predefined[i],globalNameCount);
455
// UTF8NameTable.push_back(predefined[i]);