1
/* engine.c - Parabix XML parsing engine.
2
Copyright (c) 2007, 2008, Robert D. Cameron and Dan Lin.
3
Licensed to the public under the Open Software License 3.0.
4
Licensed to International Characters, Inc., under the Academic
13
#include "contentmodel.h"
14
#include "contentmodel.c"
15
#include "xml_error.h"
25
// Copy lgth bytes from s into a newly allocated (new[]) char array of lgth+1 bytes.
// NOTE(review): termination and the return statement are not visible in this excerpt.
inline char * copy_string (unsigned char * s, int lgth){
26
// One byte more than the payload is reserved.
char * d = new char[lgth+1];
27
memcpy(d, (char *)s,lgth);
32
// Allocate (new[]) a char array of lgth1 + lgth2 + 1 bytes, place s2 at offset lgth1
// and NUL-terminate it.
// NOTE(review): the copy of s1 into the front of the array is not visible in this
// excerpt; presumably it is on a line not shown -- confirm.
inline char * cat_string (char * s1, char * s2, int lgth1, int lgth2){
33
char * s = new char[lgth1 + lgth2 + 1];
35
// s2 occupies bytes [lgth1, lgth1 + lgth2).
memcpy(&s[lgth1],s2,lgth2);
36
s[lgth1 + lgth2] = '\0';
41
// Factory: open the named file, read a 4-byte signature, and build a ParsingEngine
// whose buffer type is chosen from the detected code unit base and size.
// This overload allocates its own Model_Info and passes false as the is_external
// constructor argument.
template <WorkingCharacterSet W>
42
Parser_Interface<W> * Parser_Interface<W>::ParserFactory(char * filename) {
45
unsigned char signature[4];
47
infile = fopen(filename, "rb");
49
fprintf(stderr, "Error: cannot open %s for input.\n", filename);
52
// NOTE(review): the fread result is discarded; if fewer than 4 bytes are read,
// the remaining signature bytes are uninitialized when analyzed below.
fread(signature,1,4,infile);
53
Entity_Info * e = new Entity_Info;
54
Model_Info * m = new Model_Info;
55
e->AnalyzeSignature(signature);
56
Byteplex * b = Byteplex::ByteplexFactory(e, infile);
57
// The 4 signature bytes already consumed from the file are handed to the buffer.
b->InitializeBuffer(signature,4);
59
b->PreparePseudoASCII_Stream();
61
if (e->code_unit_base == ASCII) {
62
XML_Decl_Parser<ASCII> decl_parser(b);
63
decl_parser.ReadXMLInfo(*e);
64
if (e->code_unit_size == SingleByte) {
65
// Single-byte input: UTF8_Buffer when there is no encoding declaration or it is
// UTF-8; otherwise the generic X8_Buffer<ASCII>.
// NOTE(review): the other two ParserFactory overloads always use X8_Buffer<ASCII>
// for SingleByte input -- confirm the difference is intended.
if (!(e->has_encoding_decl) || at_UTF_8(e->encoding))
66
return new ParsingEngine< UTF8_Buffer, W>(e, m, b, false);
67
else return new ParsingEngine< X8_Buffer<ASCII>, W>(e, m, b, false);
69
else if (e->code_unit_size == DoubleByte) {
70
return new ParsingEngine<U16_Buffer, W>(e, m, b, false);
72
else if (e->code_unit_size == QuadByte) {
73
return new ParsingEngine<U32_Buffer, W>(e, m, b, false);
76
else /* if (e->code_unit_base == EBCDIC) */ {
77
XML_Decl_Parser<EBCDIC> decl_parser(b);
78
decl_parser.ReadXMLInfo(*e);
79
return new ParsingEngine< X8_Buffer<EBCDIC>, W>(e, m, b, false);
83
// Factory: like the single-argument file overload, but uses the caller-supplied
// Model_Info m and passes true as the is_external constructor argument.
// Parse_EntityRef and its variants call this overload for external entities.
template <WorkingCharacterSet W>
84
Parser_Interface<W> * Parser_Interface<W>::ParserFactory(char * filename, Model_Info * m) {
87
unsigned char signature[4];
89
infile = fopen(filename, "rb");
91
fprintf(stderr, "Error: cannot open %s for input.\n", filename);
94
// NOTE(review): the fread result is discarded; a file shorter than 4 bytes leaves
// part of signature uninitialized.
fread(signature,1,4,infile);
95
Entity_Info * e = new Entity_Info;
96
e->AnalyzeSignature(signature);
97
Byteplex * b = Byteplex::ByteplexFactory(e, infile);
98
b->InitializeBuffer(signature,4);
100
b->PreparePseudoASCII_Stream();
101
if (e->code_unit_base == ASCII) {
102
XML_Decl_Parser<ASCII> decl_parser(b);
103
decl_parser.ReadXMLInfo(*e);
104
if (e->code_unit_size == SingleByte) {
105
return new ParsingEngine< X8_Buffer<ASCII>, W>(e, m, b, true);
107
else if (e->code_unit_size == DoubleByte) {
108
return new ParsingEngine<U16_Buffer, W>(e, m, b, true);
110
else if (e->code_unit_size == QuadByte) {
111
return new ParsingEngine<U32_Buffer, W>(e, m, b, true);
114
else /* if (e->code_unit_base == EBCDIC) */ {
115
XML_Decl_Parser<EBCDIC> decl_parser(b);
116
decl_parser.ReadXMLInfo(*e);
117
return new ParsingEngine< X8_Buffer<EBCDIC>, W>(e, m, b, true);
121
// Factory: build a ParsingEngine over an in-memory byte buffer of byte_count bytes.
// A fresh Entity_Info is allocated and seeded from e1; the model m is shared with
// the caller. Passes false as the is_external constructor argument.
template <WorkingCharacterSet W>
122
Parser_Interface<W> * Parser_Interface<W>::ParserFactory(char * byte_buffer, int byte_count, Entity_Info * e1, Model_Info * m){
123
Entity_Info * e = new Entity_Info;
125
e->code_unit_base=e1->code_unit_base;
126
e->code_unit_size=e1->code_unit_size;
127
e->version=e1->version;
128
// NOTE(review): encoding is assigned, not duplicated, so e and e1 refer to the
// same encoding value -- confirm ownership if Entity_Info releases it.
e->encoding=e1->encoding;
129
e->content_start = 0;
130
Byteplex * b = Byteplex::ByteplexFactory(e, (unsigned char *) byte_buffer, byte_count);
132
b->PreparePseudoASCII_Stream();
133
if (e->code_unit_base == ASCII) {
134
XML_Decl_Parser<ASCII> decl_parser(b);
135
decl_parser.ReadXMLInfo(*e);
136
if (e->code_unit_size == SingleByte) {
138
return new ParsingEngine< X8_Buffer<ASCII>, W>(e, m, b, false);
140
else if (e->code_unit_size == DoubleByte) {
141
return new ParsingEngine<U16_Buffer, W>(e, m, b, false);
143
else if (e->code_unit_size == QuadByte) {
144
return new ParsingEngine<U32_Buffer, W>(e, m, b, false);
147
else /* if (e->code_unit_base == EBCDIC) */ {
148
return new ParsingEngine< X8_Buffer<EBCDIC>, W>(e, m, b, false);
152
template <WorkingCharacterSet W>
153
Parser_Interface<W>::~Parser_Interface() {
157
// True when entity_Info records a positive BOM_units count.
template <WorkingCharacterSet W>
158
bool Parser_Interface<W>::has_ByteOrderMark() {
159
return entity_Info->BOM_units > 0;
162
// Return the XML version stored in entity_Info.
template <WorkingCharacterSet W>
163
XML_version Parser_Interface<W>::get_version() {
164
return entity_Info->version;
167
// Return the standalone status stored in entity_Info.
template <WorkingCharacterSet W>
168
XML_standalone Parser_Interface<W>::standalone_status() {
169
return entity_Info->standalone;
172
// Return the has_encoding_decl flag stored in entity_Info.
template <WorkingCharacterSet W>
173
bool Parser_Interface<W>::has_EncodingDecl() {
174
return entity_Info->has_encoding_decl;
177
// Return the encoding pointer held by entity_Info (not a copy).
template <WorkingCharacterSet W>
178
unsigned char * Parser_Interface<W>::get_Encoding() {
179
return entity_Info->encoding;
182
// Translate absolute position pos into a byte pointer within byteplex->src_buffer.
// The position is made relative to buffer_base_pos and scaled by B::Size
// (presumably the size of one code unit in bytes -- confirm against the buffer classes).
template <class B, WorkingCharacterSet W>
183
inline unsigned char * ParsingEngine<B, W>::GetCodeUnitPtr(int pos) {
184
int rel_pos = pos - buffer_base_pos;
185
return &((unsigned char *) (byteplex->src_buffer))[rel_pos * (int) B::Size];
189
// UTF8_Buffer/UTF_8 variant: index x8data directly with the unscaled offset of pos
// from buffer_base_pos.
inline unsigned char * ParsingEngine<UTF8_Buffer, UTF_8>::GetCodeUnitPtr(int pos) {
190
int rel_pos = pos - buffer_base_pos;
191
return &((unsigned char *) (x8data))[rel_pos];
197
// Construct a parsing engine over the given entity info, model and byteplex:
// record the entity/model pointers, size the per-attribute occurrence table,
// allocate the bitplex and lexical stream set, install stream sentinels, and
// transpose and lexically analyze the initial buffer.
template <class B, WorkingCharacterSet W>
198
ParsingEngine<B, W>::ParsingEngine(Entity_Info * e, Model_Info * m, Byteplex * b, bool is_external) : Parser_Interface<W> () {
199
Parser_Interface<W>::entity_Info = e;
200
Parser_Interface<W>::model_info = m;
203
// m->symbol_table = new Symbol_Table();
204
// m->SimpleEntity("lt", "<");
205
// m->SimpleEntity("gt", ">");
206
// m->SimpleEntity("amp", "&");
207
// m->SimpleEntity("quot", "\"");
208
// m->SimpleEntity("apos", "'");
209
m->symbol_table->version = e->version;
211
StrictWellFormedness=false;
212
// One slot per global attribute known to the model so far, plus one.
LastAttOccurrence.assign(m->globalAttributeCount+1, 0);
215
bitplex = new Bitplex;
216
buf = (LexicalStreamSet *) simd_new(sizeof(LexicalStreamSet)/PACKSIZE);
218
/* Install sentinels for every lexical item stream*/
219
#ifdef TEMPLATED_SIMD_LIB
220
BitBlock sentinel_value = simd<1>::constant<1>();
222
#ifndef TEMPLATED_SIMD_LIB
223
BitBlock sentinel_value = simd_const_1(1);
226
#ifdef OPTIMIZE_SHORT_SCAN
227
sentinel_value = sisd_sfli(sentinel_value, 8*sizeof(unsigned long));
230
// The sentinel is written into the block at index BUFFER_BLOCKS of each item stream.
for (int j = minLexicalItem; j < LexicalItemCount; j++) {
231
buf->item_stream[j][BUFFER_BLOCKS] = sentinel_value;
235
buffer_rel_pos = e->content_start;
236
buffer_limit_pos = min(BUFFER_SIZE, byteplex->units_in_buffer);
237
// Round up to a whole number of blocks.
int blocks_in_buffer = (buffer_limit_pos + BLOCKSIZE - 1)/BLOCKSIZE;
238
x8data = byteplex->x8data;
239
lexer = Lexer<B::Base>::LexerFactory(e, buf);
240
bitplex->TransposeToBitStreams(byteplex->x8data, blocks_in_buffer);
241
lexer->AnalyzeBuffer(bitplex->x8basis, buffer_base_pos, buffer_rel_pos, buffer_limit_pos);
244
// Release the resources held by this engine. The shared Model_Info is not released
// here (see the open question in the comment below).
template <class B, WorkingCharacterSet W>
245
ParsingEngine<B, W>::~ParsingEngine() {
246
// How do we do this? Parser_Interface<W>::model_info->~Model_Info();
247
// Every ParserFactory overload creates the Entity_Info with plain `new`, so it must
// be released with `delete`; an explicit destructor call alone leaks its storage.
delete Parser_Interface<W>::entity_Info;
248
// NOTE(review): byteplex and lexer come from factory functions whose allocation
// method is not visible here, so they are still only destructed, not freed -- confirm
// and switch to the matching deallocation if they are heap objects.
byteplex->~Byteplex();
250
simd_delete((SIMD_type *) buf);
251
lexer->~Lexer_Interface();
254
template <class B, WorkingCharacterSet W>
255
void ParsingEngine<B, W>::AdvanceBuffers(){
256
#if defined(PAPI) and defined(CODE_CLOCKING) and (CODE_CLOCKING == ADVANCE_BUFFERS)
257
code_clocker->cc_start_interval();
260
int advance_amt = text_or_markup_start - buffer_base_pos;
261
advance_amt &= -PACKSIZE; // maintain alignment
262
byteplex->AdvanceInputBuffer(advance_amt);
263
buffer_base_pos += advance_amt;
264
buffer_rel_pos -= advance_amt;
265
buffer_limit_pos = min(BUFFER_SIZE, byteplex->units_in_buffer);
266
int blocks_in_buffer = (buffer_limit_pos + BLOCKSIZE - 1)/BLOCKSIZE;
267
#if defined(PAPI) and defined(CODE_CLOCKING) and (CODE_CLOCKING == BYTEPLEX)
268
code_clocker->cc_start_interval();
270
byteplex->DoByteplex();
271
byteplex->PreparePseudoASCII_Stream();
272
#if defined(PAPI) and defined(CODE_CLOCKING) and (CODE_CLOCKING == BYTEPLEX)
273
code_clocker->cc_end_interval(buffer_limit_pos);
275
#if defined(PAPI) and defined(CODE_CLOCKING) and (CODE_CLOCKING == BITPLEX)
276
code_clocker->cc_start_interval();
278
bitplex->TransposeToBitStreams(byteplex->x8data, blocks_in_buffer);
279
#if defined(PAPI) and defined(CODE_CLOCKING) and (CODE_CLOCKING == BITPLEX)
280
code_clocker->cc_end_interval(buffer_limit_pos);
282
lexer->AnalyzeBuffer(bitplex->x8basis, buffer_base_pos, buffer_rel_pos, buffer_limit_pos);
283
#if defined(PAPI) and defined(CODE_CLOCKING) and (CODE_CLOCKING == ADVANCE_BUFFERS)
284
code_clocker->cc_end_interval(buffer_limit_pos);
290
void ParsingEngine<U16_Buffer, UTF_16>::AdvanceBuffers(){
291
#if defined(PAPI) and defined(CODE_CLOCKING) and (CODE_CLOCKING == ADVANCE_BUFFERS)
292
code_clocker->cc_start_interval();
295
int advance_amt = text_or_markup_start - buffer_base_pos;
296
advance_amt &= -PACKSIZE; // maintain alignment
297
byteplex->AdvanceInputBuffer(advance_amt);
298
buffer_base_pos += advance_amt;
299
buffer_rel_pos -= advance_amt;
300
buffer_limit_pos = min(BUFFER_SIZE, byteplex->units_in_buffer);
301
int blocks_in_buffer = (buffer_limit_pos + BLOCKSIZE - 1)/BLOCKSIZE;
302
#if defined(PAPI) and defined(CODE_CLOCKING) and (CODE_CLOCKING == BYTEPLEX)
303
code_clocker->cc_start_interval();
305
byteplex->DoByteplex();
306
if (at_UTF_16(Parser_Interface<UTF_16>::entity_Info->encoding)) ((U16_Buffer *) byteplex)->Validate_UTF16();
307
byteplex->PreparePseudoASCII_Stream();
308
#if defined(PAPI) and defined(CODE_CLOCKING) and (CODE_CLOCKING == BYTEPLEX)
309
code_clocker->cc_end_interval(buffer_limit_pos);
311
#if defined(PAPI) and defined(CODE_CLOCKING) and (CODE_CLOCKING == BITPLEX)
312
code_clocker->cc_start_interval();
314
bitplex->TransposeToBitStreams(byteplex->x8data, blocks_in_buffer);
315
#if defined(PAPI) and defined(CODE_CLOCKING) and (CODE_CLOCKING == BITPLEX)
316
code_clocker->cc_end_interval(buffer_limit_pos);
318
lexer->AnalyzeBuffer(bitplex->x8basis, buffer_base_pos, buffer_rel_pos, buffer_limit_pos);
319
#if defined(PAPI) and defined(CODE_CLOCKING) and (CODE_CLOCKING == ADVANCE_BUFFERS)
320
code_clocker->cc_end_interval(buffer_limit_pos);
325
// Pointer to the byte of x8data at the current buffer-relative position.
template <class B, WorkingCharacterSet W>
326
inline unsigned char * ParsingEngine<B, W>::cur() const {
327
return &((unsigned char *) x8data)[buffer_rel_pos];
330
// Absolute position: the buffer's base position plus the position within the buffer.
template <class B, WorkingCharacterSet W>
331
inline int ParsingEngine<B, W>::AbsPos() const {
332
return buffer_base_pos + buffer_rel_pos;
335
// Distance from absolute position start_pos to the current absolute position.
template <class B, WorkingCharacterSet W>
336
inline int ParsingEngine<B, W>::LengthFrom(int start_pos) const {
337
return buffer_base_pos + buffer_rel_pos - start_pos;
342
// Current position relative to the start of the buffer.
template <class B, WorkingCharacterSet W>
343
inline int ParsingEngine<B, W>::BufferRelPos() const {
344
return buffer_rel_pos;
348
// True when the current position has reached buffer_limit_pos and that limit is
// below BUFFER_SIZE, i.e. the buffer holds fewer than BUFFER_SIZE units.
template <class B, WorkingCharacterSet W>
349
inline bool ParsingEngine<B, W>::at_EOF() const {
350
return (buffer_rel_pos >= buffer_limit_pos) &&
351
(buffer_limit_pos < BUFFER_SIZE);
354
//template <class B, WorkingCharacterSet W>
355
//inline void ParsingEngine<B, W>::Advance(int n) {
356
// buffer_rel_pos += n;
357
// if (buffer_rel_pos >= BUFFER_SIZE) {
358
// Parser_Interface<W>::FinalizeBuffer_action();
365
buffer_rel_pos += n; \
366
if (buffer_rel_pos >= BUFFER_SIZE) { \
367
Parser_Interface<W>::FinalizeBuffer_action();\
373
template <class B, WorkingCharacterSet W>
374
void ParsingEngine<B, W>::AdjustBufferEndForIncompleteSequences() {
378
// UTF-8 input: if the bytes just before the current position begin a multibyte
// sequence that is not yet complete, move the position back to that lead byte.
void ParsingEngine<UTF8_Buffer, UTF_8>::AdjustBufferEndForIncompleteSequences() {
379
// Last byte is a lead byte (>= 0xC0): its continuation bytes are missing.
if (*(cur()-1) >= 0xC0) buffer_rel_pos--;
380
// Lead byte of a 3- or 4-byte sequence (>= 0xE0) followed by only one byte.
else if (*(cur()-2) >= 0xE0) buffer_rel_pos -= 2;
381
// Lead byte of a 4-byte sequence (>= 0xF0) followed by only two bytes.
else if (*(cur()-3) >= 0xF0) buffer_rel_pos -= 3;
385
// UTF-16 input: if the last code unit before the current position is a high
// (leading) surrogate, step back one unit so the surrogate pair is not split.
void ParsingEngine<U16_Buffer, UTF_8>::AdjustBufferEndForIncompleteSequences() {
386
// NOTE(review): GetCodeUnitPtr returns unsigned char *, so this dereference loads a
// single byte and the value can never reach 0xD800. A 16-bit load honouring the
// input byte order is needed; the byte order is not visible here -- confirm and fix.
unsigned short last_u16_unit = *(GetCodeUnitPtr(AbsPos()-1));
387
// High surrogates are 0xD800..0xDBFF; 0xDC00 is a low surrogate and must be excluded,
// otherwise a complete pair ending in 0xDC00 would be split.
if ((last_u16_unit >= 0xD800) && (last_u16_unit < 0xDC00)) buffer_rel_pos--;
391
// UTF-8 input (UTF_16 working set): if the bytes just before the current position
// begin a multibyte sequence that is not yet complete, move back to its lead byte.
void ParsingEngine<UTF8_Buffer, UTF_16>::AdjustBufferEndForIncompleteSequences() {
392
// Last byte is a lead byte (>= 0xC0): its continuation bytes are missing.
if (*(cur()-1) >= 0xC0) buffer_rel_pos--;
393
// Lead byte of a 3- or 4-byte sequence (>= 0xE0) followed by only one byte.
else if (*(cur()-2) >= 0xE0) buffer_rel_pos -= 2;
394
// Lead byte of a 4-byte sequence (>= 0xF0) followed by only two bytes.
else if (*(cur()-3) >= 0xF0) buffer_rel_pos -= 3;
398
// UTF-16 input: if the last code unit before the current position is a high
// (leading) surrogate, step back one unit so the surrogate pair is not split.
void ParsingEngine<U16_Buffer, UTF_16>::AdjustBufferEndForIncompleteSequences() {
399
// NOTE(review): GetCodeUnitPtr returns unsigned char *, so this dereference loads a
// single byte and the value can never reach 0xD800. A 16-bit load honouring the
// input byte order is needed; the byte order is not visible here -- confirm and fix.
unsigned short last_u16_unit = *(GetCodeUnitPtr(AbsPos()-1));
400
// High surrogates are 0xD800..0xDBFF; 0xDC00 is a low surrogate and must be excluded,
// otherwise a complete pair ending in 0xDC00 would be split.
if ((last_u16_unit >= 0xD800) && (last_u16_unit < 0xDC00)) buffer_rel_pos--;
405
#ifdef OPTIMIZE_SHORT_SCAN
407
// Inline ScanTo with unrolled first test that should almost always
408
// succeed for short scans.
409
#define ScanTo(item) \
411
unsigned long segment = bitstream_segment_from(buf->item_stream[item], buffer_rel_pos);\
412
if (segment != 0) buffer_rel_pos += cfzl(segment);\
414
buffer_rel_pos = (buffer_rel_pos & -8) + 8*sizeof(unsigned long);\
415
buffer_rel_pos = bitstream_scan(buf->item_stream[item], buffer_rel_pos);\
416
while (buffer_rel_pos >= BUFFER_SIZE) {\
417
buffer_rel_pos = BUFFER_SIZE;\
418
AdjustBufferEndForIncompleteSequences();\
419
Parser_Interface<W>::FinalizeBuffer_action();\
421
buffer_rel_pos = bitstream_scan(buf->item_stream[item], buffer_rel_pos);\
426
// The following version seems cleaner, but measured mispredictions are higher
427
// #define ScanTo(item) \
429
// unsigned long segment = bitstream_segment_from(buf->item_stream[item], buffer_rel_pos);\
430
// while (unlikely (segment == 0)) {\
431
// buffer_rel_pos = (buffer_rel_pos & -8) + 8*sizeof(unsigned long);\
432
// if (buffer_rel_pos >= BUFFER_SIZE) {\
433
// buffer_rel_pos = BUFFER_SIZE;\
434
// AdjustBufferEndForIncompleteSequences();\
435
// Parser_Interface<W>::FinalizeBuffer_action();\
436
// AdvanceBuffers();\
438
// segment = bitstream_segment_from(buf->item_stream[item], buffer_rel_pos);\
440
// buffer_rel_pos += cfzl(segment);\
443
// #define ScanTextTo(item) \
445
// unsigned long segment = bitstream_segment_from(buf->item_stream[item], buffer_rel_pos);\
446
// text_or_markup_start = AbsPos();\
447
// if (segment != 0) buffer_rel_pos += cfzl(segment);\
449
// buffer_rel_pos = (buffer_rel_pos & -8) + 8*sizeof(unsigned long);\
450
// buffer_rel_pos = bitstream_scan(buf->item_stream[item], buffer_rel_pos);\
451
// while (buffer_rel_pos >= BUFFER_SIZE) {\
452
// buffer_rel_pos = BUFFER_SIZE;\
453
// AdjustBufferEndForIncompleteSequences();\
454
// Text_action(GetCodeUnitPtr(text_or_markup_start), LengthFrom(text_or_markup_start), true);\
455
// text_or_markup_start = AbsPos();\
456
// Parser_Interface<W>::FinalizeBuffer_action();\
457
// AdvanceBuffers();\
458
// buffer_rel_pos = bitstream_scan(buf->item_stream[item], buffer_rel_pos);\
463
// OPTIMIZE_SHORT_SCAN variant: scan forward in the given lexical item stream from the
// current position, emitting the text accumulated so far each time the end of the
// buffer is reached.
template <class B, WorkingCharacterSet W>
464
inline void ParsingEngine<B, W>::ScanTextTo(int item) {
465
text_or_markup_start = AbsPos();
466
// Fast path: a nonzero segment lets the position be advanced by cfzl(segment)
// without calling the general bitstream_scan.
unsigned long segment = bitstream_segment_from(buf->item_stream[item], buffer_rel_pos);
467
if (segment != 0) buffer_rel_pos += cfzl(segment);
469
buffer_rel_pos = (buffer_rel_pos & -8) + 8*sizeof(unsigned long);
470
buffer_rel_pos = bitstream_scan(buf->item_stream[item], buffer_rel_pos);
471
while (buffer_rel_pos >= BUFFER_SIZE) {
472
buffer_rel_pos = BUFFER_SIZE;
473
AdjustBufferEndForIncompleteSequences();
474
// Deliver the partial text with the last argument set to true.
Text_action(GetCodeUnitPtr(text_or_markup_start), LengthFrom(text_or_markup_start), true);
475
text_or_markup_start = AbsPos();
476
Parser_Interface<W>::FinalizeBuffer_action();
478
buffer_rel_pos = bitstream_scan(buf->item_stream[item], buffer_rel_pos);
485
#ifndef OPTIMIZE_SHORT_SCAN
487
// #define ScanTo(item) \
489
// buffer_rel_pos = bitstream_scan(buf->item_stream[item], buffer_rel_pos);\
490
// while (buffer_rel_pos >= BUFFER_SIZE) {\
491
// AdjustBufferEndForIncompleteSequences();\
492
// Parser_Interface<W>::FinalizeBuffer_action();\
493
// AdvanceBuffers();\
494
// buffer_rel_pos = bitstream_scan(buf->item_stream[item], buffer_rel_pos);\
499
// Advance buffer_rel_pos to the next position marked in the given lexical item
// stream, repeating the scan whenever the result reaches BUFFER_SIZE.
// NOTE(review): the step that refills the buffer inside the loop is not visible in
// this excerpt.
template <class B, WorkingCharacterSet W>
500
inline void ParsingEngine<B, W>::ScanTo(int item) {
501
buffer_rel_pos = bitstream_scan(buf->item_stream[item], buffer_rel_pos);
502
while (buffer_rel_pos >= BUFFER_SIZE) {
503
AdjustBufferEndForIncompleteSequences();
504
Parser_Interface<W>::FinalizeBuffer_action();
506
buffer_rel_pos = bitstream_scan(buf->item_stream[item], buffer_rel_pos);
510
// Like ScanTo, but treats the scanned span as text: records the start position and,
// each time the scan reaches BUFFER_SIZE, emits the text gathered so far through
// Text_action (last argument true) before continuing.
template <class B, WorkingCharacterSet W>
511
inline void ParsingEngine<B, W>::ScanTextTo(int item) {
512
text_or_markup_start = AbsPos();
513
buffer_rel_pos = bitstream_scan(buf->item_stream[item], buffer_rel_pos);
514
while (buffer_rel_pos >= BUFFER_SIZE) {
515
AdjustBufferEndForIncompleteSequences();
516
Text_action(GetCodeUnitPtr(text_or_markup_start), LengthFrom(text_or_markup_start), true);
517
// The next text segment starts where this one ended.
text_or_markup_start = AbsPos();
518
Parser_Interface<W>::FinalizeBuffer_action();
520
buffer_rel_pos = bitstream_scan(buf->item_stream[item], buffer_rel_pos);
525
// Report a well-formedness constraint violation: print the current absolute input
// position to stdout, then show the constraint error for errCode.
template <class B, WorkingCharacterSet W>
526
void ParsingEngine<B, W>::WF_Error (XML_Constraint errCode) {
527
printf("Error at position %i in input.\n", AbsPos());
528
ShowConstraintError(errCode);
530
// Error_action(GetCodeUnitPtr(text_or_markup_start), LengthFrom(text_or_markup_start));
534
// Report a validity constraint violation: print the current absolute input position
// to stdout, then show the constraint error for errCode.
template <class B, WorkingCharacterSet W>
535
void ParsingEngine<B, W>::Validity_Error (XML_Constraint errCode) {
536
printf("Error at position %i in input.\n", AbsPos());
537
ShowConstraintError(errCode);
539
// Error_action(GetCodeUnitPtr(text_or_markup_start), LengthFrom(text_or_markup_start));
542
// Report a syntax error in nonterminal errNT: print the current absolute input
// position to stdout, then show the syntax error.
template <class B, WorkingCharacterSet W>
543
void ParsingEngine<B, W>::Syntax_Error (XML_NonTerminal errNT) {
544
printf("Error at position %i in input.\n", AbsPos());
545
ShowSyntaxError(errNT);
547
// Error_action(GetCodeUnitPtr(text_or_markup_start), LengthFrom(text_or_markup_start));
551
/* Parse a comment beginning "<!--" */
552
// Skips the opener, advances until a double hyphen is found, then requires "-->".
// On success Comment_action receives the whole comment markup.
template <class B, WorkingCharacterSet W>
553
void ParsingEngine<B, W>::Parse_Comment() {
555
Advance(4); /* Skip "<!--". */
557
while (!at_DoubleHyphen<B::Base>(cur())) {
559
// An error while scanning a comment is a Comment error; this previously reported
// NT_CDSect, apparently copied from the CDATA parser.
Syntax_Error(NT_Comment);
560
Advance(2); /* Skip hyphen-nonhyphen pair */
563
if (at_Comment_End<B::Base>(cur())) {
564
Advance(3); /* Skip "-->". */
565
Comment_action(GetCodeUnitPtr(text_or_markup_start), LengthFrom(text_or_markup_start));
568
Advance(2); /* "--" */
569
Syntax_Error(NT_Comment);
573
/* Parse an end tag beginning "</" */
574
// After the name, a '>' completes the tag and EndTag_action receives the full
// markup; the '>' test appears twice, and if the second test fails a syntax error
// in ETag is reported.
template <class B, WorkingCharacterSet W>
575
inline void ParsingEngine<B, W>::Parse_EndTag() {
576
Advance(2); /* Skip "</". */
577
int nameID = Parse_Name();
578
if (AtChar<B::Base,'>'>(cur())) {
580
EndTag_action(GetCodeUnitPtr(text_or_markup_start), LengthFrom(text_or_markup_start));
584
if (AtChar<B::Base,'>'>(cur())) {
586
EndTag_action(GetCodeUnitPtr(text_or_markup_start), LengthFrom(text_or_markup_start));
588
else Syntax_Error(NT_ETag);
592
/* Parse a CDATA section beginning "<![CDATA". */
593
// Requires '[' after the 8-character prefix, signals the section start, scans the
// content as text up to "]]>", delivers the text, and signals the section end.
template <class B, WorkingCharacterSet W>
594
void ParsingEngine<B, W>::Parse_CDATA() {
595
Advance(8); /* Skip "<![CDATA". */
596
if (!AtChar<B::Base,'['>(cur())) {
597
Syntax_Error(NT_CDStart);
601
CDATA_start_action(GetCodeUnitPtr(text_or_markup_start));
602
// From here on text_or_markup_start marks the start of the CDATA content.
text_or_markup_start = AbsPos();
603
ScanTextTo(CD_End_check);
604
// Keep scanning until the candidate position really is "]]>".
while (!at_CDATA_End<B::Base>(cur())) {
606
Syntax_Error(NT_CDSect);
608
ScanTextTo(CD_End_check);
610
Text_action(GetCodeUnitPtr(text_or_markup_start), LengthFrom(text_or_markup_start), true);
611
Advance(3); /* Skip "]]>". */
612
CDATA_end_action(GetCodeUnitPtr(AbsPos()));
616
// Parse a general entity reference "&name;" and expand it: look the name up in the
// model's global entity table, then parse the entity's content with a nested parser
// (file-based for external entities, in-memory for replacement text) using
// Parse_WF_Content.
template <class B, WorkingCharacterSet W>
617
void ParsingEngine<B, W>::Parse_EntityRef() {
618
Advance(1); // skip "&"
619
int nameID = Parse_Name(); /* Name delimiter */
620
if (!AtChar<B::Base,';'>(cur())) {
621
Syntax_Error(NT_Reference);
625
Reference_action(GetCodeUnitPtr(text_or_markup_start), LengthFrom(text_or_markup_start));
627
// The following code will replace Reference_Action.
628
GEntity_info * this_info;
629
Parser_Interface<W> * entity_parser;
630
int entityID = Parser_Interface<W>::model_info->GlobalGEntityTable[nameID];
632
WF_Error(wfErr_wf_entdeclared);
634
// Entity IDs are 1-based relative to GEntityData.
this_info = Parser_Interface<W>::model_info->GEntityData[entityID-1];
635
if (this_info->is_external){
637
if (Parser_Interface<W>::entity_Info->standalone != Standalone_no)
638
WF_Error(wfErr_NoExternalRefs);
640
// The nested parser shares this parser's model_info.
entity_parser = ParserFactory(this_info->systemLiteral, Parser_Interface<W>::model_info);
641
entity_parser->Parse_WF_Content();
642
if(!entity_parser->at_EOF())
643
Syntax_Error(NT_content);
644
// NOTE(review): ParserFactory returns an object created with `new`; an explicit
// destructor call does not free its storage. Consider `delete entity_parser` once
// the destructor is confirmed to be virtual.
entity_parser->~Parser_Interface<W>();
648
// NOTE(review): this `if` has an empty statement as its body; whether an `else`
// follows is not visible in this excerpt.
if (this_info->is_simple == true);
649
// printf("Entity is %s\n",this_info->ReplacementText);
651
// printf("Not a simple text: %s\n",this_info->ReplacementText);
652
entity_parser = ParserFactory(this_info->ReplacementText, strlen(this_info->ReplacementText),Parser_Interface<W>::entity_Info, Parser_Interface<W>::model_info);
653
entity_parser->Parse_WF_Content();
654
if(!entity_parser->at_EOF())
655
Syntax_Error(NT_content);
656
// NOTE(review): same storage leak as in the external-entity branch.
entity_parser->~Parser_Interface<W>();
664
// Parse and expand a general entity reference inside mixed content: the entity's
// content is parsed by a nested parser with Parse_MixedContent(elems).
template <class B, WorkingCharacterSet W>
665
void ParsingEngine<B, W>::Parse_EntityRef_inMixed(symbol_set_t elems) {
666
Advance(1); // skip "&"
667
int nameID = Parse_Name(); /* Name delimiter */
668
if (!AtChar<B::Base,';'>(cur())) {
669
Syntax_Error(NT_Reference);
673
Reference_action(GetCodeUnitPtr(text_or_markup_start), LengthFrom(text_or_markup_start));
675
// The following code will replace Reference_Action.
676
GEntity_info * this_info;
677
Parser_Interface<W> * entity_parser;
678
int entityID = Parser_Interface<W>::model_info->GlobalGEntityTable[nameID];
680
WF_Error(wfErr_wf_entdeclared);
682
// Entity IDs are 1-based relative to GEntityData.
this_info = Parser_Interface<W>::model_info->GEntityData[entityID-1];
683
if (this_info->is_external){
685
if (Parser_Interface<W>::entity_Info->standalone != Standalone_no)
686
WF_Error(wfErr_NoExternalRefs);
688
entity_parser = ParserFactory(this_info->systemLiteral, Parser_Interface<W>::model_info);
689
entity_parser->Parse_MixedContent(elems);
690
if(!entity_parser->at_EOF())
691
Syntax_Error(NT_content);
692
// NOTE(review): ParserFactory returns an object created with `new`; an explicit
// destructor call does not free its storage.
entity_parser->~Parser_Interface<W>();
696
// NOTE(review): this `if` has an empty statement as its body; whether an `else`
// follows is not visible in this excerpt.
if (this_info->is_simple == true);
697
// printf("Entity is %s\n",this_info->ReplacementText);
699
// printf("Not a simple text: %s\n",this_info->ReplacementText);
700
entity_parser = ParserFactory(this_info->ReplacementText, strlen(this_info->ReplacementText),Parser_Interface<W>::entity_Info, Parser_Interface<W>::model_info);
701
entity_parser->Parse_MixedContent(elems);
702
if(!entity_parser->at_EOF())
703
Syntax_Error(NT_content);
704
// NOTE(review): same storage leak as in the external-entity branch.
entity_parser->~Parser_Interface<W>();
712
// Parse and expand a general entity reference inside ANY content: the entity's
// content is parsed by a nested parser with Parse_AnyContent().
template <class B, WorkingCharacterSet W>
713
void ParsingEngine<B, W>::Parse_EntityRef_inAnyContent() {
714
Advance(1); // skip "&"
715
int nameID = Parse_Name(); /* Name delimiter */
716
if (!AtChar<B::Base,';'>(cur())) {
717
Syntax_Error(NT_Reference);
721
Reference_action(GetCodeUnitPtr(text_or_markup_start), LengthFrom(text_or_markup_start));
723
// The following code will replace Reference_Action.
724
GEntity_info * this_info;
725
Parser_Interface<W> * entity_parser;
726
int entityID = Parser_Interface<W>::model_info->GlobalGEntityTable[nameID];
728
WF_Error(wfErr_wf_entdeclared);
730
// Entity IDs are 1-based relative to GEntityData.
this_info = Parser_Interface<W>::model_info->GEntityData[entityID-1];
731
if (this_info->is_external){
733
if (Parser_Interface<W>::entity_Info->standalone != Standalone_no)
734
WF_Error(wfErr_NoExternalRefs);
736
entity_parser = ParserFactory(this_info->systemLiteral, Parser_Interface<W>::model_info);
737
entity_parser->Parse_AnyContent();
738
if(!entity_parser->at_EOF())
739
Syntax_Error(NT_content);
740
// NOTE(review): ParserFactory returns an object created with `new`; an explicit
// destructor call does not free its storage.
entity_parser->~Parser_Interface<W>();
744
// NOTE(review): this `if` has an empty statement as its body; whether an `else`
// follows is not visible in this excerpt.
if (this_info->is_simple == true);
745
// printf("Entity is %s\n",this_info->ReplacementText);
747
// printf("Not a simple text: %s\n",this_info->ReplacementText);
748
entity_parser = ParserFactory(this_info->ReplacementText, strlen(this_info->ReplacementText),Parser_Interface<W>::entity_Info, Parser_Interface<W>::model_info);
749
entity_parser->Parse_AnyContent();
750
if(!entity_parser->at_EOF())
751
Syntax_Error(NT_content);
752
// NOTE(review): same storage leak as in the external-entity branch.
entity_parser->~Parser_Interface<W>();
760
// Parse and expand a general entity reference during validated content parsing: the
// entity's content is parsed by a nested parser with Parse_ValidContent, which is
// given the content-model expression cre and the caller's cur_state by reference.
template <class B, WorkingCharacterSet W>
761
void ParsingEngine<B, W>::Parse_ValidEntityRef(CM_RegExp * cre, int & cur_state) {
762
Advance(1); // skip "&"
763
int nameID = Parse_Name(); /* Name delimiter */
764
if (!AtChar<B::Base,';'>(cur())) {
765
Syntax_Error(NT_Reference);
769
Reference_action(GetCodeUnitPtr(text_or_markup_start), LengthFrom(text_or_markup_start));
771
// The following code will replace Reference_Action.
772
GEntity_info * this_info;
773
Parser_Interface<W> * entity_parser;
774
int entityID = Parser_Interface<W>::model_info->GlobalGEntityTable[nameID];
776
WF_Error(wfErr_wf_entdeclared);
778
// Entity IDs are 1-based relative to GEntityData.
this_info = Parser_Interface<W>::model_info->GEntityData[entityID-1];
779
if (this_info->is_external){
781
if (Parser_Interface<W>::entity_Info->standalone != Standalone_no)
782
WF_Error(wfErr_NoExternalRefs);
784
entity_parser = ParserFactory(this_info->systemLiteral, Parser_Interface<W>::model_info);
785
entity_parser->Parse_ValidContent(cre, cur_state);
786
if(!entity_parser->at_EOF())
787
Syntax_Error(NT_content);
788
// NOTE(review): ParserFactory returns an object created with `new`; an explicit
// destructor call does not free its storage.
entity_parser->~Parser_Interface<W>();
792
// NOTE(review): this `if` has an empty statement as its body; whether an `else`
// follows is not visible in this excerpt.
if (this_info->is_simple == true);
793
// printf("Entity is %s\n",this_info->ReplacementText);
795
// printf("Not a simple text: %s\n",this_info->ReplacementText);
796
entity_parser = ParserFactory(this_info->ReplacementText, strlen(this_info->ReplacementText),Parser_Interface<W>::entity_Info, Parser_Interface<W>::model_info);
797
entity_parser->Parse_ValidContent(cre, cur_state);
798
if(!entity_parser->at_EOF())
799
Syntax_Error(NT_content);
800
// NOTE(review): same storage leak as in the external-entity branch.
entity_parser->~Parser_Interface<W>();
808
// Parse a character reference "&#...;" (hexadecimal after 'x', otherwise decimal),
// accumulate its value in ch_val, check that the value is a legal character, and
// pass the reference markup to Reference_action.
template <class B, WorkingCharacterSet W>
809
void ParsingEngine<B, W>::Parse_CharRef() {
810
Advance(2); // skip "&#"
812
if (AtChar<B::Base,'x'>(cur())) {
814
while(at_HexDigit<B::Base>(cur())){
815
ch_val = HexVal<B::Base>(cur()[0]) + (ch_val<<4);
816
// Values above 0x10FFFF are rejected.
if (ch_val> 0x10FFFF )
817
WF_Error(wfErr_wf_Legalchar);
822
while(at_Digit<B::Base>(cur())){
823
ch_val = DigitVal<B::Base>(cur()[0]) + ch_val*10;
824
if (ch_val> 0x10FFFF )
825
WF_Error(wfErr_wf_Legalchar);
829
// Reject 0, the range 0xD800..0xDFFF, and 0xFFFE/0xFFFF.
if ((ch_val == 0x0) || ((ch_val | 0x7FF) == 0xDFFF)|| ((ch_val | 0x1) == 0xFFFF))
830
WF_Error(wfErr_wf_Legalchar);
831
// Unless the version is XML_1_1, values below 0x20 other than 0x9, 0xA and 0xD
// are rejected as well.
else if (Parser_Interface<W>::entity_Info->version != XML_1_1)
832
if (((ch_val < 0x20) && (ch_val != 0x9) && (ch_val != 0xD) && (ch_val != 0xA)))
833
WF_Error(wfErr_wf_Legalchar);
835
if (!AtChar<B::Base,';'>(cur())) {
836
Syntax_Error(NT_CharRef);
840
Reference_action(GetCodeUnitPtr(text_or_markup_start), LengthFrom(text_or_markup_start));
844
// Parse a processing instruction "<?target ... ?>": parse the target name, report it
// through PI_Target_action, require whitespace unless the PI ends immediately, and
// pass the complete markup to PI_action.
template <class B, WorkingCharacterSet W>
845
void ParsingEngine<B, W>::Parse_PI (){
847
Advance(2); /* Skip "<?". */
848
int target_start = AbsPos();
849
if (at_XxMmLll<B::Base>(cur())) {
850
nameID = Parse_Name();
851
// A target of exactly three characters that matched at_XxMmLll is a syntax error.
if (AbsPos() - target_start == 3) Syntax_Error(NT_PI);
853
else nameID = Parse_Name();
854
PI_Target_action(GetCodeUnitPtr(target_start), LengthFrom(target_start));
855
if (!at_PI_End<B::Base>(cur())) requireWS();
857
while (!at_PI_End<B::Base>(cur())) {
863
Advance(2); /* Skip "?>". */
864
PI_action(GetCodeUnitPtr(text_or_markup_start), LengthFrom(text_or_markup_start));
867
/* Parse a start or empty element tag. */
868
// Reports the element name, then parses attribute-value pairs until the tag is
// closed by '>' (StartTag_action) or the empty-element delimiter
// (EmptyElement_action). Duplicate attributes within one tag are detected through
// LastAttOccurrence, which holds the position at which each attribute was last seen.
template <class B, WorkingCharacterSet W>
869
inline void ParsingEngine<B, W>::Parse_StartTag (){
872
int att_name_end, att_val_end;
873
unsigned char quoteCh;
875
int nameID = Parse_Name(); /* Name delimiter: WS, "/" or ">" */
876
ElementName_action(GetCodeUnitPtr(text_or_markup_start+1), LengthFrom(text_or_markup_start+1));
877
/* The following test optimizes the most common case of a
878
start tag with no attributes. */
879
if (AtChar<B::Base,'>'>(cur())) {
881
StartTag_action(GetCodeUnitPtr(text_or_markup_start), LengthFrom(text_or_markup_start));
885
if (AtChar<B::Base,'>'>(cur())) {
887
StartTag_action(GetCodeUnitPtr(text_or_markup_start), LengthFrom(text_or_markup_start));
889
else if (at_EmptyElementDelim<B::Base>(cur())) {
891
EmptyElement_action(GetCodeUnitPtr(text_or_markup_start), LengthFrom(text_or_markup_start));
894
/* Must be an attribute-value pair or error. */
895
att_name_start = AbsPos();
896
int att_nameID = Parse_Name();
897
att_name_end = AbsPos();
899
int attID = Parser_Interface<W>::model_info->getOrInsertGlobalAttName(att_nameID);
900
// Grow the table far enough to cover attID. The model is shared with nested entity
// parsers, which may have registered several new attribute names since this table
// was last extended, so a single push_back is not always enough.
if (attID >= (int) LastAttOccurrence.size()) LastAttOccurrence.resize(attID + 1, 0);
902
// An occurrence recorded after the start of this tag means the attribute repeats.
if (LastAttOccurrence[attID] > text_or_markup_start) {
903
WF_Error(wfErr_uniqattspec); /* Duplicate attribute. */
907
LastAttOccurrence[attID] = att_name_start;
908
/* The following optimized tests handle the frequently occurring
909
case that there are no blanks on either side of the equals sign.
910
In many cases, the very first test handles 100% of actual
911
attribute-value pairs encountered. */
912
if (at_EqualsQuote<B::Base>(cur())) Advance(1);
915
if (!AtChar<B::Base,'='>(cur())) {
916
Syntax_Error(NT_STag);
921
if (!AtQuote<B::Base>(cur())) {
922
Syntax_Error(NT_STag);
926
att_val_start = AbsPos()+1;
928
att_val_end = AbsPos()-1;
929
if (at_xmlns<B::Base>(cur()+att_name_start-AbsPos())) {
930
Namespace_action(GetCodeUnitPtr(att_name_start), att_name_end - att_name_start,
931
GetCodeUnitPtr(att_val_start), att_val_end - att_val_start);
934
AttributeValue_action(GetCodeUnitPtr(att_name_start), att_name_end - att_name_start,
935
GetCodeUnitPtr(att_val_start), att_val_end - att_val_start);
937
/* Now check for end or repeat. Avoid whitespace scan if possible.*/
938
if (AtChar<B::Base,'>'>(cur())) {
940
StartTag_action(GetCodeUnitPtr(text_or_markup_start), LengthFrom(text_or_markup_start));
943
else if (at_EmptyElementDelim<B::Base>(cur())) {
945
EmptyElement_action(GetCodeUnitPtr(text_or_markup_start), LengthFrom(text_or_markup_start));
949
if (AtChar<B::Base,'>'>(cur())) {
951
StartTag_action(GetCodeUnitPtr(text_or_markup_start), LengthFrom(text_or_markup_start));
954
else if (at_EmptyElementDelim<B::Base>(cur())) {
956
EmptyElement_action(GetCodeUnitPtr(text_or_markup_start), LengthFrom(text_or_markup_start));
959
else if (AbsPos() == att_val_end + 1) {
960
/* No WS following att value */
961
Syntax_Error(NT_STag);
968
// If any input lies between text_or_markup_start and the current position, deliver
// it through Text_action (forwarding the more flag) and restart the span at the
// current position. Does nothing for an empty span.
template <class B, WorkingCharacterSet W>
969
inline void ParsingEngine<B, W>::text_if_nonnull_action(bool more){
970
if (AbsPos() > text_or_markup_start) {
971
Text_action(GetCodeUnitPtr(text_or_markup_start), LengthFrom(text_or_markup_start), more);
972
text_or_markup_start = AbsPos();
976
// Parse an end tag and check well-formedness: the name parsed here must have the
// same ID as nameID (the ID supplied by the caller), otherwise wfErr_GIMatch is
// raised. A '>' then completes the tag; if the second '>' test fails a syntax error
// in ETag is reported.
template <class B, WorkingCharacterSet W>
977
void ParsingEngine<B, W>::Parse_WF_EndTag(int nameID) {
979
int end_nameID = Parse_Name();
980
if(end_nameID != nameID)
981
WF_Error(wfErr_GIMatch);
982
if (AtChar<B::Base,'>'>(cur())) {
984
Parser_Interface<W>::EndTag_action(GetCodeUnitPtr(text_or_markup_start), LengthFrom(text_or_markup_start));
988
if (AtChar<B::Base,'>'>(cur())) {
990
Parser_Interface<W>::EndTag_action(GetCodeUnitPtr(text_or_markup_start), LengthFrom(text_or_markup_start));
992
else Syntax_Error(NT_ETag);
997
// void ParsingEngine<UTF8_Buffer, UTF_8>::Parse_WF_EndTag(int nameID) {
998
// Advance(2); /* Skip "</". */
1000
// int name_start = AbsPos();
1001
// // ScanTo(NameFollow);
1002
// // int lgth = AbsPos()-name_start;
1004
// #if (not defined(OMISSION)) or ((OMISSION != END_TAG_MATCHING) and (OMISSION != NAME_LOOKUP))
1005
// char * start_elem_name = Parser_Interface<UTF_8>::model_info->symbol_table->Get_UTF8_name(nameID);
1006
// int lgth = Parser_Interface<UTF_8>::model_info->symbol_table->Get_UTF8_lgth(nameID);
1007
// char * end_elem_name = &((char *) x8data)[buffer_rel_pos];
1009
// #ifdef TEMPLATED_SIMD_LIB
1010
// BytePack byte_compare = simd<8>::eq(sisd_load_unaligned((BytePack *) end_elem_name),
1011
// sisd_load_unaligned((BytePack *) start_elem_name));
1013
// #ifndef TEMPLATED_SIMD_LIB
1014
// BytePack byte_compare = simd_eq_8(sisd_load_unaligned((BytePack *) end_elem_name),
1015
// sisd_load_unaligned((BytePack *) start_elem_name));
1018
// int expected_bits = ~(-1 << lgth);
1019
// if ((_mm_movemask_epi8(byte_compare) & expected_bits) != expected_bits) {
1020
// WF_Error(wfErr_GIMatch);
1024
// /* Must compare with bytes beyond the first 16. Set up to
1025
// compare 16 bytes at a time, with the first additional compare
1026
// overlapping with the first byte_compare. */
1027
// int pos = (lgth - 1) % PACKSIZE + 1;
1028
// #ifdef TEMPLATED_SIMD_LIB
1029
// byte_compare = simd_or(byte_compare, simd<8>::eq(sisd_load_unaligned((BytePack *) &end_elem_name[pos]),
1030
// sisd_load_unaligned((BytePack *) &start_elem_name[pos])));
1032
// #ifndef TEMPLATED_SIMD_LIB
1033
// byte_compare = simd_or(byte_compare, simd_eq_8(sisd_load_unaligned((BytePack *) &end_elem_name[pos]),
1034
// sisd_load_unaligned((BytePack *) &start_elem_name[pos])));
1037
// while (pos < lgth) {
1038
// if (_mm_movemask_epi8(byte_compare) != 0xFFFF) {
1039
// WF_Error(wfErr_GIMatch);
1041
// #ifdef TEMPLATED_SIMD_LIB
1042
// byte_compare = simd<8>::eq(sisd_load_unaligned((BytePack *) &end_elem_name[pos]),
1043
// sisd_load_unaligned((BytePack *) &start_elem_name[pos]));
1045
// #ifndef TEMPLATED_SIMD_LIB
1046
// byte_compare = simd_eq_8(sisd_load_unaligned((BytePack *) &end_elem_name[pos]),
1047
// sisd_load_unaligned((BytePack *) &start_elem_name[pos]));
1051
// if (_mm_movemask_epi8(byte_compare) != 0xFFFF) {
1052
// WF_Error(wfErr_GIMatch);
1058
// #if defined(OMISSION) and ((OMISSION == END_TAG_MATCHING) or (OMISSION == NAME_LOOKUP))
1059
// ScanTo(NameFollow);
1061
// // for(int i=0; i<lgth; i++) {
1062
// // if (start_elem_name[i] != end_elem_name[i])
1063
// // WF_Error(wfErr_GIMatch);
1065
// // if (start_elem_name[lgth] != '\0') WF_Error(wfErr_GIMatch);
1067
// if (AtChar<ASCII,'>'>(cur())) {
1069
// Parser_Interface<UTF_8>::EndTag_action(GetCodeUnitPtr(text_or_markup_start), LengthFrom(text_or_markup_start));
1073
// if (AtChar<ASCII,'>'>(cur())) {
1075
// Parser_Interface<UTF_8>::EndTag_action(GetCodeUnitPtr(text_or_markup_start), LengthFrom(text_or_markup_start));
1077
// else Syntax_Error(NT_ETag);
1082
// Well-formedness-only parse of a start tag or empty-element tag.
// is_emptyStartTag: set to true when the tag ends with the empty-element
//   delimiter; the visible code never sets it to false, so callers initialise it.
// Returns int; presumably the name identifier of the element (TODO confirm).
// Reports ElementName_action, then per attribute either Namespace_action or
// AttributeValue_action, and finally StartTag_action or EmptyElement_action.
// Raises Syntax_Error(NT_STag) on malformed input and
// WF_Error(wfErr_uniqattspec) on a repeated attribute name.
/* Parse a valid start or empty element tag. */
1083
template <class B, WorkingCharacterSet W>
1084
int ParsingEngine<B, W>::Parse_WF_StartTag (bool& is_emptyStartTag){
1087
int att_name_end, att_val_end;
1088
unsigned char quoteCh;
1091
// Name lookup can be compiled out through the OMISSION macro.
#if (not defined(OMISSION)) or (OMISSION != NAME_LOOKUP)
1092
int nameID = Parse_Name();
1094
#if (defined(OMISSION)) and (OMISSION == NAME_LOOKUP)
1098
// The element name span starts one code unit after text_or_markup_start.
ElementName_action(GetCodeUnitPtr(text_or_markup_start+1), LengthFrom(text_or_markup_start+1));
1099
/* The following test optimizes the most common case of a
1100
start tag with no attributes. */
1101
if (AtChar<B::Base,'>'>(cur())) {
1103
StartTag_action(GetCodeUnitPtr(text_or_markup_start), LengthFrom(text_or_markup_start));
1107
if (AtChar<B::Base,'>'>(cur())) {
1109
StartTag_action(GetCodeUnitPtr(text_or_markup_start), LengthFrom(text_or_markup_start));
1111
else if (at_EmptyElementDelim<B::Base>(cur())) {
1113
is_emptyStartTag = true;
1114
EmptyElement_action(GetCodeUnitPtr(text_or_markup_start), LengthFrom(text_or_markup_start));
1117
/* Must be an attribute-value pair or error. */
1118
att_name_start = AbsPos();
1119
#if (not defined(OMISSION)) or (OMISSION != NAME_LOOKUP)
1120
int att_nameID = Parse_Name();
1122
#if (defined(OMISSION)) and (OMISSION == NAME_LOOKUP)
1126
att_name_end = AbsPos();
1127
#if (not defined(OMISSION)) or ((OMISSION != ATTRIBUTE_UNIQUENESS) and (OMISSION != NAME_LOOKUP))
1128
int attID = Parser_Interface<W>::model_info->getOrInsertGlobalAttName(att_nameID);
1129
// NOTE(review): a single push_back only covers attID == size(); this assumes
// attribute ids are handed out consecutively - confirm. The comparison also
// mixes a signed int with the unsigned size().
if (attID >= LastAttOccurrence.size()) LastAttOccurrence.push_back(0);
1131
// An occurrence recorded after the start of this tag means the attribute
// name has already been seen inside the same tag.
if (LastAttOccurrence[attID] > text_or_markup_start) {
1132
WF_Error(wfErr_uniqattspec); /* Duplicate attribute. */
1136
LastAttOccurrence[attID] = att_name_start;
1138
/* The following optimized tests handle the frequently occurring
1139
case that there are no blanks on either side of the equals sign.
1140
In many cases, the very first test handles 100% of actual
1141
attribute-value pairs encountered. */
1142
if (at_EqualsQuote<B::Base>(cur())) Advance(1);
1145
if (!AtChar<B::Base,'='>(cur())) {
1146
Syntax_Error(NT_STag);
1151
if (!AtQuote<B::Base>(cur())) {
1152
Syntax_Error(NT_STag);
1156
// The value starts one code unit past the opening quote.
att_val_start = AbsPos()+1;
1158
// Presumably executed after the value and its closing quote were consumed,
// so this is the position of the closing quote (TODO confirm).
att_val_end = AbsPos()-1;
1159
// cur()+att_name_start-AbsPos() points back at the start of the attribute name.
if (at_xmlns<B::Base>(cur()+att_name_start-AbsPos())) {
1160
Namespace_action(GetCodeUnitPtr(att_name_start), att_name_end - att_name_start,
1161
GetCodeUnitPtr(att_val_start), att_val_end - att_val_start);
1164
AttributeValue_action(GetCodeUnitPtr(att_name_start), att_name_end - att_name_start,
1165
GetCodeUnitPtr(att_val_start), att_val_end - att_val_start);
1167
/* Now check for end or repeat. Avoid whitespace scan if possible.*/
1168
if (AtChar<B::Base,'>'>(cur())) {
1170
StartTag_action(GetCodeUnitPtr(text_or_markup_start), LengthFrom(text_or_markup_start));
1173
else if (at_EmptyElementDelim<B::Base>(cur())) {
1175
is_emptyStartTag = true;
1176
EmptyElement_action(GetCodeUnitPtr(text_or_markup_start), LengthFrom(text_or_markup_start));
1180
if (AtChar<B::Base,'>'>(cur())) {
1182
StartTag_action(GetCodeUnitPtr(text_or_markup_start), LengthFrom(text_or_markup_start));
1185
else if (at_EmptyElementDelim<B::Base>(cur())) {
1187
is_emptyStartTag = true;
1188
EmptyElement_action(GetCodeUnitPtr(text_or_markup_start), LengthFrom(text_or_markup_start));
1191
// Position unchanged since the closing quote: nothing separated the value
// from whatever follows it.
else if (AbsPos() == att_val_end + 1) {
1192
/* No WS following att value */
1193
Syntax_Error(NT_STag);
1203
// Parses one element under well-formedness rules: the start tag first and,
// unless the tag was an empty-element tag, the matching end tag.
template <class B, WorkingCharacterSet W>
1204
void ParsingEngine<B, W>::Parse_WF_Element() {
1205
bool is_emptyStartTag = false;
1206
int nameID = Parse_WF_StartTag(is_emptyStartTag);
1208
// NOTE(review): diagnostic trace on stdout; confirm it is guarded by a
// debug macro before shipping.
printf("Parse_Element: nameID = %d, is_emptyStartTag=%i\n",nameID, is_emptyStartTag);
1210
if (!is_emptyStartTag) {
1212
// The end tag must carry the same name identifier as the start tag.
Parse_WF_EndTag(nameID);
1217
// Parses element content under well-formedness rules.
// Scans character data up to the next markup-start character and dispatches
// on the construct found there. Before each construct the pending text is
// flushed through text_if_nonnull_action; the flag is true for character
// references, entity references and CDATA boundaries, and false otherwise.
template <class B, WorkingCharacterSet W>
1218
void ParsingEngine<B, W>::Parse_WF_Content() {
1220
text_or_markup_start = AbsPos();
1221
ScanTextTo(MarkupStart); /* '<', '&', or ']' for ']]>' test */
1222
if (at_ElementTag_Start<B::Base>(cur())) {
1223
text_if_nonnull_action(false);
1226
else if (at_EndTag_Start<B::Base>(cur())) {
1227
text_if_nonnull_action(false);
1230
else if (at_Comment_Start<B::Base>(cur())) {
1231
text_if_nonnull_action(false);
1234
else if (at_CharRef_Start<B::Base>(cur())) {
1235
text_if_nonnull_action(true);
1238
// Any other ampersand starts a general entity reference.
else if (AtChar<B::Base,'&'>(cur())) {
1239
text_if_nonnull_action(true);
1242
else if (at_CDATA_Start<B::Base>(cur())) {
1243
text_if_nonnull_action(true);
1246
else if (at_PI_Start<B::Base>(cur())) {
1247
text_if_nonnull_action(false);
1250
// A CDATA end delimiter inside character data is reported as a syntax error.
else if (at_CDATA_End<B::Base>(cur())) {
1251
text_if_nonnull_action(true);
1253
Syntax_Error(NT_CharData);
1255
else if (at_EOF()) {
1256
text_if_nonnull_action(false);
1259
// An opening angle bracket that matched none of the earlier tests.
else if (AtChar<B::Base,'<'>(cur())) {
1260
Syntax_Error(NT_markupdecl);
1270
#ifndef MARKUP_PASS_CONTROL
1271
#ifndef MARKUP_SORTING
1272
// Top-level content parse, compiled only when neither MARKUP_PASS_CONTROL
// nor MARKUP_SORTING is defined.
// Fires DocumentStart_action, scans character data up to each markup-start
// character and dispatches on the construct found there with the same
// sequence of tests as Parse_WF_Content, then fires DocumentEnd_action.
template <class B, WorkingCharacterSet W>
1273
void ParsingEngine<B, W>::ParseContent() {
1274
Parser_Interface<W>::DocumentStart_action();
1275
bool is_emptyStartTag = false;
1277
text_or_markup_start = AbsPos();
1278
ScanTextTo(MarkupStart); /* '<', '&', or ']' for ']]>' test */
1279
/* if (AtChar<B::Base,'<'>(cur())) {
1280
text_if_nonnull_action();
1281
Parse_Markup<B, W>();
1283
if (at_ElementTag_Start<B::Base>(cur())) {
1284
text_if_nonnull_action(false);
1287
else if (at_EndTag_Start<B::Base>(cur())) {
1288
text_if_nonnull_action(false);
1291
else if (at_Comment_Start<B::Base>(cur())) {
1292
text_if_nonnull_action(false);
1295
else if (at_CharRef_Start<B::Base>(cur())) {
1296
text_if_nonnull_action(true);
1299
else if (AtChar<B::Base,'&'>(cur())) {
1300
text_if_nonnull_action(true);
1303
else if (at_CDATA_Start<B::Base>(cur())) {
1304
text_if_nonnull_action(true);
1307
else if (at_PI_Start<B::Base>(cur())) {
1308
text_if_nonnull_action(false);
1311
else if (at_CDATA_End<B::Base>(cur())) {
1312
text_if_nonnull_action(true);
1314
Syntax_Error(NT_CharData);
1316
else if (at_EOF()) {
1317
text_if_nonnull_action(false);
1320
else if (AtChar<B::Base,'<'>(cur())) {
1321
Syntax_Error(NT_markupdecl);
1328
Parser_Interface<W>::DocumentEnd_action();
1333
// Parses the document type declaration.
// Steps visible here: read the root element name; if an external identifier
// (SYSTEM or PUBLIC) follows, record it in model_info and parse the external
// subset with a separate parser; handle an optional bracketed internal subset;
// on the closing angle bracket build the root content model, which accepts
// exactly the declared root element. Raises Syntax_Error(NT_doctypedecl) on
// malformed input.
template <class B, WorkingCharacterSet W>
1334
void ParsingEngine<B, W>::Parse_DocType (){
1336
int old_abspos, start_pos;
1338
start_pos = AbsPos();
1340
if (at_DOCTYPE_start<B::Base>(cur()))
1343
// printf("No Document definition!\n");
1347
int nameID = Parse_Name();
1349
old_abspos = AbsPos();
1351
if(at_SYSTEM<B::Base>(cur())||at_PUBLIC<B::Base>(cur())){
1352
Parser_Interface<W>::model_info->has_external_DTD = true;
1353
// No progress since the end of the name: the separator presumably required
// before the external identifier is missing (TODO confirm).
if(old_abspos == AbsPos())
1354
Syntax_Error(NT_doctypedecl);
1355
Parse_ExternalID(Parser_Interface<W>::model_info->external_DTD_systemLiteral, Parser_Interface<W>::model_info->external_DTD_pubidLiteral);
1356
// The external subset is parsed by a second parser that shares model_info.
Parser_Interface<W> * entity_parser;
1357
entity_parser = ParserFactory(Parser_Interface<W>::model_info->external_DTD_systemLiteral, Parser_Interface<W>::model_info);
1358
entity_parser->Parse_ExtSubsetDecl();
1359
// NOTE(review): an explicit destructor call does not release storage. If
// ParserFactory allocates with new (the single-argument overload at the top
// of the file does), delete entity_parser is presumably intended - confirm.
entity_parser->~Parser_Interface<W>();
1361
else Parser_Interface<W>::model_info->has_external_DTD = false;
1364
if (AtChar<B::Base,'['>(cur())){
1367
if (AtChar<B::Base,']'>(cur()))
1370
Syntax_Error(NT_doctypedecl);
1374
if (AtChar<B::Base,'>'>(cur())){
1377
// Root model: a sequence holding only the declared root element name.
CRE_Seq * rslt = new CRE_Seq();
1378
rslt->subCMs.push_back(new CRE_Name(nameID));
1379
CM_RegExp * cre = new CM_RegExp();
1380
cre->content_re = rslt;
1382
int id_count = cre->content_re->Set_IDs(0);
1383
cre->content_re->Set_First_Map();
1384
// One transition table entry per id, plus entry 0.
symbol_set_t * transition_map = new symbol_set_t[id_count+1];
1385
cre->content_re->follow_map[0] = id_count+1;
1387
cre->content_re->Set_Follow_Map(transition_map);
1388
// Entry 0 is seeded from the first map of the expression.
transition_map[0] = cre->content_re->first_map;
1389
if (cre->content_re->matches_empty)
1390
transition_map[0][0]=id_count+1;
1392
cre -> transition_map = transition_map;
1394
Parser_Interface<W>::model_info->rootModel = cre;
1396
/* Check for notations that were used, but not defined by the end of the DTD. */
1397
#if (VALIDATION_MODE == ON)
1398
hash_map<int, int >::iterator j;
1399
for (j=Parser_Interface<W>::model_info->GlobalNotationTable.begin(); j!=Parser_Interface<W>::model_info->GlobalNotationTable.end(); j++) {
1400
// -1 is the marker stored by Parse_Notation for a notation that was
// referenced before being declared.
if (j->second == -1)
1401
Validity_Error(vErr_notatn);
1406
Syntax_Error(NT_doctypedecl);
1409
// Parses an external identifier in either the SYSTEM or the PUBLIC form.
// systemLiteral (out): newly allocated copy of the system literal, or NULL.
// pubidLiteral (out): newly allocated copy of the public identifier, or NULL.
// Copies come from copy_string (new char[]); nothing visible here frees them,
// so ownership presumably passes to the caller (TODO confirm).
// Raises Syntax_Error(NT_ExternalID) on malformed input.
template <class B, WorkingCharacterSet W>
1410
void ParsingEngine<B, W>::Parse_ExternalID (char *& systemLiteral, char *& pubidLiteral){
1411
int quot_start, lgth;
1412
if(at_SYSTEM<B::Base>(cur())){
1414
pubidLiteral = NULL;
1416
if (!AtQuote<B::Base>(cur())) Syntax_Error(NT_ExternalID);
1417
// First code unit inside the quotes.
quot_start = AbsPos()+1;
1418
Parse_SystemLiteral (); /* SystemLiteral */
1419
// Presumably the position is now just past the closing quote, so this is
// the literal length without the quotes (TODO confirm).
lgth = AbsPos() - quot_start - 1;
1420
systemLiteral = copy_string(GetCodeUnitPtr(quot_start),lgth);
1422
else if (at_PUBLIC<B::Base>(cur())){
1425
if (!AtQuote<B::Base>(cur())) Syntax_Error(NT_ExternalID);
1426
quot_start = AbsPos()+1;
1427
Parse_PubidLiteral ();/* PubidLiteral */
1428
lgth = AbsPos() - quot_start - 1;
1429
pubidLiteral = copy_string(GetCodeUnitPtr(quot_start),lgth);
1430
systemLiteral = NULL;
1431
// A closing bracket here returns with only the public identifier set.
if (AtChar<B::Base, '>'>(cur())) return;
1433
if (AtQuote<B::Base>(cur())) {
1434
quot_start = AbsPos()+1;
1435
Parse_SystemLiteral ();/* SystemLiteral */
1436
lgth = AbsPos() - quot_start - 1;
1437
systemLiteral = copy_string(GetCodeUnitPtr(quot_start),lgth);
1441
Syntax_Error(NT_ExternalID);
1444
// Parses a quoted system literal: when positioned on a quote character,
// loops until a code unit equal to quoteCh is found.
// Raises Syntax_Error(NT_SystemLiteral) on malformed input.
template <class B, WorkingCharacterSet W>
1445
void ParsingEngine<B, W>::Parse_SystemLiteral (){
1446
unsigned char quoteCh;
1447
if(AtQuote<B::Base>(cur())){
1452
// quoteCh presumably holds the opening quote at this point (TODO confirm).
while (cur()[0] != quoteCh){
1454
Syntax_Error(NT_SystemLiteral);
1461
// Parses a quoted public identifier literal.
// The loop runs while the current code unit passes at_PubidChar and differs
// from quoteCh; if the loop stops anywhere other than on quoteCh,
// Syntax_Error(NT_PubidLiteral) is raised.
template <class B, WorkingCharacterSet W>
1462
void ParsingEngine<B, W>::Parse_PubidLiteral (){
1463
unsigned char quoteCh;
1466
while (at_PubidChar<B::Base>(cur()) && (cur()[0] != quoteCh)) {
1469
// Stopped on something other than the closing quote.
if (cur()[0] != quoteCh){
1470
Syntax_Error(NT_PubidLiteral);
1475
// Parses the internal DTD subset.
// Dispatches on the construct at the current position: parameter entity
// reference, processing instruction, comment, or a markup declaration
// (ELEMENT, ATTLIST, ENTITY, NOTATION). Unknown declarations raise
// Syntax_Error(NT_markupdecl); the closing square bracket presumably ends the
// subset (TODO confirm); anything else raises Syntax_Error(NT_intSubset).
template <class B, WorkingCharacterSet W>
1476
void ParsingEngine<B, W>::Parse_IntSubset (){
1480
text_or_markup_start = AbsPos();
1481
if (AtChar<B::Base,'%'>(cur()))
1482
Parse_PEReference();
1483
else if (at_PI_Start<B::Base>(cur())) {
1486
else if (at_Comment_Start<B::Base>(cur())) {
1489
else if (AtChar<B::Base,'<'>(cur())){
1491
if(AtChar<B::Base,'!'>(cur())){
1493
// Select the declaration parser by keyword.
if (at_ELEMENT<B::Base>(cur()))
1494
Parse_Elementdecl();
1495
else if (at_ATTLIST<B::Base>(cur()))
1496
Parse_AttlistDecl();
1497
else if (at_ENTITY<B::Base>(cur()))
1499
else if (at_NOTATION<B::Base>(cur()))
1500
Parse_Notationdecl();
1502
Syntax_Error(NT_markupdecl);
1506
Syntax_Error(NT_markupdecl);
1508
else if (AtChar<B::Base,']'>(cur())){
1512
Syntax_Error(NT_intSubset);
1517
// Parses a parameter entity reference (incomplete implementation: a notice
// is written to stderr on every call).
// After the name and the terminating semicolon, PEReference_action is fired
// and the entity is looked up in GlobalPEntityTable. For an external entity a
// second parser is created on its system literal and run over the content.
// Raises WF_Error(wfErr_wf_entdeclared), presumably for an undeclared entity
// (TODO confirm), and Syntax_Error(NT_PEReference) on malformed input.
template <class B, WorkingCharacterSet W>
1518
void ParsingEngine<B, W>::Parse_PEReference (){
1520
Advance(1); /* Skip "%". */
1521
fprintf(stderr,"Parameter Reference has not been completed yet.\n");
1523
int nameID = Parse_Name();
1524
if (AtChar<B::Base,';'>(cur())) {
1526
PEReference_action(GetCodeUnitPtr(text_or_markup_start), LengthFrom(text_or_markup_start));
1527
PEntity_info * this_info;
1528
Parser_Interface<W> * entity_parser;
1529
int entityID = Parser_Interface<W>::model_info->GlobalPEntityTable[nameID];
1531
WF_Error(wfErr_wf_entdeclared);
1533
// Entity ids are 1-based; PEntityData is indexed from 0.
this_info = Parser_Interface<W>::model_info->PEntityData[entityID-1];
1534
if (this_info->is_external){
1536
// if (Parser_Interface<W>::entity_Info->standalone != Standalone_no)
1537
// WF_Error(wfErr_NoExternalRefs);
1539
entity_parser = ParserFactory(this_info->systemLiteral, Parser_Interface<W>::model_info);
1540
entity_parser->Parse_WF_Content();
1541
// The nested parser must have consumed the whole entity.
if(!entity_parser->at_EOF())
1542
Syntax_Error(NT_content);
1543
// NOTE(review): an explicit destructor call does not release storage. If
// ParserFactory allocates with new, delete entity_parser is presumably
// intended - confirm.
entity_parser->~Parser_Interface<W>();
1551
Syntax_Error(NT_PEReference);
1555
// Parses an element type declaration and stores the resulting content model
// in model_info->ContentModelData under the name identifier of the element.
// The content specification is one of: EMPTY, ANY, mixed content (starts
// with PCDATA), or a children expression compiled into a CM_RegExp with a
// transition table. Raises Syntax_Error(NT_elementdecl) on malformed input.
template <class B, WorkingCharacterSet W>
1556
void ParsingEngine<B, W>::Parse_Elementdecl (){
1558
// NOTE(review): the comment names a 9 character string while 7 is the length
// of the keyword alone; presumably the two leading characters were consumed
// by the caller - confirm.
Advance(7); /* Skip "<!ELEMENT". */
1561
int nameID = Parse_Name();
1562
// NOTE(review): elemID is not used on any line visible here; the models
// below are keyed by nameID.
int elemID = Parser_Interface<W>::model_info->getOrInsertGlobalElement(nameID);
1566
/* Start parsing "contentspec"*/
1567
if (at_EMPTY<B::Base>(cur())) {
1569
cm = new CM_Empty();
1570
Parser_Interface<W>::model_info->ContentModelData[nameID] = cm;
1572
else if (at_ANY<B::Base>(cur())) {
1575
Parser_Interface<W>::model_info->ContentModelData[nameID] = cm;
1578
if (AtChar<B::Base,'('>(cur()))
1581
if (at_PCDATA<B::Base>(cur())){
1582
cm = Parse_RemainingMixed();
1583
Parser_Interface<W>::model_info->ContentModelData[nameID] = cm;
1587
// Children content: compile the expression into a transition table.
CM_RegExp * cre = new CM_RegExp;
1588
cre->content_re = Parse_RemainingChildren();
1590
int id_count = cre->content_re->Set_IDs(0);
1591
cre->content_re->Set_First_Map();
1592
// One transition table entry per id, plus entry 0.
symbol_set_t * transition_map = new symbol_set_t[id_count+1];
1593
cre->content_re->follow_map[0] = id_count+1;
1595
cre->content_re->Set_Follow_Map(transition_map);
1596
// Entry 0 is seeded from the first map of the expression.
transition_map[0] = cre->content_re->first_map;
1598
if (cre->content_re->matches_empty)
1599
transition_map[0][0]=id_count+1;
1601
cre -> transition_map = transition_map;
1603
Parser_Interface<W>::model_info->ContentModelData[nameID] = cre;
1609
if (AtChar<B::Base,'>'>(cur())) {
1613
Syntax_Error(NT_elementdecl);
1615
// Parses the remainder of a mixed content declaration, starting at the
// PCDATA keyword, and returns a newly allocated CM_Mixed.
// Each element name found after a vertical bar is stored in r->elements with
// an increasing counter value. Raises Syntax_Error(NT_Mixed) on malformed
// input.
template <class B, WorkingCharacterSet W>
1616
ContentModel * ParsingEngine<B, W>::Parse_RemainingMixed (){
1617
CM_Mixed * r = new CM_Mixed();
1618
Advance(7); /* Skip "#PCDATA". */
1620
// Form with no element names: closing parenthesis, optionally starred.
if (AtChar<B::Base,')'>(cur())){
1621
if (AtChar<B::Base,'*'>(cur())) {
1631
while (AtChar<B::Base,'|'>(cur())){
1634
int nameID = Parse_Name();
1635
r->elements[nameID] = ++k;
1638
// A list with element names must end with a closing parenthesis and star.
if (at_Para_star<B::Base>(cur())) Advance(2);
1640
Syntax_Error(NT_Mixed);
1648
// Parses the remainder of a parenthesised children expression and returns
// the corresponding Content_RE tree.
// After the first content particle the separator decides the node kind: a
// vertical bar builds a CRE_Choice, a comma builds a CRE_Seq, and a closing
// parenthesis leaves the single particle. A trailing occurrence indicator
// wraps the result (CRE_Star and CRE_Plus are visible; the wrapper for the
// question mark sits on a line not visible here).
// Raises Syntax_Error(NT_children) on malformed input.
template <class B, WorkingCharacterSet W>
1649
Content_RE * ParsingEngine<B, W>::Parse_RemainingChildren (){
1650
Content_RE * c1 = Parse_Cp();
1651
Content_RE * r = c1;
1653
if(AtChar<B::Base,'|'>(cur())){
1654
CRE_Choice * rslt = new CRE_Choice;
1655
rslt->subCMs.push_back(c1);
1658
rslt->subCMs.push_back(Parse_Cp());
1660
// Further alternatives until the closing parenthesis.
while(!AtChar<B::Base,')'>(cur())){
1661
if(AtChar<B::Base,'|'>(cur()))
1664
Syntax_Error(NT_children);
1666
rslt->subCMs.push_back(Parse_Cp());
1673
else if(AtChar<B::Base,','>(cur())){
1674
CRE_Seq * rslt = new CRE_Seq;
1675
rslt->subCMs.push_back(c1);
1678
rslt->subCMs.push_back(Parse_Cp());
1680
// Further sequence members until the closing parenthesis.
while(!AtChar<B::Base,')'>(cur())){
1681
if(AtChar<B::Base,','>(cur()))
1684
Syntax_Error(NT_children);
1686
rslt->subCMs.push_back(Parse_Cp());
1693
else if(AtChar<B::Base,')'>(cur())){
1697
Syntax_Error(NT_children);
1699
// Optional occurrence indicator applied to the whole group.
if (AtChar<B::Base,'?'>(cur())) {
1703
else if (AtChar<B::Base,'*'>(cur())) {
1705
r = new CRE_Star(r);
1707
else if (AtChar<B::Base,'+'>(cur())) {
1709
r = new CRE_Plus(r);
1715
// Parses one content particle: either a parenthesised group or an element
// name, the name optionally followed by an occurrence indicator.
// For a name, the CRE_Name is returned wrapped in CRE_Opt, CRE_Star or
// CRE_Plus according to the indicator.
template <class B, WorkingCharacterSet W>
1716
Content_RE * ParsingEngine<B, W>::Parse_Cp (){
1717
if (AtChar<B::Base,'('>(cur())){
1720
// NOTE(review): the result of the nested parse is not captured on this
// line; confirm the group tree is not dropped.
Parse_RemainingChildren();
1723
int nameID = Parse_Name();
1724
CRE_Name * r = new CRE_Name(nameID);
1726
if (AtChar<B::Base,'?'>(cur())) {
1728
return new CRE_Opt(r);
1730
else if (AtChar<B::Base,'*'>(cur())) {
1732
return new CRE_Star(r);
1734
else if (AtChar<B::Base,'+'>(cur())) {
1736
return new CRE_Plus(r);
1742
// Parses an attribute list declaration.
// For every attribute definition up to the closing angle bracket an ATT_info
// record is allocated, filled with the global attribute id, the attribute
// type and the default declaration, and appended to
// model_info->ElementAttributeData for the element.
// Raises Syntax_Error(NT_AttlistDecl) on malformed input.
template <class B, WorkingCharacterSet W>
1743
void ParsingEngine<B, W>::Parse_AttlistDecl (){
1753
Advance(7); /* Skip "ATTLIST. */
1756
int nameID = Parse_Name();
1757
elemID = Parser_Interface<W>::model_info->getOrInsertGlobalElement(nameID);
1759
old_abspos = AbsPos();
1761
while(!AtChar<B::Base,'>'>(cur())) {
1762
// No progress since old_abspos was recorded: presumably the required
// separator is missing (TODO confirm).
if(old_abspos == AbsPos())
1763
Syntax_Error(NT_AttlistDecl);
1765
int att_nameID = Parse_Name();
1767
attID = Parser_Interface<W>::model_info->getOrInsertGlobalAttName(att_nameID);
1768
// NOTE(review): a single push_back only covers attID == size(); this assumes
// attribute ids are handed out consecutively - confirm.
if (attID >= LastAttOccurrence.size()) LastAttOccurrence.push_back(0);
1769
ATT_info * this_info = new ATT_info;
1770
this_info->globalATT_id = attID;
1772
// Attribute type keyword dispatch.
if (at_CDATA<B::Base>(cur())){
1774
this_info->attType = CDATA_att;
1776
else if(at_ID<B::Base>(cur())){
1778
this_info->attType = ID_att;
1780
/* Make sure to check IDREFS before IDREF*/
1781
else if(at_IDREFS<B::Base>(cur())){
1783
this_info->attType = IDREFS_att;
1785
else if(at_IDREF<B::Base>(cur())){
1787
this_info->attType = IDREF_att;
1789
// NOTE(review): ENTITY is tested before ENTITIES, unlike the IDREFS and
// NMTOKENS ordering. Whether ENTITIES can still be reached depends on how
// at_ENTITY matches - confirm. The at_ID test above may similarly shadow
// IDREF and IDREFS.
else if(at_ENTITY<B::Base>(cur())){
1791
this_info->attType = ENTITY_att;
1793
else if(at_ENTITIES<B::Base>(cur())){
1795
this_info->attType = ENTITIES_att;
1797
/* Make sure to check NMTOKENS before NMTOKEN*/
1798
else if(at_NMTOKENS<B::Base>(cur())){
1800
this_info->attType = NMTOKENS_att;
1802
else if(at_NMTOKEN<B::Base>(cur())){
1804
this_info->attType = NMTOKEN_att;
1806
else if(at_NOTATION<B::Base>(cur())){ /* NotationType = 'NOTATION' S Enumeration
1807
when Nmtoken = Name */
1810
Parse_Notation(this_info);
1811
this_info->attType = NOTATION_att;
1813
else if(AtChar<B::Base,'('>(cur())){
1814
Parse_Enumeration(this_info);
1815
this_info->attType = enumeration_att;
1818
Syntax_Error(NT_AttlistDecl);
1820
Parse_DefaultDecl(this_info);
1823
Parser_Interface<W>::model_info->ElementAttributeData[elemID].push_back(this_info);
1829
// Parses the parenthesised list of notation names of a NOTATION attribute
// type. A name whose GlobalNotationTable entry is 0 is marked with -1.
// Parse_Notationdecl reads -1 as used but not yet defined.
// this_info: attribute record being built; not modified on any line visible
// here. Raises Syntax_Error(NT_NotationType) on malformed input.
template <class B, WorkingCharacterSet W>
1830
void ParsingEngine<B, W>::Parse_Notation (ATT_info * this_info){
1832
if(AtChar<B::Base,'('>(cur()))
1835
Syntax_Error(NT_NotationType);
1838
int notn_nameID = Parse_Name();
1840
/*Notation name is not in the global table!*/
1841
if(Parser_Interface<W>::model_info->GlobalNotationTable[notn_nameID]==0)
1842
Parser_Interface<W>::model_info->GlobalNotationTable[notn_nameID] = -1;
1845
// Remaining alternatives are separated by vertical bars.
while(AtChar<B::Base,'|'>(cur())){
1848
notn_nameID = Parse_Name();
1850
if(Parser_Interface<W>::model_info->GlobalNotationTable[notn_nameID]==0)
1851
// Validity_Error(vErr_notatn);
1852
Parser_Interface<W>::model_info->GlobalNotationTable[notn_nameID] = -1;
1856
if (AtChar<B::Base,')'>(cur()))
1859
Syntax_Error(NT_NotationType);
1862
// Parses a parenthesised enumeration of name tokens for an attribute type.
// Each token id is stored in this_info->enumValues with an increasing
// counter value. A repeated token leads to
// Validity_Error(vErr_NoDuplicateTokens) when StrictWellFormedness is false.
// Raises Syntax_Error(NT_Enumeration) on malformed input.
template <class B, WorkingCharacterSet W>
1863
void ParsingEngine<B, W>::Parse_Enumeration (ATT_info * this_info){
1866
if(AtChar<B::Base,'('>(cur()))
1869
Syntax_Error(NT_Enumeration);
1872
int nmtokenID = Parse_Nmtoken();
1874
this_info->enumValues[nmtokenID]=++(enumCount);
1877
while(AtChar<B::Base,'|'>(cur())){
1880
int nmtokenID = Parse_Nmtoken();
1882
// Existing entry for this token; presumably 0 means not seen yet (TODO confirm).
int enumID = this_info->enumValues[nmtokenID];
1884
this_info->enumValues[nmtokenID]=++(enumCount);
1887
else if(!StrictWellFormedness){
1888
Validity_Error(vErr_NoDuplicateTokens);
1892
if (AtChar<B::Base,')'>(cur()))
1895
Syntax_Error(NT_Enumeration);
1898
// Parses the default declaration of an attribute definition and records it
// in this_info: defaultKind is REQUIRED_att, IMPLIED_att, FIXED_att or
// DEFAULT_att; for the last two the quoted default value is copied into a
// newly allocated, NUL-terminated buffer (defaultValue, defaultValueLgth).
// Raises Syntax_Error(NT_DefaultDecl) on malformed input.
template <class B, WorkingCharacterSet W>
1899
void ParsingEngine<B, W>::Parse_DefaultDecl (ATT_info * this_info){
1900
if(at_REQUIRED<B::Base>(cur())){
1902
this_info->defaultKind = REQUIRED_att;
1904
else if(at_IMPLIED<B::Base>(cur())){
1906
this_info->defaultKind = IMPLIED_att;
1909
if(at_FIXED<B::Base>(cur())){
1912
this_info->defaultKind = FIXED_att;
1914
else this_info->defaultKind = DEFAULT_att;
1915
if(AtQuote<B::Base>(cur())){
1916
// First code unit inside the quotes.
int quot_start = AbsPos()+1;
1918
/* need to normalize */
1919
// Presumably the position is now just past the closing quote (TODO confirm).
this_info->defaultValueLgth = AbsPos() - quot_start - 1;
1921
// One extra byte for the terminating NUL written below.
this_info->defaultValue = new unsigned char[this_info->defaultValueLgth+1];
1922
memcpy(this_info->defaultValue, GetCodeUnitPtr(quot_start),this_info->defaultValueLgth);
1923
this_info->defaultValue[this_info->defaultValueLgth] = '\0';
1926
Syntax_Error(NT_DefaultDecl);
1930
// Parses an entity declaration, for a parameter entity (percent sign after
// the keyword) or for a general entity.
// A new entity gets the next id from the matching global counter. Its info
// record holds either an internal value or an external identifier, and for
// general entities optionally an NDATA notation name. The record is appended
// to PEntityData or GEntityData.
// Raises Syntax_Error(NT_EntityDecl) on malformed input; an already declared
// entity only produces a warning on stdout.
template <class B, WorkingCharacterSet W>
1931
void ParsingEngine<B, W>::Parse_Entitydecl (){
1939
Advance(6); /* Skip "ENTITY. */
1942
// Parameter entity branch.
if (AtChar<B::Base,'%'>(cur())){
1946
int nameID = Parse_Name();
1947
PEntity_info * this_info = new PEntity_info;
1948
int entityID = Parser_Interface<W>::model_info->GlobalPEntityTable[nameID];
1950
// Allocate the next parameter entity id (ids start at 1).
Parser_Interface<W>::model_info->GlobalPEntityTable[nameID]=++(Parser_Interface<W>::model_info->globalPEntityCount);
1951
entityID = Parser_Interface<W>::model_info->globalPEntityCount;
1952
this_info->globalPEntity_id = entityID;
1955
printf("Warning: Entity definition already exist!\n");
1958
if(AtQuote<B::Base>(cur())){
1959
Parse_PEntityValue(this_info);
1960
this_info->is_external = false;
1963
Parse_ExternalID(this_info->systemLiteral, this_info->pubidLiteral);
1964
this_info->is_external = true;
1965
// An external entity needs a system literal.
if (this_info->systemLiteral == NULL) Syntax_Error(NT_EntityDecl);
1967
Parser_Interface<W>::model_info->PEntityData.push_back(this_info);
1970
// General entity branch.
int nameID = Parse_Name();
1972
GEntity_info * this_info = new GEntity_info();
1973
int entityID = Parser_Interface<W>::model_info->GlobalGEntityTable[nameID];
1975
Parser_Interface<W>::model_info->GlobalGEntityTable[nameID]=++(Parser_Interface<W>::model_info->globalGEntityCount);
1976
entityID = Parser_Interface<W>::model_info->globalGEntityCount;
1977
this_info->globalGEntity_id = entityID;
1980
printf("Warning: Entity definition already exists!\n");
1984
if(AtQuote<B::Base>(cur())){
1985
Parse_GEntityValue(this_info);
1986
this_info->is_external = false;
1989
Parse_ExternalID(this_info->systemLiteral, this_info->pubidLiteral);
1990
this_info->is_external = true;
1991
if (this_info->systemLiteral == NULL) Syntax_Error(NT_EntityDecl);
1992
old_abspos = AbsPos();
1994
if(at_NDATA<B::Base>(cur())){
1995
// No progress since the external identifier: presumably the required
// separator before NDATA is missing (TODO confirm).
if(old_abspos == AbsPos())
1996
Syntax_Error(NT_EntityDecl);
2000
name_start = AbsPos();
2001
int nameID = Parse_Name();
2002
lgth = AbsPos() - name_start;
2003
// Keep a private copy of the notation name.
this_info->NDataName = copy_string(GetCodeUnitPtr(name_start),lgth);
2006
Parser_Interface<W>::model_info->GEntityData.push_back(this_info);
2009
if (AtChar<B::Base,'>'>(cur())){
2013
Syntax_Error(NT_EntityDecl);
2016
// Parses a notation declaration.
// A name that is new or was only referenced so far gets the next notation
// id; a name that already has a positive id raises
// Validity_Error(vErr_NoDuplicateTokens). The external identifier is parsed
// into a newly allocated Notation_info.
// Raises Syntax_Error(NT_NotationDecl) on malformed input.
template <class B, WorkingCharacterSet W>
2017
void ParsingEngine<B, W>::Parse_Notationdecl (){
2020
Advance(8); /* Skip "NOTATION. */
2023
int nameID = Parse_Name();
2025
int notationID = Parser_Interface<W>::model_info->GlobalNotationTable[nameID];
2026
/* notationID == -1: used but not yet defined; == 0: new, > 0 prev. defined */
2027
if(notationID <= 0){
2028
Parser_Interface<W>::model_info->GlobalNotationTable[nameID]=++(Parser_Interface<W>::model_info->globalNotationCount);
2029
notationID = Parser_Interface<W>::model_info->globalNotationCount;
2031
else /*Duplicate notation name!*/
2032
Validity_Error(vErr_NoDuplicateTokens);
2033
// NOTE(review): no line visible here stores or frees this_info; confirm it
// is kept somewhere.
Notation_info * this_info = new Notation_info;
2035
Parse_ExternalID(this_info->systemLiteral, this_info->pubidLiteral);
2037
if (AtChar<B::Base,'>'>(cur())) {
2041
Syntax_Error(NT_NotationDecl);
2044
// Records the position and later tests whether it is unchanged. Going by the
// name, this presumably reports an error when a whitespace scan consumed
// nothing (TODO confirm).
template <class B, WorkingCharacterSet W>
2045
void ParsingEngine<B, W>::requireWS(){
2047
int old_abspos = AbsPos();
2049
// Same position as before: nothing was consumed in between.
if(old_abspos == AbsPos())
2053
// Parses a quoted attribute value, starting on the opening quote.
// Loops until the quote character that opened the value is seen again,
// handling character references, entity references and the other quote kind
// along the way. Any other stop character raises
// WF_Error(wfErr_CleanAttrVals).
template <class B, WorkingCharacterSet W>
2054
void ParsingEngine<B, W>::Parse_AttValue(){
2056
// Remember which quote character opened the value.
int quoteCh = cur()[0];
2057
Advance(1); /* Skip " or ' */
2060
while (cur()[0] != quoteCh){
2061
if (at_CharRef_Start<B::Base>(cur())){
2065
else if (AtChar<B::Base,'&'>(cur())){
2069
// A quote that differs from quoteCh, since the loop condition excluded
// quoteCh itself.
else if (AtQuote<B::Base>(cur())) {
2073
else /* if (AtChar<B::Base,'<'>(cur())) */
2074
WF_Error(wfErr_CleanAttrVals);
2079
// Parses the quoted value of a general entity declaration and stores the
// accumulated replacement text in this_info->ReplacementText.
// this_info->is_simple starts as true and is cleared when a left angle
// bracket or an ampersand is found in the value.
// Raises Syntax_Error(NT_EntityValue) at end of input.
template <class B, WorkingCharacterSet W>
2080
void ParsingEngine<B, W>::Parse_GEntityValue(GEntity_info * this_info){
2082
// Remember which quote character opened the value.
int quoteCh = cur()[0];
2083
Advance(1); /* Skip " or ' */
2084
this_info->is_simple = true;
2085
int quot_start = AbsPos();
2088
// copy_string allocates exactly length + 1 bytes.
replText = copy_string(GetCodeUnitPtr(quot_start),AbsPos()-quot_start);
2089
while (cur()[0] != quoteCh){
2090
if (at_CharRef_Start<B::Base>(cur())){
2091
// NOTE(review): strcat appends in place to a buffer that copy_string sized
// exactly, so any non-empty replacement writes past the allocation.
// cat_string, used below, looks like the intended call - confirm.
strcat (replText,Replace_CharRef());
2092
quot_start = AbsPos();
2095
else if (AtQuote<B::Base>(cur())) {
2096
quot_start = AbsPos();
2100
else if (at_EOF()) {
2101
Syntax_Error(NT_EntityValue);
2103
else { /* '<' or '&' found */
2104
quot_start = AbsPos();
2107
this_info->is_simple = false;
2109
// cat_string returns a new buffer; the old replText is not freed on any
// line visible here.
replText = cat_string (replText,(char *)GetCodeUnitPtr(quot_start), strlen(replText), AbsPos()-quot_start);
2111
this_info->ReplacementText = replText;
2115
// Resolves a general entity reference inside an entity value and returns the
// stored replacement text of the referenced entity. The returned pointer is
// the ReplacementText buffer held in GEntityData, not a copy.
// is_simple: presumably cleared when the referenced entity is not simple
//   (TODO confirm; the assignment is not visible here).
// Raises Syntax_Error(NT_EntityValue) on malformed input and
// WF_Error(wfErr_wf_entdeclared), presumably for an undeclared entity.
template <class B, WorkingCharacterSet W>
2116
char * ParsingEngine<B, W>::Replace_EntityRef(bool& is_simple){
2118
int nameID = Parse_Name();
2119
if (AtChar<B::Base,';'>(cur()))
2122
Syntax_Error(NT_EntityValue);
2123
int entityID = Parser_Interface<W>::model_info->GlobalGEntityTable[nameID];
2125
WF_Error(wfErr_wf_entdeclared);
2127
// Entity ids are 1-based; GEntityData is indexed from 0.
if (Parser_Interface<W>::model_info->GEntityData[entityID-1]->is_simple == false)
2129
return Parser_Interface<W>::model_info->GEntityData[entityID-1]->ReplacementText;
2134
// Placeholder for parsing the value of a parameter entity declaration.
// The only visible action is a not-completed notice written to stderr.
// this_info: record of the parameter entity being declared.
template <class B, WorkingCharacterSet W>
2135
void ParsingEngine<B, W>::Parse_PEntityValue(PEntity_info * this_info){
2136
fprintf(stderr,"parsing of parameter entity value has not been completed yet.\n");
2140
// Placeholder for replacing a character reference with its text.
// The only visible action is a not-completed notice written to stderr; the
// returned value is not visible here.
template <class B, WorkingCharacterSet W>
2141
char * ParsingEngine<B, W>::Replace_CharRef(){
2143
fprintf(stderr,"Replacement of Character Reference has not been completed yet.\n");
2147
// Parses the document prolog.
// Comments and processing instructions are consumed until the DOCTYPE start,
// and again after it. Prolog_action is called with the span that begins at
// the position recorded on entry.
template <class B, WorkingCharacterSet W>
2148
void ParsingEngine<B, W>::Parse_Prolog(){
2150
// Start of the prolog span reported by Prolog_action.
int old_pos = AbsPos();
2151
while (!at_DOCTYPE_start<B::Base>(cur())) {
2152
text_or_markup_start = AbsPos();
2153
if (at_Comment_Start<B::Base>(cur()))
2155
else if (at_PI_Start<B::Base>(cur()))
2158
Prolog_action(GetCodeUnitPtr(old_pos), LengthFrom(old_pos));
2165
// Further comments and processing instructions.
while(at_Comment_Start<B::Base>(cur()) || at_PI_Start<B::Base>(cur()) ){
2166
text_or_markup_start = AbsPos();
2167
if (at_Comment_Start<B::Base>(cur()))
2173
Prolog_action(GetCodeUnitPtr(old_pos), LengthFrom(old_pos));
2176
// Parses the declarations of an external DTD subset.
// Handles conditional sections (INCLUDE recurses into this function, IGNORE
// skips text while tracking the nesting depth), parameter entity references,
// processing instructions, comments and markup declarations.
// ExtSubsetDecl_action is called with the span that begins at the position
// recorded on entry. Malformed input raises the Syntax_Error that matches
// the construct being parsed.
template <class B, WorkingCharacterSet W>
2177
void ParsingEngine<B, W>::Parse_ExtSubsetDecl() {
2179
int start_pos=AbsPos();
2181
if(at_condSect_start<B::Base>(cur())){
2184
if (at_INCLUDE<B::Base>(cur())){
2187
if(AtChar<B::Base,'['>(cur())){
2189
// An included section holds ordinary external subset declarations.
Parse_ExtSubsetDecl();
2190
// The section must be closed by the same delimiter that ends CDATA.
if(at_CDATA_End<B::Base>(cur()))
2192
else Syntax_Error(NT_includeSect);
2194
else Syntax_Error(NT_includeSect);
2196
else if (at_IGNORE<B::Base>(cur())){
2199
if(AtChar<B::Base,'['>(cur())){
2200
// Nesting depth of conditional sections inside the ignored region.
int section_depth=1;
2203
ScanTextTo(MarkupStart);
2204
if(at_condSect_start<B::Base>(cur())){
2208
else if(at_CDATA_End<B::Base>(cur())){
2214
// Depth zero: the outermost ignored section has been closed.
if(section_depth==0) return;
2216
Syntax_Error(NT_ignoreSectContents);
2218
else Syntax_Error(NT_ignoreSect);
2220
else Syntax_Error(NT_conditionalSect);
2222
else if (AtChar<B::Base,'%'>(cur()))
2223
Parse_PEReference();
2224
else if (at_PI_Start<B::Base>(cur())) {
2227
else if (at_Comment_Start<B::Base>(cur())) {
2230
else if (AtChar<B::Base,'<'>(cur())){
2233
if(AtChar<B::Base,'!'>(cur())){
2235
// Select the declaration parser by keyword.
if(at_ELEMENT<B::Base>(cur()))
2236
Parse_Elementdecl();
2237
else if(at_ATTLIST<B::Base>(cur()))
2238
Parse_AttlistDecl();
2239
else if(at_ENTITY<B::Base>(cur()))
2241
else if(at_NOTATION<B::Base>(cur()))
2242
Parse_Notationdecl();
2244
Syntax_Error(NT_markupdecl);
2248
Syntax_Error(NT_markupdecl);
2251
Syntax_Error(NT_extSubsetDecl);
2254
ExtSubsetDecl_action(GetCodeUnitPtr(start_pos), LengthFrom(start_pos));
2257
// Validating counterpart of Parse_WF_StartTag.
// In addition to the well-formedness checks, the element name is looked up
// in GlobalElementTable; Validity_Error(vErr_elementvalid) is raised,
// presumably when the element was never declared (TODO confirm).
// is_emptyStartTag: set to true when the tag ends with the empty-element
//   delimiter; the visible code never sets it to false, so callers initialise it.
// Returns int; presumably the name identifier of the element (TODO confirm).
/* Parse a valid start or empty element tag. */
2258
template <class B, WorkingCharacterSet W>
2259
inline int ParsingEngine<B, W>::Parse_ValidStartTag (bool& is_emptyStartTag){
2262
int att_name_end, att_val_end;
2263
unsigned char quoteCh;
2266
int nameID = Parse_Name();
2267
int elemID = Parser_Interface<W>::model_info->GlobalElementTable[nameID];
2269
Validity_Error(vErr_elementvalid);
2271
// The element name span starts one code unit after text_or_markup_start.
ElementName_action(GetCodeUnitPtr(text_or_markup_start+1), LengthFrom(text_or_markup_start+1));
2272
/* The following test optimizes the most common case of a
2273
start tag with no attributes. */
2274
if (AtChar<B::Base,'>'>(cur())) {
2276
StartTag_action(GetCodeUnitPtr(text_or_markup_start), LengthFrom(text_or_markup_start));
2280
if (AtChar<B::Base,'>'>(cur())) {
2282
StartTag_action(GetCodeUnitPtr(text_or_markup_start), LengthFrom(text_or_markup_start));
2284
else if (at_EmptyElementDelim<B::Base>(cur())) {
2286
is_emptyStartTag = true;
2287
EmptyElement_action(GetCodeUnitPtr(text_or_markup_start), LengthFrom(text_or_markup_start));
2290
/* Must be an attribute-value pair or error. */
2291
att_name_start = AbsPos();
2292
int att_nameID = Parse_Name();
2293
#if (not defined(OMISSION)) or (OMISSION != ATTRIBUTE_UNIQUENESS)
2294
int attID = Parser_Interface<W>::model_info->getOrInsertGlobalAttName(att_nameID);
2295
// NOTE(review): a single push_back only covers attID == size(); this assumes
// attribute ids are handed out consecutively - confirm.
if (attID >= LastAttOccurrence.size()) LastAttOccurrence.push_back(0);
2297
// An occurrence recorded after the start of this tag means the attribute
// name has already been seen inside the same tag.
if (LastAttOccurrence[attID] > text_or_markup_start) {
2298
WF_Error(wfErr_uniqattspec); /* Duplicate attribute. */
2302
LastAttOccurrence[attID] = att_name_start;
2304
/* The following optimized tests handle the frequently occurring
2305
case that there are no blanks on either side of the equals sign.
2306
In many cases, the very first test handles 100% of actual
2307
attribute-value pairs encountered. */
2308
if (at_EqualsQuote<B::Base>(cur())) Advance(1);
2311
if (!AtChar<B::Base,'='>(cur())) {
2312
Syntax_Error(NT_STag);
2317
if (!AtQuote<B::Base>(cur())) {
2318
Syntax_Error(NT_STag);
2322
// The value starts one code unit past the opening quote.
att_val_start = AbsPos()+1;
2324
// Presumably executed after the value and its closing quote were consumed,
// so this is the position of the closing quote (TODO confirm).
att_val_end = AbsPos()-1;
2325
// cur()+att_name_start-AbsPos() points back at the start of the attribute name.
if (at_xmlns<B::Base>(cur()+att_name_start-AbsPos())) {
2326
Namespace_action(GetCodeUnitPtr(att_name_start), att_name_end - att_name_start,
2327
GetCodeUnitPtr(att_val_start), att_val_end - att_val_start);
2330
AttributeValue_action(GetCodeUnitPtr(att_name_start), att_name_end - att_name_start,
2331
GetCodeUnitPtr(att_val_start), att_val_end - att_val_start);
2333
/* Now check for end or repeat. Avoid whitespace scan if possible.*/
2334
if (AtChar<B::Base,'>'>(cur())) {
2336
StartTag_action(GetCodeUnitPtr(text_or_markup_start), LengthFrom(text_or_markup_start));
2339
else if (at_EmptyElementDelim<B::Base>(cur())) {
2341
is_emptyStartTag = true;
2342
EmptyElement_action(GetCodeUnitPtr(text_or_markup_start), LengthFrom(text_or_markup_start));
2346
if (AtChar<B::Base,'>'>(cur())) {
2348
StartTag_action(GetCodeUnitPtr(text_or_markup_start), LengthFrom(text_or_markup_start));
2351
else if (at_EmptyElementDelim<B::Base>(cur())) {
2353
is_emptyStartTag = true;
2354
EmptyElement_action(GetCodeUnitPtr(text_or_markup_start), LengthFrom(text_or_markup_start));
2357
// Position unchanged since the closing quote: nothing separated the value
// from whatever follows it.
else if (AbsPos() == att_val_end + 1) {
2358
/* No WS following att value */
2359
Syntax_Error(NT_STag);
2367
// Parses one element and validates its content against the declared model.
// After the start tag, the content model stored for the element name selects
// the content parser. Cases visible here: an end tag expected immediately,
// mixed content, and a regular expression model driven through its
// transition table. Every non-empty element is closed with Parse_WF_EndTag.
// Returns int; presumably the name identifier of the element (TODO confirm).
// Raises Validity_Error(vErr_elementvalid) when the content does not match.
template <class B, WorkingCharacterSet W>
2368
int ParsingEngine<B, W>::Parse_ValidElement() {
2369
bool is_emptyStartTag = false;
2370
int nameID = Parse_ValidStartTag(is_emptyStartTag);
2372
// NOTE(review): diagnostic trace on stdout; confirm it is guarded by a
// debug macro.
printf("Parse_ValidElement: nameID = %d, name = %s, is_emptyStartTag=%i\n",nameID, Parser_Interface<W>::model_info->symbol_table->Get_UTF8_name(nameID), is_emptyStartTag);
2374
// NOTE(review): cm is dereferenced without a visible null check; confirm an
// undeclared element cannot reach this point.
ContentModel * cm = Parser_Interface<W>::model_info->ContentModelData[nameID];
2375
switch (cm->cm_type) {
2377
if (!is_emptyStartTag) {
2378
// Only an immediate end tag is accepted here.
if (at_EndTag_Start<B::Base>(cur())) {
2379
Parse_WF_EndTag(nameID);
2382
Validity_Error(vErr_elementvalid);
2387
if (!is_emptyStartTag) {
2389
Parse_WF_EndTag(nameID);
2393
if (!is_emptyStartTag) {
2394
Parse_MixedContent(((CM_Mixed *) cm)->elements);
2395
Parse_WF_EndTag(nameID);
2399
CM_RegExp * cre = (CM_RegExp *) cm;
2400
// The automaton starts in state 0.
int content_state = 0;
2401
if (!is_emptyStartTag) {
2402
Parse_ValidContent(cre, content_state);
2404
printf("Final content_state = %i, nameID = %i\n", content_state, nameID);
2406
Parse_WF_EndTag(nameID);
2408
// Entry 0 of the final state is zero: the content may not end here.
if (cre->transition_map[content_state][0]==0) {
2409
Validity_Error(vErr_elementvalid);
2415
// Parses element-only content, driving the state cur_state through the
// transition table of the regular-expression content model cre.
//   cre       - content model whose transition_map is indexed by
//               [state][name ID].
//   cur_state - in/out: current state; updated on each child element.
// NOTE(review): this excerpt omits lines of the original (any enclosing loop,
// the bodies of several branches and the closing braces are not visible), so
// only the visible statements are described - confirm against the full file.
template <class B, WorkingCharacterSet W>
2416
void ParsingEngine<B, W>::Parse_ValidContent(CM_RegExp * cre, int & cur_state) {
2419
/* If non-null report WS WS_action()? */
2420
text_or_markup_start = AbsPos();
2421
// End tag: body not visible in this excerpt.
if (at_EndTag_Start<B::Base>(cur())) {
2424
// Child element: parse it, then move to the state the table gives for
// its name ID.  The two printf calls trace the transition.
else if (at_ElementTag_Start<B::Base>(cur())) {
2425
int nameID = Parse_ValidElement();
2427
printf("Content model state transition %i", cur_state);
2429
cur_state = cre->transition_map[cur_state][nameID];
2431
printf("-> %i\n", cur_state);
2434
// Comment and processing instruction: bodies not visible in this excerpt.
else if (at_Comment_Start<B::Base>(cur())) {
2437
else if (at_PI_Start<B::Base>(cur())) {
2440
// Entity reference: delegated together with the model and the state, so
// the callee can update cur_state.
else if (AtChar<B::Base,'&'>(cur())) {
2441
Parse_ValidEntityRef(cre, cur_state);
2443
printf("EntityRef complete, cur_state = %i\n", cur_state);
2447
else if (at_EOF()) {
2450
// Any other '<' construct is reported as a syntax error.
else if (AtChar<B::Base,'<'>(cur())) {
2451
Syntax_Error(NT_markupdecl);
2454
// NOTE(review): presumably the final else arm (anything else is not valid
// in element-only content) - the arm itself is not visible; confirm.
Validity_Error(vErr_elementvalid);
2460
// Parses content in which text and any markup may appear: scans text up to
// the next MarkupStart position, reports pending text through
// text_if_nonnull_action, and then dispatches on the construct found.
// NOTE(review): this excerpt omits lines of the original (any enclosing loop,
// the parse calls of several branches and the closing braces are not
// visible) - confirm against the full file.
// NOTE(review): the meaning of the bool passed to text_if_nonnull_action is
// defined elsewhere; it is false before tags, comments, PIs and EOF and true
// before references and CDATA delimiters - confirm its semantics.
template <class B, WorkingCharacterSet W>
2461
void ParsingEngine<B, W>::Parse_AnyContent() {
2463
text_or_markup_start = AbsPos();
2464
ScanTextTo(MarkupStart); /* '<', '&', or ']' for ']]>' test */
2465
// Child element: validated recursively; the returned name ID is not used
// in the visible code.
if (at_ElementTag_Start<B::Base>(cur())) {
2466
text_if_nonnull_action(false);
2467
int nameID = Parse_ValidElement();
2469
else if (at_EndTag_Start<B::Base>(cur())) {
2470
text_if_nonnull_action(false);
2473
else if (at_Comment_Start<B::Base>(cur())) {
2474
text_if_nonnull_action(false);
2477
// Character references are tested before the general '&' case.
else if (at_CharRef_Start<B::Base>(cur())) {
2478
text_if_nonnull_action(true);
2481
else if (AtChar<B::Base,'&'>(cur())) {
2482
text_if_nonnull_action(true);
2483
Parse_EntityRef_inAnyContent();
2485
else if (at_CDATA_Start<B::Base>(cur())) {
2486
text_if_nonnull_action(true);
2489
else if (at_PI_Start<B::Base>(cur())) {
2490
text_if_nonnull_action(false);
2493
// A CDATA end delimiter at this point is reported as a syntax error.
else if (at_CDATA_End<B::Base>(cur())) {
2494
text_if_nonnull_action(true);
2496
Syntax_Error(NT_CharData);
2498
else if (at_EOF()) {
2499
text_if_nonnull_action(false);
2502
// Any other '<' construct is reported as a syntax error.
else if (AtChar<B::Base,'<'>(cur())) {
2503
Syntax_Error(NT_markupdecl);
2511
// Parses mixed content: text interleaved with child elements restricted to
// the set elems.
//   elems - indexed by name ID; an entry of 0 for a parsed child element is
//           reported as vErr_elementvalid.
// NOTE(review): this excerpt omits lines of the original (any enclosing loop,
// the parse calls of several branches, the closing braces and the terminator
// of the commented-out block below are not visible) - confirm against the
// full file.
// NOTE(review): the meaning of the bool passed to text_if_nonnull_action is
// defined elsewhere - confirm its semantics.
template <class B, WorkingCharacterSet W>
2512
void ParsingEngine<B, W>::Parse_MixedContent(symbol_set_t elems) {
2514
text_or_markup_start = AbsPos();
2515
ScanTextTo(MarkupStart); /* '<', '&', or ']' for ']]>' test */
2516
// Older dispatch kept as a comment; its terminator is on a line not shown.
/* if (AtChar<B::Base,'<'>(cur())) {
2517
text_if_nonnull_action();
2518
Parse_Markup<B, W>();
2520
// Child element: validate it, then check that its name ID is permitted.
if (at_ElementTag_Start<B::Base>(cur())) {
2521
text_if_nonnull_action(false);
2522
int nameID = Parse_ValidElement();
2523
if (elems[nameID] == 0) {
2524
Validity_Error(vErr_elementvalid);
2527
else if (at_EndTag_Start<B::Base>(cur())) {
2528
text_if_nonnull_action(false);
2531
else if (at_Comment_Start<B::Base>(cur())) {
2532
text_if_nonnull_action(false);
2535
// Character references are tested before the general '&' case.
else if (at_CharRef_Start<B::Base>(cur())) {
2536
text_if_nonnull_action(true);
2539
// General entity reference: the permitted-element set is passed along.
else if (AtChar<B::Base,'&'>(cur())) {
2540
text_if_nonnull_action(true);
2541
Parse_EntityRef_inMixed(elems);
2543
else if (at_CDATA_Start<B::Base>(cur())) {
2544
text_if_nonnull_action(true);
2547
else if (at_PI_Start<B::Base>(cur())) {
2548
text_if_nonnull_action(false);
2551
// A CDATA end delimiter at this point is reported as a syntax error.
else if (at_CDATA_End<B::Base>(cur())) {
2552
text_if_nonnull_action(true);
2554
Syntax_Error(NT_CharData);
2556
else if (at_EOF()) {
2557
text_if_nonnull_action(false);
2560
// Any other '<' construct is reported as a syntax error.
else if (AtChar<B::Base,'<'>(cur())) {
2561
Syntax_Error(NT_markupdecl);
2571
// Generic name parser: returns the symbol-table ID of the name starting at
// the current position.
// Fast path: ASCII_Lookup_or_Insert_Name is tried on the lgth code units that
// end at buffer_rel_pos in x8data; a nonzero ID is returned directly.
// Otherwise the name is converted to UTF-8 into space reserved in the symbol
// table and LookupOrInsertReserved supplies the ID.
// NOTE(review): presumably a result of 0 means the name is not pure ASCII -
// confirm.
template <class B, WorkingCharacterSet W>
2572
int ParsingEngine<B, W>::Parse_Name() {
2573
int name_pos = AbsPos();
2575
// NOTE(review): the call that advances past the name is on a line not shown
// in this excerpt (presumably a scan to NameFollow, as in the commented-out
// variants elsewhere in the file) - confirm.
int lgth = AbsPos()-name_pos;
2576
int nameID = Parser_Interface<W>::model_info->symbol_table->ASCII_Lookup_or_Insert_Name(&((char *) x8data)[buffer_rel_pos-lgth], lgth);
2577
if (nameID != 0) return nameID;
2579
// Slow path: size the UTF-8 form, reserve room for it, convert in place.
int u8_lgth = byteplex->UTF8_Length(name_pos, lgth);
2580
char * u8_ptr = Parser_Interface<W>::model_info->symbol_table->ReserveSymbolSpace(u8_lgth);
2581
byteplex->to_UTF8(name_pos, lgth, u8_ptr);
2582
return Parser_Interface<W>::model_info->symbol_table->LookupOrInsertReserved();
2587
// int ParsingEngine< X8_Buffer<EBCDIC>, UTF_8 >::Parse_Name() {
2588
// int name_pos = AbsPos();
2589
// ScanTo(NameFollow);
2590
// int lgth = AbsPos()-name_pos;
2591
// // int nameID = local_EBCDIC_table->Lookup_or_Insert(GetCodeUnitPtr(name_pos), lgth);
2592
// // if (nameID != 0) return nameID;
2594
// int u8_lgth = byteplex->UTF8_Length(name_pos, lgth);
2595
// char * u8_ptr = Parser_Interface<UTF_8>::model_info->symbol_table->ReserveSymbolSpace(u8_lgth);
2596
// byteplex->to_UTF8(name_pos, lgth, u8_ptr);
2597
// return Parser_Interface<UTF_8>::model_info->symbol_table->LookupOrInsertReserved();
2601
// template <WorkingCharacterSet W>
2602
// inline int ParsingEngine<UTF8_Buffer, W>::Parse_Name() {
2603
// int name_pos = AbsPos();
2604
// ScanTo(NameFollow);
2605
// int lgth = AbsPos()-name_pos;
2606
// return Parser_Interface<UTF_8>::model_info->symbol_table->UTF8_Lookup_or_Insert_Name(&((char *)x8data)[buffer_rel_pos-lgth], lgth);
2610
// Parse_Name for the UTF8_Buffer / UTF_8 combination: the lgth code units
// ending at buffer_rel_pos in x8data are passed straight to
// UTF8_Lookup_or_Insert_Name, with no conversion step in this function.
// NOTE(review): the specialization introducer and the call that advances
// past the name are on lines not shown in this excerpt - confirm.
inline int ParsingEngine<UTF8_Buffer, UTF_8>::Parse_Name() {
2611
int name_pos = AbsPos();
2613
int lgth = AbsPos()-name_pos;
2614
return Parser_Interface<UTF_8>::model_info->symbol_table->UTF8_Lookup_or_Insert_Name(&((char *)x8data)[buffer_rel_pos-lgth], lgth);
2617
// Generic Nmtoken parser: returns the symbol-table ID of the token starting
// at the current position.
// Fast path: ASCII_Lookup_or_Insert_Nmtoken is tried on the lgth code units
// that end at buffer_rel_pos in x8data; a nonzero ID is returned directly.
// Otherwise the token is converted to UTF-8 into space reserved in the symbol
// table and LookupOrInsertReserved_nmtoken supplies the ID.
// NOTE(review): presumably a result of 0 means the token is not pure ASCII -
// confirm.
template <class B, WorkingCharacterSet W>
2618
int ParsingEngine<B, W>::Parse_Nmtoken() {
2619
int name_pos = AbsPos();
2621
// NOTE(review): the call that advances past the token is on a line not shown
// in this excerpt - confirm.
int lgth = AbsPos()-name_pos;
2622
int nameID = Parser_Interface<W>::model_info->symbol_table->ASCII_Lookup_or_Insert_Nmtoken(&((char *) x8data)[buffer_rel_pos-lgth], lgth);
2623
if (nameID != 0) return nameID;
2625
// Slow path: size the UTF-8 form, reserve room for it, convert in place.
int u8_lgth = byteplex->UTF8_Length(name_pos, lgth);
2626
char * u8_ptr = Parser_Interface<W>::model_info->symbol_table->ReserveSymbolSpace(u8_lgth);
2627
byteplex->to_UTF8(name_pos, lgth, u8_ptr);
2628
return Parser_Interface<W>::model_info->symbol_table->LookupOrInsertReserved_nmtoken();
2633
// Parse_Nmtoken for X8_Buffer<EBCDIC> with UTF_8: there is no direct lookup
// on the source code units here (the local EBCDIC table lookup is commented
// out), so every token is converted to UTF-8 into reserved symbol-table space
// and resolved with LookupOrInsertReserved_nmtoken.
// NOTE(review): the specialization introducer and the call that advances
// past the token are on lines not shown in this excerpt - confirm.
int ParsingEngine< X8_Buffer<EBCDIC>, UTF_8 >::Parse_Nmtoken() {
2634
int name_pos = AbsPos();
2636
int lgth = AbsPos()-name_pos;
2637
// int nameID = local_EBCDIC_table->Lookup_or_Insert(GetCodeUnitPtr(name_pos), lgth);
2638
// if (nameID != 0) return nameID;
2640
int u8_lgth = byteplex->UTF8_Length(name_pos, lgth);
2641
char * u8_ptr = Parser_Interface<UTF_8>::model_info->symbol_table->ReserveSymbolSpace(u8_lgth);
2642
byteplex->to_UTF8(name_pos, lgth, u8_ptr);
2643
return Parser_Interface<UTF_8>::model_info->symbol_table->LookupOrInsertReserved_nmtoken();
2647
// int ParsingEngine<UTF8_Buffer, UTF_8>::Parse_Nmtoken() {
2648
// int name_pos = AbsPos();
2649
// ScanTo(NameFollow);
2650
// int lgth = AbsPos()-name_pos;
2651
// return Parser_Interface<UTF_8>::model_info->symbol_table->UTF8_Lookup_or_Insert_Nmtoken(&((char *)x8data)[buffer_rel_pos-lgth], lgth);
2654
// Parses the document body and finishes with DocumentEnd_action.
// The two preprocessor sections select the behaviour by VALIDATION_MODE.
// NOTE(review): this excerpt omits lines of the original (the declaration of
// cur_state, the directives closing each section, the statements between the
// visible ones and the closing braces are not visible) - confirm against the
// full file.
template <class B, WorkingCharacterSet W>
2655
void ParsingEngine<B, W>::Parse_DocumentContent() {
2656
#if (VALIDATION_MODE == ON)
2658
// Validate the content against the root model; an entry of 0 in column 0
// of the final state's row is reported as a validity error.
Parse_ValidContent(Parser_Interface<W>::model_info->rootModel, cur_state);
2659
if (Parser_Interface<W>::model_info->rootModel->transition_map[cur_state][0]==0) {
2660
Validity_Error(vErr_elementvalid);
2663
#if (VALIDATION_MODE == OFF)
2666
// Loop for as long as the position is at a comment or a processing
// instruction.  NOTE(review): the loop body is largely not visible;
// presumably it parses each such item - confirm.
while(at_Comment_Start<B::Base>(cur()) || at_PI_Start<B::Base>(cur()) ){
2667
if (at_Comment_Start<B::Base>(cur()))
2674
// NOTE(review): the condition guarding this error is on a line not shown.
Syntax_Error(NT_document);
2677
Parser_Interface<W>::DocumentEnd_action();
2680
#ifdef MARKUP_PASS_CONTROL
2681
// Test routine as an alternative to MarkupPass.
2682
// Test variant of ParseContent: scans to each MarkupStart position,
// classifies what is found there, accumulates simple tallies and prints
// them, bracketed by DocumentStart_action and DocumentEnd_action.
//   end_code         - bitwise OR of the positions of end tags.
//   start_code       - sum of the positions of other '<' items.
//   general_ref_code - count of '&' items that are not character references.
//   charref_code     - printed at the end; its update is not visible here.
// NOTE(review): this excerpt omits lines of the original (the declarations of
// start_code and end_code, the loop header, the character-reference branch
// body, the closing braces and the terminator of the commented-out block are
// not visible) - confirm against the full file.
template <class B, WorkingCharacterSet W>
2683
void ParsingEngine<B, W>::ParseContent() {
2686
int charref_code = 0;
2687
int general_ref_code = 0;
2688
DocumentStart_action();
2689
bool is_emptyStartTag = false;
2691
text_or_markup_start = AbsPos();
2692
ScanTo(MarkupStart); /* '<', '&', or ']' for ']]>' test */
2693
// Older dispatch kept as a comment; its terminator is on a line not shown.
/* if (AtChar<B::Base,'<'>(cur())) {
2694
text_if_nonnull_action();
2695
Parse_Markup<B, W>();
2697
// End tags are tested before the general '<' case.
if (at_EndTag_Start<B::Base>(cur())) {
2698
end_code |= AbsPos();
2700
else if (AtChar<B::Base,'<'>(cur())) {
2701
start_code += AbsPos();
2703
// Character references are tested before the general '&' case.
else if (at_CharRef_Start<B::Base>(cur())) {
2706
else if (AtChar<B::Base,'&'>(cur())) {
2707
general_ref_code += 1;
2709
// End of input leaves the enclosing loop.
else if (at_EOF()) break;
2712
printf("Start_code: %i\n", start_code);
2713
printf("End_code: %i\n", end_code);
2714
printf("general_ref_code: %i\n", general_ref_code);
2715
printf("charref_code: %i\n", charref_code);
2716
DocumentEnd_action();
2720
#ifdef MARKUP_SORTING
2721
// Little endian codes for [&#/] stream.
2722
// Two-bit codes used to index the per-kind markup position lists in the
// MARKUP_SORTING variant of ParseContent.
// NOTE(review): only the first two enumerators are visible in this excerpt;
// GeneralRefCode and CharRefCode are referenced later in the file, so their
// definitions are presumably on lines not shown - confirm their values.
enum MarkupSortCodes {
2723
StartTagTwoBitCode = 0,
2724
EndTagTwoBitCode = 2,
2730
// Returns the low two bits of the value that bitstream_segment_from yields
// for stream at bit_posn.
// NOTE(review): presumably those are the two stream bits starting at
// bit_posn; this depends on bitstream_segment_from, defined elsewhere.
static inline int GetBitPair(SIMD_type * stream, int bit_posn) {
2731
return bitstream_segment_from(stream, bit_posn) & 3;
2734
// MARKUP_SORTING variant of ParseContent: instead of parsing each markup item
// where it is found, records the absolute position of every item in one of
// four lists selected by the item's two-bit code from the AmpHashSlash
// stream.  Bracketed by DocumentStart_action and DocumentEnd_action.
// NOTE(review): this excerpt omits lines of the original (the loop headers,
// the declarations of start_code and end_code, closing braces and the
// terminators of the commented-out blocks are not visible), and the end of
// the function may lie beyond the last visible line - confirm against the
// full file before relying on the notes below.
template <class B, WorkingCharacterSet W>
2735
void ParsingEngine<B, W>::ParseContent() {
2736
/*vector<int> MarkupPositions[4];*/
2737
// One position list per two-bit code, with a parallel fill count.
// NOTE(review): this is a local array of 4 * BUFFER_SIZE ints - check that
// its size is acceptable for the target's stack.
int MarkupPositions[4][BUFFER_SIZE];
2738
int MarkupCounts[4];
2741
int charref_code = 0;
2742
int general_ref_code = 0;
2744
DocumentStart_action();
2745
bool is_emptyStartTag = false;
2746
for (int i = 0; i < 4; i++) MarkupCounts[i] = 0;
2747
text_or_markup_start = AbsPos();
2749
// Fetch a word of the MarkupStart stream at the current buffer position.
unsigned long segment = bitstream_segment_from(buf->item_stream[MarkupStart], buffer_rel_pos);
2750
//printf("buffer_rel_pos = %i, segment = %x\n", buffer_rel_pos, segment);
2752
// Advance by cfzl(segment).  NOTE(review): presumably the distance to the
// next set bit, i.e. the next markup item - confirm.
buffer_rel_pos += cfzl(segment);
2753
text_or_markup_start = AbsPos();
2754
// Classify the item and append its absolute position to that code's list.
int markup_code = GetBitPair(buf->item_stream[AmpHashSlash], buffer_rel_pos);
2755
MarkupPositions[markup_code][MarkupCounts[markup_code]] = AbsPos();
2756
MarkupCounts[markup_code]++;
2760
// Clear the low three bits of the position, then step forward by
// 8*sizeof(unsigned long) positions.
buffer_rel_pos = (buffer_rel_pos & -8) + 8*sizeof(unsigned long);
2761
// printf("buffer_rel_pos = %i, segment = %x\n", buffer_rel_pos, segment);
2763
// At the buffer limit the per-code counts are reset; the tallying loops
// that would consume the lists are commented out below.
if (buffer_rel_pos >= buffer_limit_pos) {
2764
/* for (int i = 0; i < MarkupCounts[StartTagTwoBitCode]; i++) {
2765
start_code += MarkupPositions[StartTagTwoBitCode][i];
2767
for (int i = 0; i < MarkupCounts[EndTagTwoBitCode]; i++) {
2768
end_code |= MarkupPositions[EndTagTwoBitCode][i];
2770
for (int i = 0; i < MarkupCounts[GeneralRefCode]; i++) {
2771
general_ref_code += 1;
2773
for (int i = 0; i < MarkupCounts[CharRefCode]; i++) {
2776
/* printf("Start_code: %i\n", start_code);
2777
printf("End_code: %i\n", end_code);
2778
printf("general_ref_code: %i\n", general_ref_code);
2779
printf("charref_code: %i\n", charref_code);*/
2780
for (int i = 0; i < 4; i++) MarkupCounts[i] = 0;
2781
// Once the position reaches BUFFER_SIZE the buffer end is adjusted and the
// buffer is finalized.  NOTE(review): the statements that follow
// (presumably advancing to the next buffer) are not visible.
if (buffer_rel_pos >= BUFFER_SIZE) {
2782
AdjustBufferEndForIncompleteSequences();
2783
Parser_Interface<W>::FinalizeBuffer_action();
2792
/* vector<int>::iterator i;
2793
for (i = MarkupPositions[StartTagTwoBitCode].begin(); i != MarkupPositions[StartTagTwoBitCode].end(); i++) {
2796
for (i = MarkupPositions[EndTagTwoBitCode].begin(); i != MarkupPositions[EndTagTwoBitCode].end(); i++) {
2799
for (i = MarkupPositions[GeneralRefCode].begin(); i != MarkupPositions[GeneralRefCode].end(); i++) {
2800
general_ref_code += 1;
2802
for (i = MarkupPositions[CharRefCode].begin(); i != MarkupPositions[CharRefCode].end(); i++) {
2805
printf("Start_code: %i\n", start_code);
2806
printf("End_code: %i\n", end_code);
2807
printf("general_ref_code: %i\n", general_ref_code);
2808
printf("charref_code: %i\n", charref_code);
2809
DocumentEnd_action();