1
/* engine.c - Parabix XML parsing engine.
2
Copyright (c) 2007, 2008, Robert D. Cameron and Dan Lin.
3
Licensed to the public under the Open Software License 3.0.
4
Licensed to International Characters, Inc., under the Academic
13
#include "contentmodel.h"
14
#include "contentmodel.c"
15
#include "xml_error.h"
25
// Allocate a new char array of lgth+1 bytes with new[] and copy lgth bytes
// from s into it.
// NOTE(review): the terminator write and the return statement are not
// visible in this view; presumably the caller owns the buffer and must
// release it with delete[] - confirm.
inline char * copy_string (unsigned char * s, int lgth){
26
char * d = new char[lgth+1];
27
memcpy(d, (char *)s,lgth);
32
// Build a new NUL-terminated buffer of lgth1 + lgth2 + 1 bytes; the lgth2
// bytes of s2 are placed at offset lgth1 and the final byte is set to '\0'.
// NOTE(review): the copy of s1 into the front of the buffer and the return
// statement are not visible in this view.
inline char * cat_string (char * s1, char * s2, int lgth1, int lgth2){
33
char * s = new char[lgth1 + lgth2 + 1];
35
memcpy(&s[lgth1],s2,lgth2);
36
s[lgth1 + lgth2] = '\0';
41
// Create a parsing engine for the file named by filename.
// Reads a 4-byte signature, lets Entity_Info analyze it, builds a Byteplex
// over the open file, reads the XML declaration, and returns a newly
// allocated ParsingEngine whose buffer type is chosen from the code unit
// base and size. A fresh Model_Info is allocated here and handed to the
// engine; the engine is constructed with is_external == false.
template <WorkingCharacterSet W>
42
Parser_Interface<W> * Parser_Interface<W>::ParserFactory(const char * filename) {
45
unsigned char signature[4];
47
infile = fopen(filename, "rb");
49
// Error report for a failed open (the guarding test is not visible in this view).
fprintf(stderr, "Error: cannot open %s for input.\n", filename);
52
// NOTE(review): the fread result is ignored; for inputs shorter than four
// bytes part of signature stays uninitialized before AnalyzeSignature.
fread(signature,1,4,infile);
53
Entity_Info * e = new Entity_Info;
54
Model_Info * m = new Model_Info;
55
e->AnalyzeSignature(signature);
56
Byteplex * b = Byteplex::ByteplexFactory(e, infile);
57
// Hand the already-consumed signature bytes to the byteplex.
b->InitializeBuffer(signature,4);
59
b->PreparePseudoASCII_Stream();
61
if (e->code_unit_base == ASCII) {
62
XML_Decl_Parser<ASCII> decl_parser(b);
63
decl_parser.ReadXMLInfo(*e);
64
if (e->code_unit_size == SingleByte) {
65
// No encoding declaration, or a declared UTF-8 encoding, selects UTF8_Buffer;
// any other single-byte ASCII-based encoding uses X8_Buffer<ASCII>.
if (!(e->has_encoding_decl) || at_UTF_8(e->encoding))
66
return new ParsingEngine< UTF8_Buffer, W>(e, m, b, false);
67
else return new ParsingEngine< X8_Buffer<ASCII>, W>(e, m, b, false);
69
else if (e->code_unit_size == DoubleByte) {
70
return new ParsingEngine<U16_Buffer, W>(e, m, b, false);
72
else if (e->code_unit_size == QuadByte) {
73
return new ParsingEngine<U32_Buffer, W>(e, m, b, false);
76
else /* if (e->code_unit_base == EBCDIC) */ {
77
XML_Decl_Parser<EBCDIC> decl_parser(b);
78
decl_parser.ReadXMLInfo(*e);
79
return new ParsingEngine< X8_Buffer<EBCDIC>, W>(e, m, b, false);
83
// Create a parsing engine for the file named by filename, reusing the
// caller-supplied Model_Info m instead of allocating a new one.
// Every engine returned here is constructed with is_external == true.
// Unlike the single-argument overload, the single-byte ASCII case always
// uses X8_Buffer<ASCII>; no UTF8_Buffer selection is made in the visible code.
template <WorkingCharacterSet W>
84
Parser_Interface<W> * Parser_Interface<W>::ParserFactory(const char * filename, Model_Info * m) {
87
unsigned char signature[4];
89
infile = fopen(filename, "rb");
91
// Error report for a failed open (the guarding test is not visible in this view).
fprintf(stderr, "Error: cannot open %s for input.\n", filename);
94
// NOTE(review): the fread result is ignored; short inputs leave part of
// signature uninitialized.
fread(signature,1,4,infile);
95
Entity_Info * e = new Entity_Info;
96
e->AnalyzeSignature(signature);
97
Byteplex * b = Byteplex::ByteplexFactory(e, infile);
98
b->InitializeBuffer(signature,4);
100
b->PreparePseudoASCII_Stream();
101
if (e->code_unit_base == ASCII) {
102
XML_Decl_Parser<ASCII> decl_parser(b);
103
decl_parser.ReadXMLInfo(*e);
104
if (e->code_unit_size == SingleByte) {
105
return new ParsingEngine< X8_Buffer<ASCII>, W>(e, m, b, true);
107
else if (e->code_unit_size == DoubleByte) {
108
return new ParsingEngine<U16_Buffer, W>(e, m, b, true);
110
else if (e->code_unit_size == QuadByte) {
111
return new ParsingEngine<U32_Buffer, W>(e, m, b, true);
114
else /* if (e->code_unit_base == EBCDIC) */ {
115
XML_Decl_Parser<EBCDIC> decl_parser(b);
116
decl_parser.ReadXMLInfo(*e);
117
return new ParsingEngine< X8_Buffer<EBCDIC>, W>(e, m, b, true);
121
// Create a parsing engine over an in-memory buffer of byte_count bytes.
// A new Entity_Info is allocated and seeded from e1 (code unit base and size,
// version, encoding) with content_start set to 0; no XML declaration is read.
// The engine is constructed with is_external == false and shares m.
// NOTE(review): e->encoding is copied as a pointer from e1, so both
// Entity_Info objects refer to the same encoding storage - confirm ownership.
template <WorkingCharacterSet W>
122
Parser_Interface<W> * Parser_Interface<W>::ParserFactory(const char * byte_buffer, int byte_count, Entity_Info * e1, Model_Info * m){
123
Entity_Info * e = new Entity_Info;
125
e->code_unit_base=e1->code_unit_base;
126
e->code_unit_size=e1->code_unit_size;
127
e->version=e1->version;
128
e->encoding=e1->encoding;
129
e->content_start = 0;
130
// The const qualifier of byte_buffer is cast away for the byteplex factory.
Byteplex * b = Byteplex::ByteplexFactory(e, (unsigned char *) byte_buffer, byte_count);
132
b->PreparePseudoASCII_Stream();
133
if (e->code_unit_base == ASCII) {
134
if (e->code_unit_size == SingleByte) {
135
return new ParsingEngine< X8_Buffer<ASCII>, W>(e, m, b, false);
137
else if (e->code_unit_size == DoubleByte) {
138
return new ParsingEngine<U16_Buffer, W>(e, m, b, false);
140
else if (e->code_unit_size == QuadByte) {
141
return new ParsingEngine<U32_Buffer, W>(e, m, b, false);
144
else /* if (e->code_unit_base == EBCDIC) */ {
145
return new ParsingEngine< X8_Buffer<EBCDIC>, W>(e, m, b, false);
149
// Destructor of the parser interface; its body is not visible in this view.
template <WorkingCharacterSet W>
150
Parser_Interface<W>::~Parser_Interface() {
154
// Returns true when the entity info records a positive BOM_units count.
template <WorkingCharacterSet W>
155
bool Parser_Interface<W>::has_ByteOrderMark() {
156
return entity_Info->BOM_units > 0;
159
// Returns the XML version stored in the entity info.
template <WorkingCharacterSet W>
160
XML_version Parser_Interface<W>::get_version() {
161
return entity_Info->version;
164
// Returns the standalone status stored in the entity info.
template <WorkingCharacterSet W>
165
XML_standalone Parser_Interface<W>::standalone_status() {
166
return entity_Info->standalone;
169
// Returns the has_encoding_decl flag stored in the entity info.
template <WorkingCharacterSet W>
170
bool Parser_Interface<W>::has_EncodingDecl() {
171
return entity_Info->has_encoding_decl;
174
// Returns the encoding pointer held by the entity info (no copy is made).
template <WorkingCharacterSet W>
175
unsigned char * Parser_Interface<W>::get_Encoding() {
176
return entity_Info->encoding;
179
// Translate an absolute position into a byte pointer inside the byteplex
// source buffer: the position is made relative to buffer_base_pos and then
// scaled by B::Size.
// NOTE(review): no bounds check is performed on pos.
template <class B, WorkingCharacterSet W>
180
inline unsigned char * ParsingEngine<B, W>::GetCodeUnitPtr(int pos) {
181
int rel_pos = pos - buffer_base_pos;
182
return &((unsigned char *) (byteplex->src_buffer))[rel_pos * (int) B::Size];
186
// UTF8_Buffer/UTF_8 variant: the pointer is taken directly from x8data at
// the buffer-relative position, without scaling.
inline unsigned char * ParsingEngine<UTF8_Buffer, UTF_8>::GetCodeUnitPtr(int pos) {
187
int rel_pos = pos - buffer_base_pos;
188
return &((unsigned char *) (x8data))[rel_pos];
194
// Construct an engine over entity info e, model info m and byteplex b.
// Visible steps: store e and m in the base class, propagate the XML version
// to the symbol table, size the duplicate-attribute table, allocate the
// bitplex and the lexical stream set, install a sentinel block at index
// BUFFER_BLOCKS of every lexical item stream, then transpose and lexically
// analyze the first buffer.
// NOTE(review): the use of b and is_external is not visible in this view;
// byteplex is read below, so it is presumably assigned from b - confirm.
template <class B, WorkingCharacterSet W>
195
ParsingEngine<B, W>::ParsingEngine(Entity_Info * e, Model_Info * m, Byteplex * b, bool is_external) : Parser_Interface<W> () {
196
Parser_Interface<W>::entity_Info = e;
197
Parser_Interface<W>::model_info = m;
200
// m->symbol_table = new Symbol_Table();
201
// m->SimpleEntity("lt", "<");
202
// m->SimpleEntity("gt", ">");
203
// m->SimpleEntity("amp", "&");
204
// m->SimpleEntity("quot", "\"");
205
// m->SimpleEntity("apos", "'");
206
m->symbol_table->version = e->version;
208
StrictWellFormedness=false;
209
// One slot per known global attribute plus one, all initialised to 0.
LastAttOccurrence.assign(m->globalAttributeCount+1, 0);
212
bitplex = new Bitplex;
213
buf = (LexicalStreamSet *) simd_new(sizeof(LexicalStreamSet)/PACKSIZE);
215
/* Install sentinels for every lexical item stream*/
216
#ifdef TEMPLATED_SIMD_LIB
217
BitBlock sentinel_value = simd<1>::constant<1>();
219
#ifndef TEMPLATED_SIMD_LIB
220
BitBlock sentinel_value = simd_const_1(1);
223
#ifdef OPTIMIZE_SHORT_SCAN
224
sentinel_value = sisd_sfli(sentinel_value, 8*sizeof(unsigned long));
227
for (int j = minLexicalItem; j < LexicalItemCount; j++) {
228
buf->item_stream[j][BUFFER_BLOCKS] = sentinel_value;
232
// Parsing starts at the content start recorded in the entity info.
buffer_rel_pos = e->content_start;
233
buffer_limit_pos = min(BUFFER_SIZE, byteplex->units_in_buffer);
234
// Number of blocks needed to cover buffer_limit_pos units, rounded up.
int blocks_in_buffer = (buffer_limit_pos + BLOCKSIZE - 1)/BLOCKSIZE;
235
x8data = byteplex->x8data;
236
lexer = Lexer<B::Base>::LexerFactory(e, buf);
237
bitplex->TransposeToBitStreams(byteplex->x8data, blocks_in_buffer);
238
lexer->AnalyzeBuffer(bitplex->x8basis, buffer_base_pos, buffer_rel_pos, buffer_limit_pos);
241
// Tear down the engine: runs the Entity_Info destructor and releases the
// lexical stream set obtained from simd_new.
// NOTE(review): the entity info is destroyed with an explicit destructor
// call, which does not release its storage; it is allocated with new in the
// factories, so delete may be intended - confirm. Model_Info is deliberately
// left alone (see the question in the comment below).
template <class B, WorkingCharacterSet W>
242
ParsingEngine<B, W>::~ParsingEngine() {
243
// How do we do this? Parser_Interface<W>::model_info->~Model_Info();
244
Parser_Interface<W>::entity_Info->~Entity_Info();
247
simd_delete((SIMD_type *) buf);
251
// Slide the input window forward so that it begins at (or just before) the
// start of the current text/markup item, then rebuild the byteplex, bit
// streams and lexical item streams for the new window.
// The PAPI/CODE_CLOCKING sections only bracket the work with timing calls.
template <class B, WorkingCharacterSet W>
252
void ParsingEngine<B, W>::AdvanceBuffers(){
253
#if defined(PAPI) and defined(CODE_CLOCKING) and (CODE_CLOCKING == ADVANCE_BUFFERS)
254
code_clocker->start_interval();
257
// Distance from the buffer base to the start of the pending item.
int advance_amt = text_or_markup_start - buffer_base_pos;
258
// Clears the low bits so the amount is a multiple of PACKSIZE
// (assumes PACKSIZE is a power of two - TODO confirm).
advance_amt &= -PACKSIZE; // maintain alignment
259
byteplex->AdvanceInputBuffer(advance_amt);
260
// Absolute position (base + rel) is unchanged by the two updates below.
buffer_base_pos += advance_amt;
261
buffer_rel_pos -= advance_amt;
262
buffer_limit_pos = min(BUFFER_SIZE, byteplex->units_in_buffer);
263
int blocks_in_buffer = (buffer_limit_pos + BLOCKSIZE - 1)/BLOCKSIZE;
264
#if defined(PAPI) and defined(CODE_CLOCKING) and (CODE_CLOCKING == BYTEPLEX)
265
code_clocker->start_interval();
267
byteplex->DoByteplex();
268
byteplex->PreparePseudoASCII_Stream();
269
#if defined(PAPI) and defined(CODE_CLOCKING) and (CODE_CLOCKING == BYTEPLEX)
270
code_clocker->end_interval(buffer_limit_pos);
272
#if defined(PAPI) and defined(CODE_CLOCKING) and (CODE_CLOCKING == BITPLEX)
273
code_clocker->start_interval();
275
bitplex->TransposeToBitStreams(byteplex->x8data, blocks_in_buffer);
276
#if defined(PAPI) and defined(CODE_CLOCKING) and (CODE_CLOCKING == BITPLEX)
277
code_clocker->end_interval(buffer_limit_pos);
279
lexer->AnalyzeBuffer(bitplex->x8basis, buffer_base_pos, buffer_rel_pos, buffer_limit_pos);
280
#if defined(PAPI) and defined(CODE_CLOCKING) and (CODE_CLOCKING == ADVANCE_BUFFERS)
281
code_clocker->end_interval(buffer_limit_pos);
287
// U16_Buffer/UTF_16 variant of AdvanceBuffers. Same sequence as the generic
// version, with one extra step: when the declared encoding is UTF-16 the
// byteplex is validated with Validate_UTF16() after DoByteplex().
void ParsingEngine<U16_Buffer, UTF_16>::AdvanceBuffers(){
288
#if defined(PAPI) and defined(CODE_CLOCKING) and (CODE_CLOCKING == ADVANCE_BUFFERS)
289
code_clocker->start_interval();
292
int advance_amt = text_or_markup_start - buffer_base_pos;
293
// Clears the low bits so the amount is a multiple of PACKSIZE
// (assumes PACKSIZE is a power of two - TODO confirm).
advance_amt &= -PACKSIZE; // maintain alignment
294
byteplex->AdvanceInputBuffer(advance_amt);
295
buffer_base_pos += advance_amt;
296
buffer_rel_pos -= advance_amt;
297
buffer_limit_pos = min(BUFFER_SIZE, byteplex->units_in_buffer);
298
int blocks_in_buffer = (buffer_limit_pos + BLOCKSIZE - 1)/BLOCKSIZE;
299
#if defined(PAPI) and defined(CODE_CLOCKING) and (CODE_CLOCKING == BYTEPLEX)
300
code_clocker->start_interval();
302
byteplex->DoByteplex();
303
// The byteplex is downcast to U16_Buffer to reach the UTF-16 validator.
if (at_UTF_16(Parser_Interface<UTF_16>::entity_Info->encoding)) ((U16_Buffer *) byteplex)->Validate_UTF16();
304
byteplex->PreparePseudoASCII_Stream();
305
#if defined(PAPI) and defined(CODE_CLOCKING) and (CODE_CLOCKING == BYTEPLEX)
306
code_clocker->end_interval(buffer_limit_pos);
308
#if defined(PAPI) and defined(CODE_CLOCKING) and (CODE_CLOCKING == BITPLEX)
309
code_clocker->start_interval();
311
bitplex->TransposeToBitStreams(byteplex->x8data, blocks_in_buffer);
312
#if defined(PAPI) and defined(CODE_CLOCKING) and (CODE_CLOCKING == BITPLEX)
313
code_clocker->end_interval(buffer_limit_pos);
315
lexer->AnalyzeBuffer(bitplex->x8basis, buffer_base_pos, buffer_rel_pos, buffer_limit_pos);
316
#if defined(PAPI) and defined(CODE_CLOCKING) and (CODE_CLOCKING == ADVANCE_BUFFERS)
317
code_clocker->end_interval(buffer_limit_pos);
322
// Pointer to the byte of x8data at the current buffer-relative position.
template <class B, WorkingCharacterSet W>
323
inline unsigned char * ParsingEngine<B, W>::cur() const {
324
return &((unsigned char *) x8data)[buffer_rel_pos];
327
// Absolute position of the cursor: buffer base plus buffer-relative offset.
template <class B, WorkingCharacterSet W>
328
inline int ParsingEngine<B, W>::AbsPos() const {
329
return buffer_base_pos + buffer_rel_pos;
332
// Number of positions between the absolute position start_pos and the cursor.
template <class B, WorkingCharacterSet W>
333
inline int ParsingEngine<B, W>::LengthFrom(int start_pos) const {
334
return buffer_base_pos + buffer_rel_pos - start_pos;
339
// Cursor offset relative to the start of the current buffer.
template <class B, WorkingCharacterSet W>
340
inline int ParsingEngine<B, W>::BufferRelPos() const {
341
return buffer_rel_pos;
345
// True when the cursor has reached the buffer limit and that limit is below
// BUFFER_SIZE, i.e. the current buffer was not filled completely.
template <class B, WorkingCharacterSet W>
346
inline bool ParsingEngine<B, W>::at_EOF() const {
347
return (buffer_rel_pos >= buffer_limit_pos) &&
348
(buffer_limit_pos < BUFFER_SIZE);
351
//template <class B, WorkingCharacterSet W>
352
//inline void ParsingEngine<B, W>::Advance(int n) {
353
// buffer_rel_pos += n;
354
// if (buffer_rel_pos >= BUFFER_SIZE) {
355
// Parser_Interface<W>::FinalizeBuffer_action();
362
buffer_rel_pos += n; \
363
if (buffer_rel_pos >= BUFFER_SIZE) { \
364
Parser_Interface<W>::FinalizeBuffer_action();\
370
// Generic version of the buffer-end adjustment; no statements are visible in
// this view. The specialized variants move buffer_rel_pos back when a
// multi-unit sequence is cut by the buffer end.
template <class B, WorkingCharacterSet W>
371
void ParsingEngine<B, W>::AdjustBufferEndForIncompleteSequences() {
375
// UTF8_Buffer/UTF_8 variant: step the cursor back over a multi-byte UTF-8
// sequence that is cut off at the cursor. Checks the previous byte for any
// lead byte (>= 0xC0), the byte two back for a 3/4-byte lead (>= 0xE0) and
// the byte three back for a 4-byte lead (>= 0xF0).
// NOTE(review): reads up to three bytes before cur(); assumes the cursor is
// at least three bytes into the buffer - confirm.
void ParsingEngine<UTF8_Buffer, UTF_8>::AdjustBufferEndForIncompleteSequences() {
376
if (*(cur()-1) >= 0xC0) buffer_rel_pos--;
377
else if (*(cur()-2) >= 0xE0) buffer_rel_pos -= 2;
378
else if (*(cur()-3) >= 0xF0) buffer_rel_pos -= 3;
382
// U16_Buffer/UTF_8 variant: step the cursor back by one unit when the last
// unit before the cursor falls in the tested surrogate range.
// NOTE(review): GetCodeUnitPtr is declared in this file as returning
// unsigned char *, so the dereference below reads a single byte and the value
// can never reach 0xD800; a 16-bit read looks intended - confirm.
// NOTE(review): the test uses bitwise & on two comparisons and an inclusive
// upper bound of 0xDC00; high surrogates end at 0xDBFF - confirm the bound.
void ParsingEngine<U16_Buffer, UTF_8>::AdjustBufferEndForIncompleteSequences() {
383
unsigned short last_u16_unit = *(GetCodeUnitPtr(AbsPos()-1));
384
if ((last_u16_unit >= 0xD800) & (last_u16_unit <= 0xDC00)) buffer_rel_pos--;
388
// UTF8_Buffer/UTF_16 variant: same byte tests as the UTF8_Buffer/UTF_8
// variant - step back over a UTF-8 lead byte found one, two or three bytes
// before the cursor.
// NOTE(review): reads up to three bytes before cur(); assumes the cursor is
// at least three bytes into the buffer - confirm.
void ParsingEngine<UTF8_Buffer, UTF_16>::AdjustBufferEndForIncompleteSequences() {
389
if (*(cur()-1) >= 0xC0) buffer_rel_pos--;
390
else if (*(cur()-2) >= 0xE0) buffer_rel_pos -= 2;
391
else if (*(cur()-3) >= 0xF0) buffer_rel_pos -= 3;
395
// U16_Buffer/UTF_16 variant: step the cursor back by one unit when the last
// unit before the cursor falls in the tested surrogate range.
// NOTE(review): GetCodeUnitPtr is declared in this file as returning
// unsigned char *, so the dereference below reads a single byte and the value
// can never reach 0xD800; a 16-bit read looks intended - confirm.
// NOTE(review): the test uses bitwise & on two comparisons and an inclusive
// upper bound of 0xDC00; high surrogates end at 0xDBFF - confirm the bound.
void ParsingEngine<U16_Buffer, UTF_16>::AdjustBufferEndForIncompleteSequences() {
396
unsigned short last_u16_unit = *(GetCodeUnitPtr(AbsPos()-1));
397
if ((last_u16_unit >= 0xD800) & (last_u16_unit <= 0xDC00)) buffer_rel_pos--;
402
#ifdef OPTIMIZE_SHORT_SCAN
404
// Inline ScanTo with unrolled first test that should almost always
405
// succeed for short scans.
// The macro first inspects one unsigned long segment of the item stream at
// the cursor; a nonzero segment is resolved with cfzl. Otherwise the cursor
// is moved to the next segment boundary and bitstream_scan takes over,
// finalizing the buffer whenever the scan runs to BUFFER_SIZE.
// NOTE(review): some macro lines (including the buffer advance inside the
// loop) are not visible in this view.
406
#define ScanTo(item) \
408
unsigned long segment = bitstream_segment_from(buf->item_stream[item], buffer_rel_pos);\
409
if (segment != 0) buffer_rel_pos += cfzl(segment);\
411
buffer_rel_pos = (buffer_rel_pos & -8) + 8*sizeof(unsigned long);\
412
buffer_rel_pos = bitstream_scan(buf->item_stream[item], buffer_rel_pos);\
413
while (buffer_rel_pos >= BUFFER_SIZE) {\
414
buffer_rel_pos = BUFFER_SIZE;\
415
AdjustBufferEndForIncompleteSequences();\
416
Parser_Interface<W>::FinalizeBuffer_action();\
418
buffer_rel_pos = bitstream_scan(buf->item_stream[item], buffer_rel_pos);\
423
// The following version seems cleaner, but measured mispredictions are higher
424
// #define ScanTo(item) \
426
// unsigned long segment = bitstream_segment_from(buf->item_stream[item], buffer_rel_pos);\
427
// while (unlikely (segment == 0)) {\
428
// buffer_rel_pos = (buffer_rel_pos & -8) + 8*sizeof(unsigned long);\
429
// if (buffer_rel_pos >= BUFFER_SIZE) {\
430
// buffer_rel_pos = BUFFER_SIZE;\
431
// AdjustBufferEndForIncompleteSequences();\
432
// Parser_Interface<W>::FinalizeBuffer_action();\
433
// AdvanceBuffers();\
435
// segment = bitstream_segment_from(buf->item_stream[item], buffer_rel_pos);\
437
// buffer_rel_pos += cfzl(segment);\
440
// #define ScanTextTo(item) \
442
// unsigned long segment = bitstream_segment_from(buf->item_stream[item], buffer_rel_pos);\
443
// text_or_markup_start = AbsPos();\
444
// if (segment != 0) buffer_rel_pos += cfzl(segment);\
446
// buffer_rel_pos = (buffer_rel_pos & -8) + 8*sizeof(unsigned long);\
447
// buffer_rel_pos = bitstream_scan(buf->item_stream[item], buffer_rel_pos);\
448
// while (buffer_rel_pos >= BUFFER_SIZE) {\
449
// buffer_rel_pos = BUFFER_SIZE;\
450
// AdjustBufferEndForIncompleteSequences();\
451
// Text_action(GetCodeUnitPtr(text_or_markup_start), LengthFrom(text_or_markup_start), true);\
452
// text_or_markup_start = AbsPos();\
453
// Parser_Interface<W>::FinalizeBuffer_action();\
454
// AdvanceBuffers();\
455
// buffer_rel_pos = bitstream_scan(buf->item_stream[item], buffer_rel_pos);\
460
// Short-scan variant of ScanTextTo: advance the cursor to the next position
// marked in item stream `item`, treating everything skipped as text.
// text_or_markup_start is set to the starting position. When the scan runs
// to the end of the buffer, the text accumulated so far is reported through
// Text_action with its last argument true, and the text start is reset
// before the buffer is finalized and scanning resumes.
template <class B, WorkingCharacterSet W>
461
inline void ParsingEngine<B, W>::ScanTextTo(int item) {
462
text_or_markup_start = AbsPos();
463
// Fast path: test a single unsigned long segment at the cursor.
unsigned long segment = bitstream_segment_from(buf->item_stream[item], buffer_rel_pos);
464
if (segment != 0) buffer_rel_pos += cfzl(segment);
466
// Slow path: continue from the next segment boundary with a full scan.
buffer_rel_pos = (buffer_rel_pos & -8) + 8*sizeof(unsigned long);
467
buffer_rel_pos = bitstream_scan(buf->item_stream[item], buffer_rel_pos);
468
while (buffer_rel_pos >= BUFFER_SIZE) {
469
buffer_rel_pos = BUFFER_SIZE;
470
// Avoid splitting a multi-unit character across the buffer boundary.
AdjustBufferEndForIncompleteSequences();
471
Text_action(GetCodeUnitPtr(text_or_markup_start), LengthFrom(text_or_markup_start), true);
472
text_or_markup_start = AbsPos();
473
Parser_Interface<W>::FinalizeBuffer_action();
475
buffer_rel_pos = bitstream_scan(buf->item_stream[item], buffer_rel_pos);
482
#ifndef OPTIMIZE_SHORT_SCAN
484
// #define ScanTo(item) \
486
// buffer_rel_pos = bitstream_scan(buf->item_stream[item], buffer_rel_pos);\
487
// while (buffer_rel_pos >= BUFFER_SIZE) {\
488
// AdjustBufferEndForIncompleteSequences();\
489
// Parser_Interface<W>::FinalizeBuffer_action();\
490
// AdvanceBuffers();\
491
// buffer_rel_pos = bitstream_scan(buf->item_stream[item], buffer_rel_pos);\
496
// Plain variant of ScanTo (used when OPTIMIZE_SHORT_SCAN is not defined):
// advance the cursor to the next position marked in item stream `item`.
// Whenever the scan reaches BUFFER_SIZE the buffer end is adjusted for
// incomplete sequences, the buffer is finalized, and the scan is repeated.
// NOTE(review): the buffer advance between FinalizeBuffer_action and the
// rescan is not visible in this view.
template <class B, WorkingCharacterSet W>
497
inline void ParsingEngine<B, W>::ScanTo(int item) {
498
buffer_rel_pos = bitstream_scan(buf->item_stream[item], buffer_rel_pos);
499
while (buffer_rel_pos >= BUFFER_SIZE) {
500
AdjustBufferEndForIncompleteSequences();
501
Parser_Interface<W>::FinalizeBuffer_action();
503
buffer_rel_pos = bitstream_scan(buf->item_stream[item], buffer_rel_pos);
507
// Plain variant of ScanTextTo (used when OPTIMIZE_SHORT_SCAN is not defined):
// mark the current position as the text start and scan to the next position
// marked in item stream `item`. Each time the scan reaches BUFFER_SIZE the
// text seen so far is delivered through Text_action with its last argument
// true, and the text start is moved to the cursor before the buffer is
// finalized.
template <class B, WorkingCharacterSet W>
508
inline void ParsingEngine<B, W>::ScanTextTo(int item) {
509
text_or_markup_start = AbsPos();
510
buffer_rel_pos = bitstream_scan(buf->item_stream[item], buffer_rel_pos);
511
while (buffer_rel_pos >= BUFFER_SIZE) {
512
AdjustBufferEndForIncompleteSequences();
513
Text_action(GetCodeUnitPtr(text_or_markup_start), LengthFrom(text_or_markup_start), true);
514
text_or_markup_start = AbsPos();
515
Parser_Interface<W>::FinalizeBuffer_action();
517
buffer_rel_pos = bitstream_scan(buf->item_stream[item], buffer_rel_pos);
522
// Report a well-formedness constraint violation: prints the absolute input
// position to stdout and then the constraint description for errCode.
// NOTE(review): what happens after reporting (e.g. termination) is not
// visible in this view.
template <class B, WorkingCharacterSet W>
523
void ParsingEngine<B, W>::WF_Error (XML_Constraint errCode) {
524
printf("Error at position %i in input.\n", AbsPos());
525
ShowConstraintError(errCode);
527
// Error_action(GetCodeUnitPtr(text_or_markup_start), LengthFrom(text_or_markup_start));
531
// Report a validity constraint violation: prints the absolute input position
// to stdout and then the constraint description for errCode.
// NOTE(review): what happens after reporting is not visible in this view.
template <class B, WorkingCharacterSet W>
532
void ParsingEngine<B, W>::Validity_Error (XML_Constraint errCode) {
533
printf("Error at position %i in input.\n", AbsPos());
534
ShowConstraintError(errCode);
536
// Error_action(GetCodeUnitPtr(text_or_markup_start), LengthFrom(text_or_markup_start));
539
// Report a syntax error: prints the absolute input position to stdout and
// then the description of the nonterminal errNT being parsed.
// NOTE(review): what happens after reporting is not visible in this view.
template <class B, WorkingCharacterSet W>
540
void ParsingEngine<B, W>::Syntax_Error (XML_NonTerminal errNT) {
541
printf("Error at position %i in input.\n", AbsPos());
542
ShowSyntaxError(errNT);
544
// Error_action(GetCodeUnitPtr(text_or_markup_start), LengthFrom(text_or_markup_start));
548
/* Parse a comment beginning "<!--" */
549
// Parse a comment whose opening "<!--" is at the cursor. Skips to a double
// hyphen; "-->" completes the comment and triggers Comment_action over the
// whole markup item, while any other "--" is reported as a syntax error.
template <class B, WorkingCharacterSet W>
550
void ParsingEngine<B, W>::Parse_Comment() {
552
Advance(4); /* Skip "<!--". */
554
while (!at_DoubleHyphen<B::Base>(cur())) {
556
// NOTE(review): this reports NT_CDSect while parsing a comment; the error at
// the end of this function uses NT_Comment, which looks like the intended
// nonterminal here too - confirm.
Syntax_Error(NT_CDSect);
557
Advance(2); /* Skip hyphen-nonhyphen pair */
560
if (at_Comment_End<B::Base>(cur())) {
561
Advance(3); /* Skip "-->". */
562
Comment_action(GetCodeUnitPtr(text_or_markup_start), LengthFrom(text_or_markup_start));
565
Advance(2); /* "--" */
566
Syntax_Error(NT_Comment);
570
/* Parse an end tag beginning "</" */
571
// Parse an end tag whose opening "</" is at the cursor: read the name, then
// expect '>'. The first '>' test is the fast path; the second one is reached
// on the slower path (the lines between them are not visible in this view,
// presumably a whitespace scan). Anything else is a syntax error in ETag.
// nameID is not used in the visible lines.
template <class B, WorkingCharacterSet W>
572
inline void ParsingEngine<B, W>::Parse_EndTag() {
573
Advance(2); /* Skip "</". */
574
int nameID = Parse_Name();
575
if (AtChar<B::Base,'>'>(cur())) {
577
EndTag_action(GetCodeUnitPtr(text_or_markup_start), LengthFrom(text_or_markup_start));
581
if (AtChar<B::Base,'>'>(cur())) {
583
EndTag_action(GetCodeUnitPtr(text_or_markup_start), LengthFrom(text_or_markup_start));
585
else Syntax_Error(NT_ETag);
589
/* Parse a CDATA section beginning "<![CDATA". */
590
// Parse a CDATA section whose opening "<![CDATA" is at the cursor.
// Requires the following '[', signals CDATA_start_action, then scans text up
// to the "]]>" terminator. The section content is delivered through
// Text_action and the end is signalled with CDATA_end_action after the
// terminator has been skipped.
template <class B, WorkingCharacterSet W>
591
void ParsingEngine<B, W>::Parse_CDATA() {
592
Advance(8); /* Skip "<![CDATA". */
593
if (!AtChar<B::Base,'['>(cur())) {
594
Syntax_Error(NT_CDStart);
598
CDATA_start_action(GetCodeUnitPtr(text_or_markup_start));
599
// From here on, text_or_markup_start marks the start of the section content.
text_or_markup_start = AbsPos();
600
ScanTextTo(CD_End_check);
601
// Keep scanning past candidate positions that are not a real "]]>".
while (!at_CDATA_End<B::Base>(cur())) {
603
Syntax_Error(NT_CDSect);
605
ScanTextTo(CD_End_check);
607
Text_action(GetCodeUnitPtr(text_or_markup_start), LengthFrom(text_or_markup_start), true);
608
Advance(3); /* Skip "]]>". */
609
CDATA_end_action(GetCodeUnitPtr(AbsPos()));
613
// Parse a general entity reference starting at '&' and expand it by parsing
// its replacement as well-formed content with a nested parser.
// External entities are parsed from their system literal; internal ones from
// their replacement text. After each nested parse the nested parser must be
// at EOF, otherwise a content syntax error is reported.
// NOTE(review): several branch and closing lines are not visible in this view.
template <class B, WorkingCharacterSet W>
614
void ParsingEngine<B, W>::Parse_EntityRef() {
615
Advance(1); // skip "&"
616
int nameID = Parse_Name(); /* Name delimiter */
617
if (!AtChar<B::Base,';'>(cur())) {
618
Syntax_Error(NT_Reference);
622
Reference_action(GetCodeUnitPtr(text_or_markup_start), LengthFrom(text_or_markup_start));
624
// The following code will replace Reference_Action.
625
GEntity_info * this_info;
626
Parser_Interface<W> * entity_parser;
627
// Table value for the referenced name; used below as a 1-based index.
int entityID = Parser_Interface<W>::model_info->GlobalGEntityTable[nameID];
629
WF_Error(wfErr_wf_entdeclared);
631
this_info = Parser_Interface<W>::model_info->GEntityData[entityID-1];
632
if (this_info->is_external){
634
// External references are an error unless standalone is Standalone_no.
if (Parser_Interface<W>::entity_Info->standalone != Standalone_no)
635
WF_Error(wfErr_NoExternalRefs);
637
entity_parser = ParserFactory(this_info->systemLiteral, Parser_Interface<W>::model_info);
638
entity_parser->Parse_WF_Content();
639
if(!entity_parser->at_EOF())
640
Syntax_Error(NT_content);
641
// NOTE(review): explicit destructor call on a parser obtained from
// ParserFactory (allocated with new); its storage is not released here.
entity_parser->~Parser_Interface<W>();
645
// The trailing ';' makes this an empty statement for the "simple" case.
if (this_info->is_simple == true);
646
// printf("Entity is %s\n",this_info->ReplacementText);
648
// printf("Not a simple text: %s\n",this_info->ReplacementText);
649
entity_parser = ParserFactory(this_info->ReplacementText, strlen(this_info->ReplacementText),Parser_Interface<W>::entity_Info, Parser_Interface<W>::model_info);
650
entity_parser->Parse_WF_Content();
651
if(!entity_parser->at_EOF())
652
Syntax_Error(NT_content);
653
// NOTE(review): explicit destructor call; storage is not released here.
entity_parser->~Parser_Interface<W>();
661
// Parse a general entity reference inside mixed content and expand it by
// parsing its replacement with Parse_MixedContent(elems) on a nested parser.
// External entities are parsed from their system literal; internal ones from
// their replacement text. After each nested parse the nested parser must be
// at EOF, otherwise a content syntax error is reported.
// NOTE(review): several branch and closing lines are not visible in this view.
template <class B, WorkingCharacterSet W>
662
void ParsingEngine<B, W>::Parse_EntityRef_inMixed(symbol_set_t elems) {
663
Advance(1); // skip "&"
664
int nameID = Parse_Name(); /* Name delimiter */
665
if (!AtChar<B::Base,';'>(cur())) {
666
Syntax_Error(NT_Reference);
670
Reference_action(GetCodeUnitPtr(text_or_markup_start), LengthFrom(text_or_markup_start));
672
// The following code will replace Reference_Action.
673
GEntity_info * this_info;
674
Parser_Interface<W> * entity_parser;
675
// Table value for the referenced name; used below as a 1-based index.
int entityID = Parser_Interface<W>::model_info->GlobalGEntityTable[nameID];
677
WF_Error(wfErr_wf_entdeclared);
679
this_info = Parser_Interface<W>::model_info->GEntityData[entityID-1];
680
if (this_info->is_external){
682
// External references are an error unless standalone is Standalone_no.
if (Parser_Interface<W>::entity_Info->standalone != Standalone_no)
683
WF_Error(wfErr_NoExternalRefs);
685
entity_parser = ParserFactory(this_info->systemLiteral, Parser_Interface<W>::model_info);
686
entity_parser->Parse_MixedContent(elems);
687
if(!entity_parser->at_EOF())
688
Syntax_Error(NT_content);
689
// NOTE(review): explicit destructor call on a parser obtained from
// ParserFactory (allocated with new); its storage is not released here.
entity_parser->~Parser_Interface<W>();
693
// The trailing ';' makes this an empty statement for the "simple" case.
if (this_info->is_simple == true);
694
// printf("Entity is %s\n",this_info->ReplacementText);
696
// printf("Not a simple text: %s\n",this_info->ReplacementText);
697
entity_parser = ParserFactory(this_info->ReplacementText, strlen(this_info->ReplacementText),Parser_Interface<W>::entity_Info, Parser_Interface<W>::model_info);
698
entity_parser->Parse_MixedContent(elems);
699
if(!entity_parser->at_EOF())
700
Syntax_Error(NT_content);
701
// NOTE(review): explicit destructor call; storage is not released here.
entity_parser->~Parser_Interface<W>();
709
// Parse a general entity reference inside ANY content and expand it by
// parsing its replacement with Parse_AnyContent() on a nested parser.
// External entities are parsed from their system literal; internal ones from
// their replacement text. After each nested parse the nested parser must be
// at EOF, otherwise a content syntax error is reported.
// NOTE(review): several branch and closing lines are not visible in this view.
template <class B, WorkingCharacterSet W>
710
void ParsingEngine<B, W>::Parse_EntityRef_inAnyContent() {
711
Advance(1); // skip "&"
712
int nameID = Parse_Name(); /* Name delimiter */
713
if (!AtChar<B::Base,';'>(cur())) {
714
Syntax_Error(NT_Reference);
718
Reference_action(GetCodeUnitPtr(text_or_markup_start), LengthFrom(text_or_markup_start));
720
// The following code will replace Reference_Action.
721
GEntity_info * this_info;
722
Parser_Interface<W> * entity_parser;
723
// Table value for the referenced name; used below as a 1-based index.
int entityID = Parser_Interface<W>::model_info->GlobalGEntityTable[nameID];
725
WF_Error(wfErr_wf_entdeclared);
727
this_info = Parser_Interface<W>::model_info->GEntityData[entityID-1];
728
if (this_info->is_external){
730
// External references are an error unless standalone is Standalone_no.
if (Parser_Interface<W>::entity_Info->standalone != Standalone_no)
731
WF_Error(wfErr_NoExternalRefs);
733
entity_parser = ParserFactory(this_info->systemLiteral, Parser_Interface<W>::model_info);
734
entity_parser->Parse_AnyContent();
735
if(!entity_parser->at_EOF())
736
Syntax_Error(NT_content);
737
// NOTE(review): explicit destructor call on a parser obtained from
// ParserFactory (allocated with new); its storage is not released here.
entity_parser->~Parser_Interface<W>();
741
// The trailing ';' makes this an empty statement for the "simple" case.
if (this_info->is_simple == true);
742
// printf("Entity is %s\n",this_info->ReplacementText);
744
// printf("Not a simple text: %s\n",this_info->ReplacementText);
745
entity_parser = ParserFactory(this_info->ReplacementText, strlen(this_info->ReplacementText),Parser_Interface<W>::entity_Info, Parser_Interface<W>::model_info);
746
entity_parser->Parse_AnyContent();
747
if(!entity_parser->at_EOF())
748
Syntax_Error(NT_content);
749
// NOTE(review): explicit destructor call; storage is not released here.
entity_parser->~Parser_Interface<W>();
757
// Parse a general entity reference during content-model validation and
// expand it by running Parse_ValidContent(cre, cur_state) on a nested parser,
// so the content-model state cur_state is threaded through the entity.
// External entities are parsed from their system literal; internal ones from
// their replacement text. After each nested parse the nested parser must be
// at EOF, otherwise a content syntax error is reported.
// NOTE(review): several branch and closing lines are not visible in this view.
template <class B, WorkingCharacterSet W>
758
void ParsingEngine<B, W>::Parse_ValidEntityRef(CM_RegExp * cre, int & cur_state) {
759
Advance(1); // skip "&"
760
int nameID = Parse_Name(); /* Name delimiter */
761
if (!AtChar<B::Base,';'>(cur())) {
762
Syntax_Error(NT_Reference);
766
Reference_action(GetCodeUnitPtr(text_or_markup_start), LengthFrom(text_or_markup_start));
768
// The following code will replace Reference_Action.
769
GEntity_info * this_info;
770
Parser_Interface<W> * entity_parser;
771
// Table value for the referenced name; used below as a 1-based index.
int entityID = Parser_Interface<W>::model_info->GlobalGEntityTable[nameID];
773
WF_Error(wfErr_wf_entdeclared);
775
this_info = Parser_Interface<W>::model_info->GEntityData[entityID-1];
776
if (this_info->is_external){
778
// External references are an error unless standalone is Standalone_no.
if (Parser_Interface<W>::entity_Info->standalone != Standalone_no)
779
WF_Error(wfErr_NoExternalRefs);
781
entity_parser = ParserFactory(this_info->systemLiteral, Parser_Interface<W>::model_info);
782
entity_parser->Parse_ValidContent(cre, cur_state);
783
if(!entity_parser->at_EOF())
784
Syntax_Error(NT_content);
785
// NOTE(review): explicit destructor call on a parser obtained from
// ParserFactory (allocated with new); its storage is not released here.
entity_parser->~Parser_Interface<W>();
789
// The trailing ';' makes this an empty statement for the "simple" case.
if (this_info->is_simple == true);
790
// printf("Entity is %s\n",this_info->ReplacementText);
792
// printf("Not a simple text: %s\n",this_info->ReplacementText);
793
entity_parser = ParserFactory(this_info->ReplacementText, strlen(this_info->ReplacementText),Parser_Interface<W>::entity_Info, Parser_Interface<W>::model_info);
794
entity_parser->Parse_ValidContent(cre, cur_state);
795
if(!entity_parser->at_EOF())
796
Syntax_Error(NT_content);
797
// NOTE(review): explicit destructor call; storage is not released here.
entity_parser->~Parser_Interface<W>();
805
// Parse a numeric character reference starting at "&#". Accumulates the
// value from hexadecimal digits (after 'x') or decimal digits, rejects values
// above 0x10FFFF and other disallowed code points, requires the closing ';'
// and finally calls Reference_action over the whole reference.
// NOTE(review): the declaration and initialisation of ch_val and the cursor
// advances inside the digit loops are not visible in this view.
template <class B, WorkingCharacterSet W>
806
void ParsingEngine<B, W>::Parse_CharRef() {
807
Advance(2); // skip "&#"
809
if (AtChar<B::Base,'x'>(cur())) {
811
while(at_HexDigit<B::Base>(cur())){
812
ch_val = HexVal<B::Base>(cur()[0]) + (ch_val<<4);
813
// Checked on every digit, so the accumulator is rejected as soon as it
// exceeds the largest allowed code point.
if (ch_val> 0x10FFFF )
814
WF_Error(wfErr_wf_Legalchar);
819
while(at_Digit<B::Base>(cur())){
820
ch_val = DigitVal<B::Base>(cur()[0]) + ch_val*10;
821
if (ch_val> 0x10FFFF )
822
WF_Error(wfErr_wf_Legalchar);
826
// Rejects 0, the range 0xD800-0xDFFF ((v | 0x7FF) == 0xDFFF) and the two
// values 0xFFFE/0xFFFF ((v | 1) == 0xFFFF).
if ((ch_val == 0x0) || ((ch_val | 0x7FF) == 0xDFFF)|| ((ch_val | 0x1) == 0xFFFF))
827
WF_Error(wfErr_wf_Legalchar);
828
// When the version is not XML_1_1, values below 0x20 are allowed only for
// tab (0x9), line feed (0xA) and carriage return (0xD).
else if (Parser_Interface<W>::entity_Info->version != XML_1_1)
829
if (((ch_val < 0x20) && (ch_val != 0x9) && (ch_val != 0xD) && (ch_val != 0xA)))
830
WF_Error(wfErr_wf_Legalchar);
832
if (!AtChar<B::Base,';'>(cur())) {
833
Syntax_Error(NT_CharRef);
837
Reference_action(GetCodeUnitPtr(text_or_markup_start), LengthFrom(text_or_markup_start));
841
// Parse a processing instruction whose opening "<?" is at the cursor.
// Reads the target name, reports it through PI_Target_action, requires
// whitespace unless the PI ends immediately, scans to "?>" and then calls
// PI_action over the whole markup item.
// NOTE(review): the declaration of nameID and the body of the scanning loop
// are not visible in this view.
template <class B, WorkingCharacterSet W>
842
void ParsingEngine<B, W>::Parse_PI (){
844
Advance(2); /* Skip "<?". */
845
int target_start = AbsPos();
846
if (at_XxMmLll<B::Base>(cur())) {
847
nameID = Parse_Name();
848
// A target that is exactly three units long after matching at_XxMmLll is
// rejected (presumably the reserved name "xml" in any letter case - confirm).
if (AbsPos() - target_start == 3) Syntax_Error(NT_PI);
850
else nameID = Parse_Name();
851
PI_Target_action(GetCodeUnitPtr(target_start), LengthFrom(target_start));
852
if (!at_PI_End<B::Base>(cur())) requireWS();
854
while (!at_PI_End<B::Base>(cur())) {
860
Advance(2); /* Skip "?>". */
861
PI_action(GetCodeUnitPtr(text_or_markup_start), LengthFrom(text_or_markup_start));
864
/* Parse a start or empty element tag. */
865
// Parse a start tag or empty-element tag, including its attributes.
// Reports the element name, then each attribute through Namespace_action
// (names matching at_xmlns) or AttributeValue_action, detects duplicate
// attributes within the tag, and finishes with StartTag_action or
// EmptyElement_action over the whole markup item.
// NOTE(review): loop structure, advances and several declarations
// (att_name_start, att_val_start) are not visible in this view; quoteCh and
// nameID are not used in the visible lines.
template <class B, WorkingCharacterSet W>
865
inline void ParsingEngine<B, W>::Parse_StartTag (){
869
int att_name_end, att_val_end;
870
unsigned char quoteCh;
872
int nameID = Parse_Name(); /* Name delimiter: WS, "/" or ">" */
873
// The element name starts one unit after the markup start (past the '<').
ElementName_action(GetCodeUnitPtr(text_or_markup_start+1), LengthFrom(text_or_markup_start+1));
874
/* The following test optimizes the most common case of a
875
start tag with no attributes. */
876
if (AtChar<B::Base,'>'>(cur())) {
878
StartTag_action(GetCodeUnitPtr(text_or_markup_start), LengthFrom(text_or_markup_start));
882
if (AtChar<B::Base,'>'>(cur())) {
884
StartTag_action(GetCodeUnitPtr(text_or_markup_start), LengthFrom(text_or_markup_start));
886
else if (at_EmptyElementDelim<B::Base>(cur())) {
888
EmptyElement_action(GetCodeUnitPtr(text_or_markup_start), LengthFrom(text_or_markup_start));
891
/* Must be an attribute-value pair or error. */
892
att_name_start = AbsPos();
893
int att_nameID = Parse_Name();
894
att_name_end = AbsPos();
896
int attID = Parser_Interface<W>::model_info->getOrInsertGlobalAttName(att_nameID);
897
// NOTE(review): grows the table by a single slot and compares a signed int
// with size(); this is only sufficient if attID can exceed the last index by
// at most one - confirm.
if (attID >= LastAttOccurrence.size()) LastAttOccurrence.push_back(0);
899
// An occurrence recorded after the start of this tag means the attribute
// has already been seen in the same tag.
if (LastAttOccurrence[attID] > text_or_markup_start) {
900
WF_Error(wfErr_uniqattspec); /* Duplicate attribute. */
904
LastAttOccurrence[attID] = att_name_start;
905
/* The following optimized tests handle the frequently occurring
906
case that there are no blanks on either side of the equals sign.
907
In many cases, the very first test handles 100% of actual
908
attribute-value pairs encountered. */
909
if (at_EqualsQuote<B::Base>(cur())) Advance(1);
912
if (!AtChar<B::Base,'='>(cur())) {
913
Syntax_Error(NT_STag);
918
if (!AtQuote<B::Base>(cur())) {
919
Syntax_Error(NT_STag);
923
// The value starts just after the opening quote ...
att_val_start = AbsPos()+1;
925
// ... and ends just before the position preceding the cursor
// (presumably the closing quote - the scan between is not visible).
att_val_end = AbsPos()-1;
926
// Re-examines the attribute name by offsetting cur() back to att_name_start.
if (at_xmlns<B::Base>(cur()+att_name_start-AbsPos())) {
927
Namespace_action(GetCodeUnitPtr(att_name_start), att_name_end - att_name_start,
928
GetCodeUnitPtr(att_val_start), att_val_end - att_val_start);
931
AttributeValue_action(GetCodeUnitPtr(att_name_start), att_name_end - att_name_start,
932
GetCodeUnitPtr(att_val_start), att_val_end - att_val_start);
934
/* Now check for end or repeat. Avoid whitespace scan if possible.*/
935
if (AtChar<B::Base,'>'>(cur())) {
937
StartTag_action(GetCodeUnitPtr(text_or_markup_start), LengthFrom(text_or_markup_start));
940
else if (at_EmptyElementDelim<B::Base>(cur())) {
942
EmptyElement_action(GetCodeUnitPtr(text_or_markup_start), LengthFrom(text_or_markup_start));
946
if (AtChar<B::Base,'>'>(cur())) {
948
StartTag_action(GetCodeUnitPtr(text_or_markup_start), LengthFrom(text_or_markup_start));
951
else if (at_EmptyElementDelim<B::Base>(cur())) {
953
EmptyElement_action(GetCodeUnitPtr(text_or_markup_start), LengthFrom(text_or_markup_start));
956
else if (AbsPos() == att_val_end + 1) {
957
/* No WS following att value */
958
Syntax_Error(NT_STag);
965
// Deliver pending text, if any: when the cursor is past text_or_markup_start,
// the span between them is passed to Text_action (with `more` forwarded) and
// the start marker is moved up to the cursor.
template <class B, WorkingCharacterSet W>
966
inline void ParsingEngine<B, W>::text_if_nonnull_action(bool more){
967
if (AbsPos() > text_or_markup_start) {
968
Text_action(GetCodeUnitPtr(text_or_markup_start), LengthFrom(text_or_markup_start), more);
969
text_or_markup_start = AbsPos();
973
// Parse an end tag and check that its name matches the open element.
// nameID is the name ID of the matching start tag; a different ID parsed
// here is a well-formedness error (wfErr_GIMatch). The tag must then close
// with '>'; the first test is the fast path and the second follows lines not
// visible in this view (presumably a whitespace scan).
// NOTE(review): the skip of the leading "</" is not visible in this view.
template <class B, WorkingCharacterSet W>
974
void ParsingEngine<B, W>::Parse_WF_EndTag(int nameID) {
976
int end_nameID = Parse_Name();
977
if(end_nameID != nameID)
978
WF_Error(wfErr_GIMatch);
979
if (AtChar<B::Base,'>'>(cur())) {
981
Parser_Interface<W>::EndTag_action(GetCodeUnitPtr(text_or_markup_start), LengthFrom(text_or_markup_start));
985
if (AtChar<B::Base,'>'>(cur())) {
987
Parser_Interface<W>::EndTag_action(GetCodeUnitPtr(text_or_markup_start), LengthFrom(text_or_markup_start));
989
else Syntax_Error(NT_ETag);
994
// void ParsingEngine<UTF8_Buffer, UTF_8>::Parse_WF_EndTag(int nameID) {
995
// Advance(2); /* Skip "</". */
997
// int name_start = AbsPos();
998
// // ScanTo(NameFollow);
999
// // int lgth = AbsPos()-name_start;
1001
// #if (not defined(OMISSION)) or ((OMISSION != END_TAG_MATCHING) and (OMISSION != NAME_LOOKUP))
1002
// char * start_elem_name = Parser_Interface<UTF_8>::model_info->symbol_table->Get_UTF8_name(nameID);
1003
// int lgth = Parser_Interface<UTF_8>::model_info->symbol_table->Get_UTF8_lgth(nameID);
1004
// char * end_elem_name = &((char *) x8data)[buffer_rel_pos];
1006
// #ifdef TEMPLATED_SIMD_LIB
1007
// BytePack byte_compare = simd<8>::eq(sisd_load_unaligned((BytePack *) end_elem_name),
1008
// sisd_load_unaligned((BytePack *) start_elem_name));
1010
// #ifndef TEMPLATED_SIMD_LIB
1011
// BytePack byte_compare = simd_eq_8(sisd_load_unaligned((BytePack *) end_elem_name),
1012
// sisd_load_unaligned((BytePack *) start_elem_name));
1015
// int expected_bits = ~(-1 << lgth);
1016
// if ((_mm_movemask_epi8(byte_compare) & expected_bits) != expected_bits) {
1017
// WF_Error(wfErr_GIMatch);
1021
// /* Must compare with bytes beyond the first 16. Set up to
1022
// compare 16 bytes at a time, with the first additional compare
1023
// overlapping with the first byte_compare. */
1024
// int pos = (lgth - 1) % PACKSIZE + 1;
1025
// #ifdef TEMPLATED_SIMD_LIB
1026
// byte_compare = simd_or(byte_compare, simd<8>::eq(sisd_load_unaligned((BytePack *) &end_elem_name[pos]),
1027
// sisd_load_unaligned((BytePack *) &start_elem_name[pos])));
1029
// #ifndef TEMPLATED_SIMD_LIB
1030
// byte_compare = simd_or(byte_compare, simd_eq_8(sisd_load_unaligned((BytePack *) &end_elem_name[pos]),
1031
// sisd_load_unaligned((BytePack *) &start_elem_name[pos])));
1034
// while (pos < lgth) {
1035
// if (_mm_movemask_epi8(byte_compare) != 0xFFFF) {
1036
// WF_Error(wfErr_GIMatch);
1038
// #ifdef TEMPLATED_SIMD_LIB
1039
// byte_compare = simd<8>::eq(sisd_load_unaligned((BytePack *) &end_elem_name[pos]),
1040
// sisd_load_unaligned((BytePack *) &start_elem_name[pos]));
1042
// #ifndef TEMPLATED_SIMD_LIB
1043
// byte_compare = simd_eq_8(sisd_load_unaligned((BytePack *) &end_elem_name[pos]),
1044
// sisd_load_unaligned((BytePack *) &start_elem_name[pos]));
1048
// if (_mm_movemask_epi8(byte_compare) != 0xFFFF) {
1049
// WF_Error(wfErr_GIMatch);
1055
// #if defined(OMISSION) and ((OMISSION == END_TAG_MATCHING) or (OMISSION == NAME_LOOKUP))
1056
// ScanTo(NameFollow);
1058
// // for(int i=0; i<lgth; i++) {
1059
// // if (start_elem_name[i] != end_elem_name[i])
1060
// // WF_Error(wfErr_GIMatch);
1062
// // if (start_elem_name[lgth] != '\0') WF_Error(wfErr_GIMatch);
1064
// if (AtChar<ASCII,'>'>(cur())) {
1066
// Parser_Interface<UTF_8>::EndTag_action(GetCodeUnitPtr(text_or_markup_start), LengthFrom(text_or_markup_start));
1070
// if (AtChar<ASCII,'>'>(cur())) {
1072
// Parser_Interface<UTF_8>::EndTag_action(GetCodeUnitPtr(text_or_markup_start), LengthFrom(text_or_markup_start));
1074
// else Syntax_Error(NT_ETag);
1079
// Parses a start tag or empty-element tag, including its attributes, and
// fires the matching action callbacks.
// is_emptyStartTag: set to true when the tag ends with the empty-element
// delimiter; not written on the other visible paths.
// Returns int (presumably the element nameID obtained from Parse_Name --
// the return statements are not visible here, TODO confirm).
/* Parse a valid start or empty element tag. */
1080
template <class B, WorkingCharacterSet W>
1081
int ParsingEngine<B, W>::Parse_WF_StartTag (bool& is_emptyStartTag){
1084
int att_name_end, att_val_end;
1085
unsigned char quoteCh;
1088
// Name lookup is compiled in unless OMISSION is defined as NAME_LOOKUP.
#if (not defined(OMISSION)) or (OMISSION != NAME_LOOKUP)
1089
int nameID = Parse_Name();
1091
#if (defined(OMISSION)) and (OMISSION == NAME_LOOKUP)
1095
// The element name is reported starting one code unit after text_or_markup_start.
ElementName_action(GetCodeUnitPtr(text_or_markup_start+1), LengthFrom(text_or_markup_start+1));
1096
/* The following test optimizes the most common case of a
1097
start tag with no attributes. */
1098
if (AtChar<B::Base,'>'>(cur())) {
1100
StartTag_action(GetCodeUnitPtr(text_or_markup_start), LengthFrom(text_or_markup_start));
1104
if (AtChar<B::Base,'>'>(cur())) {
1106
StartTag_action(GetCodeUnitPtr(text_or_markup_start), LengthFrom(text_or_markup_start));
1108
else if (at_EmptyElementDelim<B::Base>(cur())) {
1110
is_emptyStartTag = true;
1111
EmptyElement_action(GetCodeUnitPtr(text_or_markup_start), LengthFrom(text_or_markup_start));
1114
/* Must be an attribute-value pair or error. */
1115
att_name_start = AbsPos();
1116
#if (not defined(OMISSION)) or (OMISSION != NAME_LOOKUP)
1117
int att_nameID = Parse_Name();
1119
#if (defined(OMISSION)) and (OMISSION == NAME_LOOKUP)
1123
att_name_end = AbsPos();
1124
#if (not defined(OMISSION)) or ((OMISSION != ATTRIBUTE_UNIQUENESS) and (OMISSION != NAME_LOOKUP))
1125
// Duplicate detection: LastAttOccurrence[attID] records the position where
// this attribute name was last seen. A recorded position beyond
// text_or_markup_start means the name already occurred inside this tag.
int attID = Parser_Interface<W>::model_info->getOrInsertGlobalAttName(att_nameID);
1126
// NOTE(review): the vector grows by a single slot here; this is only enough
// if attID can exceed the current size by at most zero -- confirm.
if (attID >= LastAttOccurrence.size()) LastAttOccurrence.push_back(0);
1128
if (LastAttOccurrence[attID] > text_or_markup_start) {
1129
WF_Error(wfErr_uniqattspec); /* Duplicate attribute. */
1133
LastAttOccurrence[attID] = att_name_start;
1135
/* The following optimized tests handle the frequently occurring
1136
case that there are no blanks on either side of the equals sign.
1137
In many cases, the very first test handles 100% of actual
1138
attribute-value pairs encountered. */
1139
if (at_EqualsQuote<B::Base>(cur())) Advance(1);
1142
if (!AtChar<B::Base,'='>(cur())) {
1143
Syntax_Error(NT_STag);
1148
if (!AtQuote<B::Base>(cur())) {
1149
Syntax_Error(NT_STag);
1153
// The value starts one code unit past the current position, which the
// preceding check requires to be a quote.
att_val_start = AbsPos()+1;
1155
// One code unit before the current position (presumably the value has been
// scanned and the closing quote consumed by now -- TODO confirm).
att_val_end = AbsPos()-1;
1156
// The pointer is moved back from the current position to att_name_start so
// that at_xmlns inspects the attribute name.
if (at_xmlns<B::Base>(cur()+att_name_start-AbsPos())) {
1157
Namespace_action(GetCodeUnitPtr(att_name_start), att_name_end - att_name_start,
1158
GetCodeUnitPtr(att_val_start), att_val_end - att_val_start);
1161
AttributeValue_action(GetCodeUnitPtr(att_name_start), att_name_end - att_name_start,
1162
GetCodeUnitPtr(att_val_start), att_val_end - att_val_start);
1164
/* Now check for end or repeat. Avoid whitespace scan if possible.*/
1165
if (AtChar<B::Base,'>'>(cur())) {
1167
StartTag_action(GetCodeUnitPtr(text_or_markup_start), LengthFrom(text_or_markup_start));
1170
else if (at_EmptyElementDelim<B::Base>(cur())) {
1172
is_emptyStartTag = true;
1173
EmptyElement_action(GetCodeUnitPtr(text_or_markup_start), LengthFrom(text_or_markup_start));
1177
if (AtChar<B::Base,'>'>(cur())) {
1179
StartTag_action(GetCodeUnitPtr(text_or_markup_start), LengthFrom(text_or_markup_start));
1182
else if (at_EmptyElementDelim<B::Base>(cur())) {
1184
is_emptyStartTag = true;
1185
EmptyElement_action(GetCodeUnitPtr(text_or_markup_start), LengthFrom(text_or_markup_start));
1188
// Position is still directly behind the attribute value: no separator was
// consumed before what would be the next attribute.
else if (AbsPos() == att_val_end + 1) {
1189
/* No WS following att value */
1190
Syntax_Error(NT_STag);
1200
// Parses one element for well-formedness: the start tag first, then, when
// the start tag was not an empty-element tag, the matching end tag.
// NOTE(review): the step that handles the element content between the two
// tags is not visible here -- confirm against the full source.
template <class B, WorkingCharacterSet W>
1201
void ParsingEngine<B, W>::Parse_WF_Element() {
1202
bool is_emptyStartTag = false;
1203
int nameID = Parse_WF_StartTag(is_emptyStartTag);
1205
// Trace output (presumably guarded by a debug conditional -- TODO confirm).
printf("Parse_Element: nameID = %d, is_emptyStartTag=%i\n",nameID, is_emptyStartTag);
1207
if (!is_emptyStartTag) {
1209
// The end tag must carry the same name identifier as the start tag.
Parse_WF_EndTag(nameID);
1214
// Scans element content: skips character data up to the next markup start
// and dispatches on the kind of markup found at that position.
// In every visible branch pending text is flushed first through
// text_if_nonnull_action. The flag passed is true for character references,
// general entity references, CDATA section starts and CDATA end delimiters,
// and false for element tags, end tags, comments, processing instructions
// and end of file (presumably the flag tells the callback whether more text
// of the same run follows -- TODO confirm).
template <class B, WorkingCharacterSet W>
1215
void ParsingEngine<B, W>::Parse_WF_Content() {
1217
// Remember where the current run of text begins.
text_or_markup_start = AbsPos();
1218
ScanTextTo(MarkupStart); /* '<', '&', or ']' for ']]>' test */
1219
if (at_ElementTag_Start<B::Base>(cur())) {
1220
text_if_nonnull_action(false);
1223
else if (at_EndTag_Start<B::Base>(cur())) {
1224
text_if_nonnull_action(false);
1227
else if (at_Comment_Start<B::Base>(cur())) {
1228
text_if_nonnull_action(false);
1231
// Character references are tested before the plain ampersand test below.
else if (at_CharRef_Start<B::Base>(cur())) {
1232
text_if_nonnull_action(true);
1235
else if (AtChar<B::Base,'&'>(cur())) {
1236
text_if_nonnull_action(true);
1239
else if (at_CDATA_Start<B::Base>(cur())) {
1240
text_if_nonnull_action(true);
1243
else if (at_PI_Start<B::Base>(cur())) {
1244
text_if_nonnull_action(false);
1247
// A CDATA end delimiter met in character data leads to a syntax error.
else if (at_CDATA_End<B::Base>(cur())) {
1248
text_if_nonnull_action(true);
1250
Syntax_Error(NT_CharData);
1252
else if (at_EOF()) {
1253
text_if_nonnull_action(false);
1256
// An opening angle bracket that matched none of the markup tests above.
else if (AtChar<B::Base,'<'>(cur())) {
1257
Syntax_Error(NT_markupdecl);
1267
#ifndef MARKUP_PASS_CONTROL
1268
#ifndef MARKUP_SORTING
1269
// Top-level content parse. Fires DocumentStart_action, scans text up to each
// markup start and dispatches on the markup kind, then fires
// DocumentEnd_action.
// The dispatch mirrors Parse_WF_Content: pending text is flushed with the
// flag true for character references, entity references, CDATA starts and
// CDATA end delimiters, and false for the other visible cases.
// NOTE(review): the enclosing loop and the per-branch parse calls are not
// visible here -- confirm against the full source.
template <class B, WorkingCharacterSet W>
1270
void ParsingEngine<B, W>::ParseContent() {
1271
Parser_Interface<W>::DocumentStart_action();
1272
bool is_emptyStartTag = false;
1274
text_or_markup_start = AbsPos();
1275
ScanTextTo(MarkupStart); /* '<', '&', or ']' for ']]>' test */
1276
/* if (AtChar<B::Base,'<'>(cur())) {
1277
text_if_nonnull_action();
1278
Parse_Markup<B, W>();
1280
if (at_ElementTag_Start<B::Base>(cur())) {
1281
text_if_nonnull_action(false);
1284
else if (at_EndTag_Start<B::Base>(cur())) {
1285
text_if_nonnull_action(false);
1288
else if (at_Comment_Start<B::Base>(cur())) {
1289
text_if_nonnull_action(false);
1292
else if (at_CharRef_Start<B::Base>(cur())) {
1293
text_if_nonnull_action(true);
1296
else if (AtChar<B::Base,'&'>(cur())) {
1297
text_if_nonnull_action(true);
1300
else if (at_CDATA_Start<B::Base>(cur())) {
1301
text_if_nonnull_action(true);
1304
else if (at_PI_Start<B::Base>(cur())) {
1305
text_if_nonnull_action(false);
1308
else if (at_CDATA_End<B::Base>(cur())) {
1309
text_if_nonnull_action(true);
1311
Syntax_Error(NT_CharData);
1313
else if (at_EOF()) {
1314
text_if_nonnull_action(false);
1317
else if (AtChar<B::Base,'<'>(cur())) {
1318
Syntax_Error(NT_markupdecl);
1325
Parser_Interface<W>::DocumentEnd_action();
1330
// Parses a document type declaration.
// Visible steps: read the doctype name, handle an optional external
// identifier (SYSTEM or PUBLIC) by parsing the external subset with a second
// parser that shares this model_info, test for the bracketed internal
// subset, and on the closing delimiter build a content model made of a
// sequence holding only the doctype name, stored as model_info->rootModel.
// Raises Syntax_Error(NT_doctypedecl) on malformed input.
template <class B, WorkingCharacterSet W>
1331
void ParsingEngine<B, W>::Parse_DocType (){
1333
int old_abspos, start_pos;
1335
start_pos = AbsPos();
1337
if (at_DOCTYPE_start<B::Base>(cur()))
1340
// printf("No Document definition!\n");
1344
int nameID = Parse_Name();
1346
old_abspos = AbsPos();
1348
if(at_SYSTEM<B::Base>(cur())||at_PUBLIC<B::Base>(cur())){
1349
Parser_Interface<W>::model_info->has_external_DTD = true;
1350
// Position unchanged since old_abspos was taken (presumably a whitespace
// scan lies in between, so this means the separator is missing -- TODO confirm).
if(old_abspos == AbsPos())
1351
Syntax_Error(NT_doctypedecl);
1352
Parse_ExternalID(Parser_Interface<W>::model_info->external_DTD_systemLiteral, Parser_Interface<W>::model_info->external_DTD_pubidLiteral);
1353
Parser_Interface<W> * entity_parser;
1354
// The external subset is parsed by a separate parser built on the system
// literal; it receives the same model_info so declarations land in one model.
entity_parser = ParserFactory(Parser_Interface<W>::model_info->external_DTD_systemLiteral, Parser_Interface<W>::model_info);
1355
entity_parser->Parse_ExtSubsetDecl();
1356
// NOTE(review): explicit destructor call. The factory overload visible at
// the top of the file returns objects created with new, so this would not
// release the storage -- confirm whether delete is intended.
entity_parser->~Parser_Interface<W>();
1358
else Parser_Interface<W>::model_info->has_external_DTD = false;
1361
if (AtChar<B::Base,'['>(cur())){
1364
if (AtChar<B::Base,']'>(cur()))
1367
Syntax_Error(NT_doctypedecl);
1371
if (AtChar<B::Base,'>'>(cur())){
1374
// Root content model: a sequence whose only member is the doctype name.
CRE_Seq * rslt = new CRE_Seq();
1375
rslt->subCMs.push_back(new CRE_Name(nameID));
1376
CM_RegExp * cre = new CM_RegExp();
1377
cre->content_re = rslt;
1379
// Set_IDs(0) returns the count used to size the table: id_count+1 rows.
int id_count = cre->content_re->Set_IDs(0);
1380
cre->content_re->Set_First_Map();
1381
symbol_set_t * transition_map = new symbol_set_t[id_count+1];
1382
cre->content_re->follow_map[0] = id_count+1;
1384
cre->content_re->Set_Follow_Map(transition_map);
1385
// Row 0 is taken from the first map of the expression.
transition_map[0] = cre->content_re->first_map;
1386
// When the expression matches empty, entry [0][0] is set to id_count+1.
if (cre->content_re->matches_empty)
1387
transition_map[0][0]=id_count+1;
1389
cre -> transition_map = transition_map;
1391
Parser_Interface<W>::model_info->rootModel = cre;
1393
/* Check for notations that were used, but not defined by the end of the DTD. */
1394
#if (VALIDATION_MODE == ON)
1395
hash_map<int, int >::iterator j;
1396
for (j=Parser_Interface<W>::model_info->GlobalNotationTable.begin(); j!=Parser_Interface<W>::model_info->GlobalNotationTable.end(); j++) {
1397
// A table value of -1 marks a notation that was referenced but never declared.
if (j->second == -1)
1398
Validity_Error(vErr_notatn);
1403
Syntax_Error(NT_doctypedecl);
1406
// Parses an external identifier introduced by SYSTEM or PUBLIC.
// systemLiteral, pubidLiteral: output references. Each receives either a
// copy made by copy_string or NULL:
//   SYSTEM form: pubidLiteral = NULL, systemLiteral = copy of the literal.
//   PUBLIC form: pubidLiteral = copy; systemLiteral = NULL, replaced by a
//   copy when a second quoted literal is present.
// Raises Syntax_Error(NT_ExternalID) on malformed input.
template <class B, WorkingCharacterSet W>
1407
void ParsingEngine<B, W>::Parse_ExternalID (char *& systemLiteral, char *& pubidLiteral){
1408
int quot_start, lgth;
1409
if(at_SYSTEM<B::Base>(cur())){
1411
pubidLiteral = NULL;
1413
if (!AtQuote<B::Base>(cur())) Syntax_Error(NT_ExternalID);
1414
// First code unit after the opening quote.
quot_start = AbsPos()+1;
1415
Parse_SystemLiteral (); /* SystemLiteral */
1416
// Length between the quotes (assumes the position is now just past the
// closing quote -- TODO confirm).
lgth = AbsPos() - quot_start - 1;
1417
systemLiteral = copy_string(GetCodeUnitPtr(quot_start),lgth);
1419
else if (at_PUBLIC<B::Base>(cur())){
1422
if (!AtQuote<B::Base>(cur())) Syntax_Error(NT_ExternalID);
1423
quot_start = AbsPos()+1;
1424
Parse_PubidLiteral ();/* PubidLiteral */
1425
lgth = AbsPos() - quot_start - 1;
1426
pubidLiteral = copy_string(GetCodeUnitPtr(quot_start),lgth);
1427
systemLiteral = NULL;
1428
// A PUBLIC identifier directly followed by the closing delimiter ends here
// with systemLiteral left as NULL.
if (AtChar<B::Base, '>'>(cur())) return;
1430
if (AtQuote<B::Base>(cur())) {
1431
quot_start = AbsPos()+1;
1432
Parse_SystemLiteral ();/* SystemLiteral */
1433
lgth = AbsPos() - quot_start - 1;
1434
systemLiteral = copy_string(GetCodeUnitPtr(quot_start),lgth);
1438
Syntax_Error(NT_ExternalID);
1441
// Parses a quoted system literal starting at the current position.
// The literal must open with a quote; the loop runs until the current code
// unit equals quoteCh (presumably the opening quote saved on a line not
// visible here -- TODO confirm).
// Raises Syntax_Error(NT_SystemLiteral) on malformed input.
template <class B, WorkingCharacterSet W>
1442
void ParsingEngine<B, W>::Parse_SystemLiteral (){
1443
unsigned char quoteCh;
1444
if(AtQuote<B::Base>(cur())){
1449
while (cur()[0] != quoteCh){
1451
Syntax_Error(NT_SystemLiteral);
1458
// Parses a quoted public identifier literal.
// The loop continues while the current code unit passes at_PubidChar and
// differs from quoteCh (presumably the opening quote saved on a line not
// visible here -- TODO confirm).
template <class B, WorkingCharacterSet W>
1459
void ParsingEngine<B, W>::Parse_PubidLiteral (){
1460
unsigned char quoteCh;
1463
while (at_PubidChar<B::Base>(cur()) && (cur()[0] != quoteCh)) {
1466
// The loop stopped on something other than the closing quote, so a code
// unit rejected by at_PubidChar was met.
if (cur()[0] != quoteCh){
1467
Syntax_Error(NT_PubidLiteral);
1472
// Parses the internal subset of a document type declaration by dispatching
// on what is found at the current position: parameter entity reference,
// processing instruction, comment, or a markup declaration keyword
// (ELEMENT, ATTLIST, ENTITY, NOTATION). A closing square bracket is also
// recognised. Raises Syntax_Error(NT_markupdecl) or
// Syntax_Error(NT_intSubset) for input matching none of these.
// NOTE(review): the loop structure and several branch bodies are not
// visible here -- confirm against the full source.
template <class B, WorkingCharacterSet W>
1473
void ParsingEngine<B, W>::Parse_IntSubset (){
1477
text_or_markup_start = AbsPos();
1478
if (AtChar<B::Base,'%'>(cur()))
1479
Parse_PEReference();
1480
else if (at_PI_Start<B::Base>(cur())) {
1483
else if (at_Comment_Start<B::Base>(cur())) {
1486
// Remaining markup: an opening angle bracket, then an exclamation mark,
// then one of the declaration keywords.
else if (AtChar<B::Base,'<'>(cur())){
1488
if(AtChar<B::Base,'!'>(cur())){
1490
if (at_ELEMENT<B::Base>(cur()))
1491
Parse_Elementdecl();
1492
else if (at_ATTLIST<B::Base>(cur()))
1493
Parse_AttlistDecl();
1494
else if (at_ENTITY<B::Base>(cur()))
1496
else if (at_NOTATION<B::Base>(cur()))
1497
Parse_Notationdecl();
1499
Syntax_Error(NT_markupdecl);
1503
Syntax_Error(NT_markupdecl);
1505
else if (AtChar<B::Base,']'>(cur())){
1509
Syntax_Error(NT_intSubset);
1514
// Parses a parameter entity reference: percent sign, name, semicolon.
// The implementation announces itself as incomplete on stderr.
// After the semicolon test the reference is reported through
// PEReference_action and the entity is looked up in GlobalPEntityTable.
// For an external entity a second parser is created on its system literal
// and its content is parsed.
// Raises WF_Error(wfErr_wf_entdeclared) (presumably when the lookup finds
// no declaration -- TODO confirm) and Syntax_Error(NT_PEReference).
template <class B, WorkingCharacterSet W>
1515
void ParsingEngine<B, W>::Parse_PEReference (){
1517
Advance(1); /* Skip "%". */
1518
fprintf(stderr,"Parameter Reference has not been completed yet.\n");
1520
int nameID = Parse_Name();
1521
if (AtChar<B::Base,';'>(cur())) {
1523
PEReference_action(GetCodeUnitPtr(text_or_markup_start), LengthFrom(text_or_markup_start));
1524
PEntity_info * this_info;
1525
Parser_Interface<W> * entity_parser;
1526
int entityID = Parser_Interface<W>::model_info->GlobalPEntityTable[nameID];
1528
WF_Error(wfErr_wf_entdeclared);
1530
// Entity identifiers are one-based; PEntityData is indexed from zero.
this_info = Parser_Interface<W>::model_info->PEntityData[entityID-1];
1531
if (this_info->is_external){
1533
// if (Parser_Interface<W>::entity_Info->standalone != Standalone_no)
1534
// WF_Error(wfErr_NoExternalRefs);
1536
entity_parser = ParserFactory(this_info->systemLiteral, Parser_Interface<W>::model_info);
1537
entity_parser->Parse_WF_Content();
1538
// The external entity must be consumed completely.
if(!entity_parser->at_EOF())
1539
Syntax_Error(NT_content);
1540
// NOTE(review): explicit destructor call; the storage obtained from
// ParserFactory is not released here -- confirm whether delete is intended.
entity_parser->~Parser_Interface<W>();
1548
Syntax_Error(NT_PEReference);
1552
// Parses an element type declaration and stores the resulting content model
// in model_info->ContentModelData under the element name identifier.
// Visible content specifications: EMPTY (CM_Empty), ANY, mixed content
// introduced by PCDATA (Parse_RemainingMixed), and element content
// (Parse_RemainingChildren wrapped in a CM_RegExp with a transition map).
// Raises Syntax_Error(NT_elementdecl) on malformed input.
template <class B, WorkingCharacterSet W>
1553
void ParsingEngine<B, W>::Parse_Elementdecl (){
1555
// NOTE(review): 7 equals the length of the keyword ELEMENT alone, while the
// inline comment names the full opening delimiter; presumably the first two
// code units were consumed by the caller -- confirm.
Advance(7); /* Skip "<!ELEMENT". */
1558
int nameID = Parse_Name();
1559
int elemID = Parser_Interface<W>::model_info->getOrInsertGlobalElement(nameID);
1563
/* Start parsing "contentspec"*/
1564
if (at_EMPTY<B::Base>(cur())) {
1566
cm = new CM_Empty();
1567
Parser_Interface<W>::model_info->ContentModelData[nameID] = cm;
1569
else if (at_ANY<B::Base>(cur())) {
1572
Parser_Interface<W>::model_info->ContentModelData[nameID] = cm;
1575
if (AtChar<B::Base,'('>(cur()))
1578
if (at_PCDATA<B::Base>(cur())){
1579
cm = Parse_RemainingMixed();
1580
Parser_Interface<W>::model_info->ContentModelData[nameID] = cm;
1584
// Element content: build the regular expression model and its transition
// map, using the same steps as the root model in Parse_DocType.
CM_RegExp * cre = new CM_RegExp;
1585
cre->content_re = Parse_RemainingChildren();
1587
int id_count = cre->content_re->Set_IDs(0);
1588
cre->content_re->Set_First_Map();
1589
symbol_set_t * transition_map = new symbol_set_t[id_count+1];
1590
cre->content_re->follow_map[0] = id_count+1;
1592
cre->content_re->Set_Follow_Map(transition_map);
1593
transition_map[0] = cre->content_re->first_map;
1595
// When the expression matches empty, entry [0][0] is set to id_count+1.
if (cre->content_re->matches_empty)
1596
transition_map[0][0]=id_count+1;
1598
cre -> transition_map = transition_map;
1600
Parser_Interface<W>::model_info->ContentModelData[nameID] = cre;
1606
if (AtChar<B::Base,'>'>(cur())) {
1610
Syntax_Error(NT_elementdecl);
1612
// Parses the remainder of a mixed content declaration, starting at the
// PCDATA keyword, and returns a content model (a CM_Mixed is allocated
// here; the return statements are not visible).
// Each name found after a vertical bar is entered in r->elements with a
// running counter value.
// Raises Syntax_Error(NT_Mixed) on malformed input.
template <class B, WorkingCharacterSet W>
1613
ContentModel * ParsingEngine<B, W>::Parse_RemainingMixed (){
1614
CM_Mixed * r = new CM_Mixed();
1615
Advance(7); /* Skip "#PCDATA". */
1617
if (AtChar<B::Base,')'>(cur())){
1618
if (AtChar<B::Base,'*'>(cur())) {
1628
// One alternative per vertical bar.
while (AtChar<B::Base,'|'>(cur())){
1631
int nameID = Parse_Name();
1632
// k is incremented before use (k is declared on a line not visible here).
r->elements[nameID] = ++k;
1635
// Two code units are consumed when at_Para_star matches.
if (at_Para_star<B::Base>(cur())) Advance(2);
1637
Syntax_Error(NT_Mixed);
1645
// Parses the remainder of a parenthesised element content group and returns
// its content expression.
// After the first content particle the separator decides the kind of group:
// a vertical bar starts a CRE_Choice, a comma starts a CRE_Seq, and a
// closing parenthesis is tested for as a third case. Members are appended
// to subCMs until the closing parenthesis.
// A trailing occurrence indicator is then tested; the star and plus cases
// visibly wrap the result in CRE_Star and CRE_Plus.
// Raises Syntax_Error(NT_children) on malformed input.
template <class B, WorkingCharacterSet W>
1646
Content_RE * ParsingEngine<B, W>::Parse_RemainingChildren (){
1647
Content_RE * c1 = Parse_Cp();
1648
Content_RE * r = c1;
1650
if(AtChar<B::Base,'|'>(cur())){
1651
CRE_Choice * rslt = new CRE_Choice;
1652
rslt->subCMs.push_back(c1);
1655
rslt->subCMs.push_back(Parse_Cp());
1657
while(!AtChar<B::Base,')'>(cur())){
1658
if(AtChar<B::Base,'|'>(cur()))
1661
Syntax_Error(NT_children);
1663
rslt->subCMs.push_back(Parse_Cp());
1670
else if(AtChar<B::Base,','>(cur())){
1671
CRE_Seq * rslt = new CRE_Seq;
1672
rslt->subCMs.push_back(c1);
1675
rslt->subCMs.push_back(Parse_Cp());
1677
while(!AtChar<B::Base,')'>(cur())){
1678
if(AtChar<B::Base,','>(cur()))
1681
Syntax_Error(NT_children);
1683
rslt->subCMs.push_back(Parse_Cp());
1690
else if(AtChar<B::Base,')'>(cur())){
1694
Syntax_Error(NT_children);
1696
// Occurrence indicator following the group.
if (AtChar<B::Base,'?'>(cur())) {
1700
else if (AtChar<B::Base,'*'>(cur())) {
1702
r = new CRE_Star(r);
1704
else if (AtChar<B::Base,'+'>(cur())) {
1706
r = new CRE_Plus(r);
1712
// Parses one content particle and returns its content expression.
// An opening parenthesis leads to a nested Parse_RemainingChildren call;
// the other visible path reads a name into a CRE_Name. A following
// occurrence indicator wraps the name in CRE_Opt, CRE_Star or CRE_Plus.
template <class B, WorkingCharacterSet W>
1713
Content_RE * ParsingEngine<B, W>::Parse_Cp (){
1714
if (AtChar<B::Base,'('>(cur())){
1717
// NOTE(review): the result of the nested call is not used on this line --
// confirm that the group expression is returned to the caller.
Parse_RemainingChildren();
1720
int nameID = Parse_Name();
1721
CRE_Name * r = new CRE_Name(nameID);
1723
if (AtChar<B::Base,'?'>(cur())) {
1725
return new CRE_Opt(r);
1727
else if (AtChar<B::Base,'*'>(cur())) {
1729
return new CRE_Star(r);
1731
else if (AtChar<B::Base,'+'>(cur())) {
1733
return new CRE_Plus(r);
1739
// Parses an attribute list declaration.
// Reads the element name, then loops until the closing delimiter. Each
// round reads an attribute name, registers it globally, allocates an
// ATT_info, sets attType from the type keyword (or from a notation or
// enumeration list), parses the default declaration, and appends the record
// to model_info->ElementAttributeData for the element.
// Raises Syntax_Error(NT_AttlistDecl) on malformed input.
template <class B, WorkingCharacterSet W>
1740
void ParsingEngine<B, W>::Parse_AttlistDecl (){
1750
Advance(7); /* Skip "ATTLIST. */
1753
int nameID = Parse_Name();
1754
elemID = Parser_Interface<W>::model_info->getOrInsertGlobalElement(nameID);
1756
old_abspos = AbsPos();
1758
while(!AtChar<B::Base,'>'>(cur())) {
1759
// Position unchanged since old_abspos was taken (presumably a whitespace
// scan lies in between -- TODO confirm).
if(old_abspos == AbsPos())
1760
Syntax_Error(NT_AttlistDecl);
1762
int att_nameID = Parse_Name();
1764
attID = Parser_Interface<W>::model_info->getOrInsertGlobalAttName(att_nameID);
1765
// Keep LastAttOccurrence large enough to be indexed by the new attID.
if (attID >= LastAttOccurrence.size()) LastAttOccurrence.push_back(0);
1766
ATT_info * this_info = new ATT_info;
1767
this_info->globalATT_id = attID;
1769
// Attribute type keywords.
// NOTE(review): at_ID is tested before at_IDREFS and at_IDREF; confirm that
// at_ID cannot match the leading two letters of those keywords.
if (at_CDATA<B::Base>(cur())){
1771
this_info->attType = CDATA_att;
1773
else if(at_ID<B::Base>(cur())){
1775
this_info->attType = ID_att;
1777
/* Make sure to check IDREFS before IDREF*/
1778
else if(at_IDREFS<B::Base>(cur())){
1780
this_info->attType = IDREFS_att;
1782
else if(at_IDREF<B::Base>(cur())){
1784
this_info->attType = IDREF_att;
1786
else if(at_ENTITY<B::Base>(cur())){
1788
this_info->attType = ENTITY_att;
1790
else if(at_ENTITIES<B::Base>(cur())){
1792
this_info->attType = ENTITIES_att;
1794
/* Make sure to check NMTOKENS before NMTOKEN*/
1795
else if(at_NMTOKENS<B::Base>(cur())){
1797
this_info->attType = NMTOKENS_att;
1799
else if(at_NMTOKEN<B::Base>(cur())){
1801
this_info->attType = NMTOKEN_att;
1803
else if(at_NOTATION<B::Base>(cur())){ /* NotationType = 'NOTATION' S Enumeration
1804
when Nmtoken = Name */
1807
Parse_Notation(this_info);
1808
this_info->attType = NOTATION_att;
1810
// A bare opening parenthesis starts an enumerated type.
else if(AtChar<B::Base,'('>(cur())){
1811
Parse_Enumeration(this_info);
1812
this_info->attType = enumeration_att;
1815
Syntax_Error(NT_AttlistDecl);
1817
Parse_DefaultDecl(this_info);
1820
Parser_Interface<W>::model_info->ElementAttributeData[elemID].push_back(this_info);
1826
// Parses the parenthesised notation name list of a NOTATION attribute type.
// this_info: attribute record being built (not written by the lines visible
// here).
// Every notation name whose GlobalNotationTable entry is 0 is marked with
// -1. Note that reading the table with the index operator may itself create
// a zero entry for an unseen name, depending on the table type.
// Raises Syntax_Error(NT_NotationType) on malformed input.
template <class B, WorkingCharacterSet W>
1827
void ParsingEngine<B, W>::Parse_Notation (ATT_info * this_info){
1829
if(AtChar<B::Base,'('>(cur()))
1832
Syntax_Error(NT_NotationType);
1835
int notn_nameID = Parse_Name();
1837
/*Notation name is not in the global table!*/
1838
// -1 is the marker that Parse_Notationdecl documents as used but not yet defined.
if(Parser_Interface<W>::model_info->GlobalNotationTable[notn_nameID]==0)
1839
Parser_Interface<W>::model_info->GlobalNotationTable[notn_nameID] = -1;
1842
// Further names, each introduced by a vertical bar.
while(AtChar<B::Base,'|'>(cur())){
1845
notn_nameID = Parse_Name();
1847
if(Parser_Interface<W>::model_info->GlobalNotationTable[notn_nameID]==0)
1848
// Validity_Error(vErr_notatn);
1849
Parser_Interface<W>::model_info->GlobalNotationTable[notn_nameID] = -1;
1853
if (AtChar<B::Base,')'>(cur()))
1856
Syntax_Error(NT_NotationType);
1859
// Parses a parenthesised enumeration of name tokens for an attribute type.
// this_info: attribute record; each token identifier is entered in
// this_info->enumValues with a running counter value.
// For tokens after the first, the existing enumValues entry is read before
// assignment; a duplicate leads to Validity_Error(vErr_NoDuplicateTokens)
// when StrictWellFormedness is false.
// Raises Syntax_Error(NT_Enumeration) on malformed input.
template <class B, WorkingCharacterSet W>
1860
void ParsingEngine<B, W>::Parse_Enumeration (ATT_info * this_info){
1863
if(AtChar<B::Base,'('>(cur()))
1866
Syntax_Error(NT_Enumeration);
1869
int nmtokenID = Parse_Nmtoken();
1871
// enumCount is incremented before use (declared on a line not visible here).
this_info->enumValues[nmtokenID]=++(enumCount);
1874
while(AtChar<B::Base,'|'>(cur())){
1877
int nmtokenID = Parse_Nmtoken();
1879
// Existing value for this token (presumably 0 when it has not been seen --
// TODO confirm the test on the line not visible here).
int enumID = this_info->enumValues[nmtokenID];
1881
this_info->enumValues[nmtokenID]=++(enumCount);
1884
else if(!StrictWellFormedness){
1885
Validity_Error(vErr_NoDuplicateTokens);
1889
if (AtChar<B::Base,')'>(cur()))
1892
Syntax_Error(NT_Enumeration);
1895
// Parses the default declaration of an attribute definition.
// this_info: attribute record; defaultKind is set to REQUIRED_att,
// IMPLIED_att, FIXED_att or DEFAULT_att. For a quoted default value,
// defaultValueLgth and a newly allocated, zero-terminated copy in
// defaultValue are stored as well.
// Raises Syntax_Error(NT_DefaultDecl) on malformed input.
template <class B, WorkingCharacterSet W>
1896
void ParsingEngine<B, W>::Parse_DefaultDecl (ATT_info * this_info){
1897
if(at_REQUIRED<B::Base>(cur())){
1899
this_info->defaultKind = REQUIRED_att;
1901
else if(at_IMPLIED<B::Base>(cur())){
1903
this_info->defaultKind = IMPLIED_att;
1906
if(at_FIXED<B::Base>(cur())){
1909
this_info->defaultKind = FIXED_att;
1911
else this_info->defaultKind = DEFAULT_att;
1912
if(AtQuote<B::Base>(cur())){
1913
// First code unit after the opening quote.
int quot_start = AbsPos()+1;
1915
/* need to normalize */
1916
// Length between the quotes (assumes the value has been parsed and the
// position is just past the closing quote -- TODO confirm).
this_info->defaultValueLgth = AbsPos() - quot_start - 1;
1918
// One extra code unit for the terminating zero written below.
this_info->defaultValue = new unsigned char[this_info->defaultValueLgth+1];
1919
memcpy(this_info->defaultValue, GetCodeUnitPtr(quot_start),this_info->defaultValueLgth);
1920
this_info->defaultValue[this_info->defaultValueLgth] = '\0';
1923
Syntax_Error(NT_DefaultDecl);
1927
// Parses an entity declaration, either for a parameter entity (introduced
// by a percent sign) or for a general entity.
// In both forms the name is looked up in the global entity table, a new
// one-based identifier is assigned by incrementing the global counter, and
// a PEntity_info or GEntity_info record is filled and appended to
// PEntityData or GEntityData. A quoted value marks the entity as internal;
// otherwise an external identifier with a system literal is required.
// For general entities an NDATA name is copied into NDataName.
// A warning is printed for an already existing definition.
// Raises Syntax_Error(NT_EntityDecl) on malformed input.
template <class B, WorkingCharacterSet W>
1928
void ParsingEngine<B, W>::Parse_Entitydecl (){
1936
Advance(6); /* Skip "ENTITY. */
1939
// Parameter entity form.
if (AtChar<B::Base,'%'>(cur())){
1943
int nameID = Parse_Name();
1944
PEntity_info * this_info = new PEntity_info;
1945
int entityID = Parser_Interface<W>::model_info->GlobalPEntityTable[nameID];
1947
// Assign the next parameter entity identifier to this name.
Parser_Interface<W>::model_info->GlobalPEntityTable[nameID]=++(Parser_Interface<W>::model_info->globalPEntityCount);
1948
entityID = Parser_Interface<W>::model_info->globalPEntityCount;
1949
this_info->globalPEntity_id = entityID;
1952
printf("Warning: Entity definition already exist!\n");
1955
if(AtQuote<B::Base>(cur())){
1956
Parse_PEntityValue(this_info);
1957
this_info->is_external = false;
1960
Parse_ExternalID(this_info->systemLiteral, this_info->pubidLiteral);
1961
this_info->is_external = true;
1962
// An external entity must carry a system literal.
if (this_info->systemLiteral == NULL) Syntax_Error(NT_EntityDecl);
1964
Parser_Interface<W>::model_info->PEntityData.push_back(this_info);
1967
// General entity form.
int nameID = Parse_Name();
1969
GEntity_info * this_info = new GEntity_info();
1970
int entityID = Parser_Interface<W>::model_info->GlobalGEntityTable[nameID];
1972
Parser_Interface<W>::model_info->GlobalGEntityTable[nameID]=++(Parser_Interface<W>::model_info->globalGEntityCount);
1973
entityID = Parser_Interface<W>::model_info->globalGEntityCount;
1974
this_info->globalGEntity_id = entityID;
1977
printf("Warning: Entity definition already exists!\n");
1981
if(AtQuote<B::Base>(cur())){
1982
Parse_GEntityValue(this_info);
1983
this_info->is_external = false;
1986
Parse_ExternalID(this_info->systemLiteral, this_info->pubidLiteral);
1987
this_info->is_external = true;
1988
if (this_info->systemLiteral == NULL) Syntax_Error(NT_EntityDecl);
1989
old_abspos = AbsPos();
1991
if(at_NDATA<B::Base>(cur())){
1992
// Position unchanged since old_abspos was taken (presumably a whitespace
// scan lies in between -- TODO confirm).
if(old_abspos == AbsPos())
1993
Syntax_Error(NT_EntityDecl);
1997
name_start = AbsPos();
1998
int nameID = Parse_Name();
1999
// The notation name is stored as text, copied from the input buffer.
lgth = AbsPos() - name_start;
2000
this_info->NDataName = copy_string(GetCodeUnitPtr(name_start),lgth);
2003
Parser_Interface<W>::model_info->GEntityData.push_back(this_info);
2006
if (AtChar<B::Base,'>'>(cur())){
2010
Syntax_Error(NT_EntityDecl);
2013
// Parses a notation declaration.
// The notation name is looked up in GlobalNotationTable. A value of 0 or -1
// leads to assignment of a new identifier from globalNotationCount; a
// positive value means the name was declared before and raises
// Validity_Error(vErr_NoDuplicateTokens). The external identifier is parsed
// into a newly allocated Notation_info.
// Raises Syntax_Error(NT_NotationDecl) on malformed input.
template <class B, WorkingCharacterSet W>
2014
void ParsingEngine<B, W>::Parse_Notationdecl (){
2017
Advance(8); /* Skip "NOTATION. */
2020
int nameID = Parse_Name();
2022
int notationID = Parser_Interface<W>::model_info->GlobalNotationTable[nameID];
2023
/* notationID == -1: used but not yet defined; == 0: new, > 0 prev. defined */
2024
if(notationID <= 0){
2025
Parser_Interface<W>::model_info->GlobalNotationTable[nameID]=++(Parser_Interface<W>::model_info->globalNotationCount);
2026
notationID = Parser_Interface<W>::model_info->globalNotationCount;
2028
else /*Duplicate notation name!*/
2029
Validity_Error(vErr_NoDuplicateTokens);
2030
Notation_info * this_info = new Notation_info;
2032
// NOTE(review): no visible line stores this_info in the model after the
// identifier is parsed -- confirm against the full source.
Parse_ExternalID(this_info->systemLiteral, this_info->pubidLiteral);
2034
if (AtChar<B::Base,'>'>(cur())) {
2038
Syntax_Error(NT_NotationDecl);
2041
// Checks that the position moved since it was recorded on entry (presumably
// across a whitespace scan on a line not visible here, with an error raised
// when nothing was consumed -- TODO confirm).
template <class B, WorkingCharacterSet W>
2042
void ParsingEngine<B, W>::requireWS(){
2044
int old_abspos = AbsPos();
2046
if(old_abspos == AbsPos())
2050
// Parses a quoted attribute value starting at the opening quote.
// The opening quote is remembered and skipped; the loop then runs until the
// same quote character is met again. Inside the loop, character references,
// general entity references and the other kind of quote are recognised;
// the final else raises WF_Error(wfErr_CleanAttrVals).
template <class B, WorkingCharacterSet W>
2051
void ParsingEngine<B, W>::Parse_AttValue(){
2053
int quoteCh = cur()[0];
2054
Advance(1); /* Skip " or ' */
2057
while (cur()[0] != quoteCh){
2058
if (at_CharRef_Start<B::Base>(cur())){
2062
else if (AtChar<B::Base,'&'>(cur())){
2066
// A quote met inside the loop cannot be quoteCh, so it is the other quote
// kind appearing within the value.
else if (AtQuote<B::Base>(cur())) {
2070
else /* if (AtChar<B::Base,'<'>(cur())) */
2071
WF_Error(wfErr_CleanAttrVals);
2076
// Parses the quoted value of a general entity declaration and builds its
// replacement text.
// this_info: entity record; is_simple starts as true and is set to false on
// the path marked as an opening angle bracket or ampersand being found;
// ReplacementText receives the assembled text.
// The text is assembled piecewise: literal runs are copied from the input
// buffer and joined with cat_string, character references are appended via
// Replace_CharRef.
// Raises Syntax_Error(NT_EntityValue) at end of file inside the value.
template <class B, WorkingCharacterSet W>
2077
void ParsingEngine<B, W>::Parse_GEntityValue(GEntity_info * this_info){
2079
int quoteCh = cur()[0];
2080
Advance(1); /* Skip " or ' */
2081
this_info->is_simple = true;
2082
int quot_start = AbsPos();
2085
// Initial literal run, from quot_start up to the current position.
replText = copy_string(GetCodeUnitPtr(quot_start),AbsPos()-quot_start);
2086
while (cur()[0] != quoteCh){
2087
if (at_CharRef_Start<B::Base>(cur())){
2088
// NOTE(review): replText was allocated by copy_string or cat_string with
// room for its current contents plus the terminator only, so appending in
// place with strcat writes past the allocation unless the appended string
// is empty -- confirm and consider cat_string here.
strcat (replText,Replace_CharRef());
2089
quot_start = AbsPos();
2092
else if (AtQuote<B::Base>(cur())) {
2093
quot_start = AbsPos();
2097
else if (at_EOF()) {
2098
Syntax_Error(NT_EntityValue);
2100
else { /* '<' or '&' found */
2101
quot_start = AbsPos();
2104
this_info->is_simple = false;
2106
// Append the literal run scanned since quot_start; cat_string returns a new
// buffer (the previous buffer is not released on this line).
replText = cat_string (replText,(char *)GetCodeUnitPtr(quot_start), strlen(replText), AbsPos()-quot_start);
2108
this_info->ReplacementText = replText;
2112
// Resolves a general entity reference inside an entity value and returns
// the stored replacement text of the referenced entity.
// is_simple: reference parameter (presumably cleared when the referenced
// entity is itself not simple -- the assignment is not visible here,
// TODO confirm).
// Raises Syntax_Error(NT_EntityValue) and WF_Error(wfErr_wf_entdeclared)
// (presumably for a missing semicolon and an undeclared entity --
// TODO confirm).
template <class B, WorkingCharacterSet W>
2113
char * ParsingEngine<B, W>::Replace_EntityRef(bool& is_simple){
2115
int nameID = Parse_Name();
2116
if (AtChar<B::Base,';'>(cur()))
2119
Syntax_Error(NT_EntityValue);
2120
int entityID = Parser_Interface<W>::model_info->GlobalGEntityTable[nameID];
2122
WF_Error(wfErr_wf_entdeclared);
2124
// Entity identifiers are one-based; GEntityData is indexed from zero.
if (Parser_Interface<W>::model_info->GEntityData[entityID-1]->is_simple == false)
2126
// The returned pointer refers to the text held by the entity record.
return Parser_Interface<W>::model_info->GEntityData[entityID-1]->ReplacementText;
2131
// Placeholder for parsing the value of a parameter entity declaration.
// The only visible action is a notice on stderr that the feature is not
// complete; this_info is not written by the line shown.
template <class B, WorkingCharacterSet W>
2132
void ParsingEngine<B, W>::Parse_PEntityValue(PEntity_info * this_info){
2133
fprintf(stderr,"parsing of parameter entity value has not been completed yet.\n");
2137
// Placeholder for replacing a character reference by its text.
// The only visible action is a notice on stderr that the feature is not
// complete. NOTE(review): the value returned is not visible here; callers
// pass it to strcat, so it must be a valid zero-terminated string -- confirm.
template <class B, WorkingCharacterSet W>
2138
char * ParsingEngine<B, W>::Replace_CharRef(){
2140
fprintf(stderr,"Replacement of Character Reference has not been completed yet.\n");
2144
// Parses the document prolog.
// Comments and processing instructions are handled in a loop that runs
// until the start of a document type declaration; Prolog_action is then
// called with the input span from the entry position. A second loop handles
// comments and processing instructions again and is followed by another
// Prolog_action call over the span from the same entry position.
// NOTE(review): the exit for documents without a document type declaration
// and the call that parses the declaration are not visible here -- confirm.
template <class B, WorkingCharacterSet W>
2145
void ParsingEngine<B, W>::Parse_Prolog(){
2147
// Start of the prolog span reported to Prolog_action.
int old_pos = AbsPos();
2148
while (!at_DOCTYPE_start<B::Base>(cur())) {
2149
text_or_markup_start = AbsPos();
2150
if (at_Comment_Start<B::Base>(cur()))
2152
else if (at_PI_Start<B::Base>(cur()))
2155
Prolog_action(GetCodeUnitPtr(old_pos), LengthFrom(old_pos));
2162
while(at_Comment_Start<B::Base>(cur()) || at_PI_Start<B::Base>(cur()) ){
2163
text_or_markup_start = AbsPos();
2164
if (at_Comment_Start<B::Base>(cur()))
2170
Prolog_action(GetCodeUnitPtr(old_pos), LengthFrom(old_pos));
2173
// Parses declarations of an external DTD subset.
// Handles conditional sections (INCLUDE parsed recursively, IGNORE skipped
// with a nesting depth counter), parameter entity references, processing
// instructions, comments and the four markup declaration keywords.
// Finishes by reporting the span from the entry position through
// ExtSubsetDecl_action.
// Raises Syntax_Error with the nonterminal of the construct that failed.
// NOTE(review): the loop structure and several branch bodies are not
// visible here -- confirm against the full source.
template <class B, WorkingCharacterSet W>
2174
void ParsingEngine<B, W>::Parse_ExtSubsetDecl() {
2176
int start_pos=AbsPos();
2178
if(at_condSect_start<B::Base>(cur())){
2181
if (at_INCLUDE<B::Base>(cur())){
2184
if(AtChar<B::Base,'['>(cur())){
2186
// The body of an included section is itself a sequence of declarations.
Parse_ExtSubsetDecl();
2187
// The section must be closed by the same delimiter that at_CDATA_End tests.
if(at_CDATA_End<B::Base>(cur()))
2189
else Syntax_Error(NT_includeSect);
2191
else Syntax_Error(NT_includeSect);
2193
else if (at_IGNORE<B::Base>(cur())){
2196
if(AtChar<B::Base,'['>(cur())){
2197
// Depth of nested conditional sections inside the ignored section.
int section_depth=1;
2200
ScanTextTo(MarkupStart);
2201
if(at_condSect_start<B::Base>(cur())){
2205
else if(at_CDATA_End<B::Base>(cur())){
2211
// Depth zero: the ignored section has been closed.
if(section_depth==0) return;
2213
Syntax_Error(NT_ignoreSectContents);
2215
else Syntax_Error(NT_ignoreSect);
2217
else Syntax_Error(NT_conditionalSect);
2219
else if (AtChar<B::Base,'%'>(cur()))
2220
Parse_PEReference();
2221
else if (at_PI_Start<B::Base>(cur())) {
2224
else if (at_Comment_Start<B::Base>(cur())) {
2227
else if (AtChar<B::Base,'<'>(cur())){
2230
if(AtChar<B::Base,'!'>(cur())){
2232
if(at_ELEMENT<B::Base>(cur()))
2233
Parse_Elementdecl();
2234
else if(at_ATTLIST<B::Base>(cur()))
2235
Parse_AttlistDecl();
2236
else if(at_ENTITY<B::Base>(cur()))
2238
else if(at_NOTATION<B::Base>(cur()))
2239
Parse_Notationdecl();
2241
Syntax_Error(NT_markupdecl);
2245
Syntax_Error(NT_markupdecl);
2248
Syntax_Error(NT_extSubsetDecl);
2251
ExtSubsetDecl_action(GetCodeUnitPtr(start_pos), LengthFrom(start_pos));
2254
// Validating counterpart of Parse_WF_StartTag: parses a start tag or
// empty-element tag with its attributes and fires the action callbacks.
// In addition the element name is looked up in GlobalElementTable and
// Validity_Error(vErr_elementvalid) is raised (presumably when the element
// was not declared -- the test is not visible here, TODO confirm).
// is_emptyStartTag: set to true when the tag ends with the empty-element
// delimiter.
// Returns int (presumably the element nameID -- return statements are not
// visible here, TODO confirm).
/* Parse a valid start or empty element tag. */
2255
template <class B, WorkingCharacterSet W>
2256
inline int ParsingEngine<B, W>::Parse_ValidStartTag (bool& is_emptyStartTag){
2259
int att_name_end, att_val_end;
2260
unsigned char quoteCh;
2263
int nameID = Parse_Name();
2264
int elemID = Parser_Interface<W>::model_info->GlobalElementTable[nameID];
2266
Validity_Error(vErr_elementvalid);
2268
// The element name is reported starting one code unit after text_or_markup_start.
ElementName_action(GetCodeUnitPtr(text_or_markup_start+1), LengthFrom(text_or_markup_start+1));
2269
/* The following test optimizes the most common case of a
2270
start tag with no attributes. */
2271
if (AtChar<B::Base,'>'>(cur())) {
2273
StartTag_action(GetCodeUnitPtr(text_or_markup_start), LengthFrom(text_or_markup_start));
2277
if (AtChar<B::Base,'>'>(cur())) {
2279
StartTag_action(GetCodeUnitPtr(text_or_markup_start), LengthFrom(text_or_markup_start));
2281
else if (at_EmptyElementDelim<B::Base>(cur())) {
2283
is_emptyStartTag = true;
2284
EmptyElement_action(GetCodeUnitPtr(text_or_markup_start), LengthFrom(text_or_markup_start));
2287
/* Must be an attribute-value pair or error. */
2288
att_name_start = AbsPos();
2289
int att_nameID = Parse_Name();
2290
#if (not defined(OMISSION)) or (OMISSION != ATTRIBUTE_UNIQUENESS)
2291
// Duplicate detection: LastAttOccurrence[attID] records the position where
// this attribute name was last seen. A recorded position beyond
// text_or_markup_start means the name already occurred inside this tag.
int attID = Parser_Interface<W>::model_info->getOrInsertGlobalAttName(att_nameID);
2292
if (attID >= LastAttOccurrence.size()) LastAttOccurrence.push_back(0);
2294
if (LastAttOccurrence[attID] > text_or_markup_start) {
2295
WF_Error(wfErr_uniqattspec); /* Duplicate attribute. */
2299
LastAttOccurrence[attID] = att_name_start;
2301
/* The following optimized tests handle the frequently occurring
2302
case that there are no blanks on either side of the equals sign.
2303
In many cases, the very first test handles 100% of actual
2304
attribute-value pairs encountered. */
2305
if (at_EqualsQuote<B::Base>(cur())) Advance(1);
2308
if (!AtChar<B::Base,'='>(cur())) {
2309
Syntax_Error(NT_STag);
2314
if (!AtQuote<B::Base>(cur())) {
2315
Syntax_Error(NT_STag);
2319
// The value starts one code unit past the current position, which the
// preceding check requires to be a quote.
att_val_start = AbsPos()+1;
2321
// One code unit before the current position (presumably the value has been
// scanned and the closing quote consumed by now -- TODO confirm).
att_val_end = AbsPos()-1;
2322
// The pointer is moved back from the current position to att_name_start so
// that at_xmlns inspects the attribute name.
if (at_xmlns<B::Base>(cur()+att_name_start-AbsPos())) {
2323
Namespace_action(GetCodeUnitPtr(att_name_start), att_name_end - att_name_start,
2324
GetCodeUnitPtr(att_val_start), att_val_end - att_val_start);
2327
AttributeValue_action(GetCodeUnitPtr(att_name_start), att_name_end - att_name_start,
2328
GetCodeUnitPtr(att_val_start), att_val_end - att_val_start);
2330
/* Now check for end or repeat. Avoid whitespace scan if possible.*/
2331
if (AtChar<B::Base,'>'>(cur())) {
2333
StartTag_action(GetCodeUnitPtr(text_or_markup_start), LengthFrom(text_or_markup_start));
2336
else if (at_EmptyElementDelim<B::Base>(cur())) {
2338
is_emptyStartTag = true;
2339
EmptyElement_action(GetCodeUnitPtr(text_or_markup_start), LengthFrom(text_or_markup_start));
2343
if (AtChar<B::Base,'>'>(cur())) {
2345
StartTag_action(GetCodeUnitPtr(text_or_markup_start), LengthFrom(text_or_markup_start));
2348
else if (at_EmptyElementDelim<B::Base>(cur())) {
2350
is_emptyStartTag = true;
2351
EmptyElement_action(GetCodeUnitPtr(text_or_markup_start), LengthFrom(text_or_markup_start));
2354
// Position is still directly behind the attribute value: no separator was
// consumed before what would be the next attribute.
else if (AbsPos() == att_val_end + 1) {
2355
/* No WS following att value */
2356
Syntax_Error(NT_STag);
2364
// Parses one element with validation against its declared content model.
// After the start tag the content model stored for the element name is
// fetched from ContentModelData and a switch on its cm_type selects how the
// content is parsed. Visible cases: an immediate end tag check, mixed
// content (Parse_MixedContent), and regular expression content
// (Parse_ValidContent with a state that starts at 0). Each case ends with
// Parse_WF_EndTag when the start tag was not an empty-element tag.
// Raises Validity_Error(vErr_elementvalid) when the content does not fit.
// Returns int (presumably the element nameID -- the return statement is not
// visible here, TODO confirm).
template <class B, WorkingCharacterSet W>
2365
int ParsingEngine<B, W>::Parse_ValidElement() {
2366
bool is_emptyStartTag = false;
2367
int nameID = Parse_ValidStartTag(is_emptyStartTag);
2369
// Trace output (presumably guarded by a debug conditional -- TODO confirm).
printf("Parse_ValidElement: nameID = %d, name = %s, is_emptyStartTag=%i\n",nameID, Parser_Interface<W>::model_info->symbol_table->Get_UTF8_name(nameID), is_emptyStartTag);
2371
// NOTE(review): cm is dereferenced below without a visible null check --
// confirm that every element reaching this point has a stored model.
ContentModel * cm = Parser_Interface<W>::model_info->ContentModelData[nameID];
2372
switch (cm->cm_type) {
2374
if (!is_emptyStartTag) {
2375
if (at_EndTag_Start<B::Base>(cur())) {
2376
Parse_WF_EndTag(nameID);
2379
Validity_Error(vErr_elementvalid);
2384
if (!is_emptyStartTag) {
2386
Parse_WF_EndTag(nameID);
2390
if (!is_emptyStartTag) {
2391
Parse_MixedContent(((CM_Mixed *) cm)->elements);
2392
Parse_WF_EndTag(nameID);
2396
CM_RegExp * cre = (CM_RegExp *) cm;
2397
// State 0 is the initial row of the transition map.
int content_state = 0;
2398
if (!is_emptyStartTag) {
2399
Parse_ValidContent(cre, content_state);
2401
printf("Final content_state = %i, nameID = %i\n", content_state, nameID);
2403
Parse_WF_EndTag(nameID);
2405
// Entry [state][0] equal to 0 means the content may not end in this state.
if (cre->transition_map[content_state][0]==0) {
2406
Validity_Error(vErr_elementvalid);
2412
// Parses element content against the regular-expression content model cre.
// cur_state is an in/out parameter: it is replaced by the transition map
// entry for each child element parsed here, and is also handed to
// Parse_ValidEntityRef for entity references.
template <class B, WorkingCharacterSet W>
2413
void ParsingEngine<B, W>::Parse_ValidContent(CM_RegExp * cre, int & cur_state) {
2416
/* If non-null report WS WS_action()? */
2417
// Remember where the next item starts.
text_or_markup_start = AbsPos();
2418
// Classify what begins at the current position.
if (at_EndTag_Start<B::Base>(cur())) {
2421
else if (at_ElementTag_Start<B::Base>(cur())) {
2422
// Child element: parse it, then step the content model on its name ID.
int nameID = Parse_ValidElement();
2424
// Trace output: state before the transition.
printf("Content model state transition %i", cur_state);
2426
cur_state = cre->transition_map[cur_state][nameID];
2428
// Trace output: state after the transition.
printf("-> %i\n", cur_state);
2431
else if (at_Comment_Start<B::Base>(cur())) {
2434
else if (at_PI_Start<B::Base>(cur())) {
2437
else if (AtChar<B::Base,'&'>(cur())) {
2438
// Entity reference: validated against the same model and state.
Parse_ValidEntityRef(cre, cur_state);
2440
printf("EntityRef complete, cur_state = %i\n", cur_state);
2444
else if (at_EOF()) {
2447
// A less-than sign not matched by any earlier test is a syntax error.
else if (AtChar<B::Base,'<'>(cur())) {
2448
Syntax_Error(NT_markupdecl);
2451
// Reports the element-validity error code.
Validity_Error(vErr_elementvalid);
2457
// Parses content where any child element is accepted: scans text up to the
// next MarkupStart position, then dispatches on the construct found there.
// text_if_nonnull_action is called with false before element tags, end tags,
// comments, processing instructions and EOF, and with true before character
// references, entity references and CDATA delimiters.
// NOTE(review): the meaning of that boolean is defined by
// text_if_nonnull_action - confirm there.
template <class B, WorkingCharacterSet W>
2458
void ParsingEngine<B, W>::Parse_AnyContent() {
2460
// Remember where the pending text run starts.
text_or_markup_start = AbsPos();
2461
ScanTextTo(MarkupStart); /* '<', '&', or ']' for ']]>' test */
2462
if (at_ElementTag_Start<B::Base>(cur())) {
2463
text_if_nonnull_action(false);
2464
// Child elements are parsed in validating mode; the name ID is not checked here.
int nameID = Parse_ValidElement();
2466
else if (at_EndTag_Start<B::Base>(cur())) {
2467
text_if_nonnull_action(false);
2470
else if (at_Comment_Start<B::Base>(cur())) {
2471
text_if_nonnull_action(false);
2474
else if (at_CharRef_Start<B::Base>(cur())) {
2475
text_if_nonnull_action(true);
2478
// Any other ampersand is handled as a general entity reference.
else if (AtChar<B::Base,'&'>(cur())) {
2479
text_if_nonnull_action(true);
2480
Parse_EntityRef_inAnyContent();
2482
else if (at_CDATA_Start<B::Base>(cur())) {
2483
text_if_nonnull_action(true);
2486
else if (at_PI_Start<B::Base>(cur())) {
2487
text_if_nonnull_action(false);
2490
// A CDATA end delimiter found by this scan is reported as a character-data syntax error.
else if (at_CDATA_End<B::Base>(cur())) {
2491
text_if_nonnull_action(true);
2493
Syntax_Error(NT_CharData);
2495
else if (at_EOF()) {
2496
text_if_nonnull_action(false);
2499
// A less-than sign not matched by any earlier test is a syntax error.
else if (AtChar<B::Base,'<'>(cur())) {
2500
Syntax_Error(NT_markupdecl);
2508
// Parses mixed content: scans text up to the next MarkupStart position, then
// dispatches on the construct found there. elems is indexed by element name
// ID; a zero entry for a parsed child element is reported as an
// element-validity error.
// NOTE(review): the meaning of the boolean passed to text_if_nonnull_action
// is defined by that function - confirm there.
template <class B, WorkingCharacterSet W>
2509
void ParsingEngine<B, W>::Parse_MixedContent(symbol_set_t elems) {
2511
// Remember where the pending text run starts.
text_or_markup_start = AbsPos();
2512
ScanTextTo(MarkupStart); /* '<', '&', or ']' for ']]>' test */
2513
/* if (AtChar<B::Base,'<'>(cur())) {
2514
text_if_nonnull_action();
2515
Parse_Markup<B, W>();
2517
if (at_ElementTag_Start<B::Base>(cur())) {
2518
text_if_nonnull_action(false);
2519
// Child element: parse it, then check its name ID against the allowed set.
int nameID = Parse_ValidElement();
2520
if (elems[nameID] == 0) {
2521
Validity_Error(vErr_elementvalid);
2524
else if (at_EndTag_Start<B::Base>(cur())) {
2525
text_if_nonnull_action(false);
2528
else if (at_Comment_Start<B::Base>(cur())) {
2529
text_if_nonnull_action(false);
2532
else if (at_CharRef_Start<B::Base>(cur())) {
2533
text_if_nonnull_action(true);
2536
// Any other ampersand is handled as a general entity reference, checked
// against the same element set.
else if (AtChar<B::Base,'&'>(cur())) {
2537
text_if_nonnull_action(true);
2538
Parse_EntityRef_inMixed(elems);
2540
else if (at_CDATA_Start<B::Base>(cur())) {
2541
text_if_nonnull_action(true);
2544
else if (at_PI_Start<B::Base>(cur())) {
2545
text_if_nonnull_action(false);
2548
// A CDATA end delimiter found by this scan is reported as a character-data syntax error.
else if (at_CDATA_End<B::Base>(cur())) {
2549
text_if_nonnull_action(true);
2551
Syntax_Error(NT_CharData);
2553
else if (at_EOF()) {
2554
text_if_nonnull_action(false);
2557
// A less-than sign not matched by any earlier test is a syntax error.
else if (AtChar<B::Base,'<'>(cur())) {
2558
Syntax_Error(NT_markupdecl);
2568
// Parses a name at the current position and returns its symbol table ID.
// The ASCII lookup is tried first on the x8data bytes; when it yields 0 the
// name is converted to UTF-8 through the byteplex and looked up (or inserted)
// via reserved symbol space.
template <class B, WorkingCharacterSet W>
2569
int ParsingEngine<B, W>::Parse_Name() {
2570
int name_pos = AbsPos();
2572
// Length is the distance from name_pos to the current position.
int lgth = AbsPos()-name_pos;
2573
// The name occupies the lgth code units ending at buffer_rel_pos.
int nameID = Parser_Interface<W>::model_info->symbol_table->ASCII_Lookup_or_Insert_Name(&((char *) x8data)[buffer_rel_pos-lgth], lgth);
2574
// NOTE(review): 0 presumably signals that the ASCII path did not apply - confirm.
if (nameID != 0) return nameID;
2576
// Fallback: measure the UTF-8 form, reserve space for it, convert into the
// reserved space, then look up or insert the reserved symbol.
int u8_lgth = byteplex->UTF8_Length(name_pos, lgth);
2577
char * u8_ptr = Parser_Interface<W>::model_info->symbol_table->ReserveSymbolSpace(u8_lgth);
2578
byteplex->to_UTF8(name_pos, lgth, u8_ptr);
2579
return Parser_Interface<W>::model_info->symbol_table->LookupOrInsertReserved();
2584
// int ParsingEngine< X8_Buffer<EBCDIC>, UTF_8 >::Parse_Name() {
2585
// int name_pos = AbsPos();
2586
// ScanTo(NameFollow);
2587
// int lgth = AbsPos()-name_pos;
2588
// // int nameID = local_EBCDIC_table->Lookup_or_Insert(GetCodeUnitPtr(name_pos), lgth);
2589
// // if (nameID != 0) return nameID;
2591
// int u8_lgth = byteplex->UTF8_Length(name_pos, lgth);
2592
// char * u8_ptr = Parser_Interface<UTF_8>::model_info->symbol_table->ReserveSymbolSpace(u8_lgth);
2593
// byteplex->to_UTF8(name_pos, lgth, u8_ptr);
2594
// return Parser_Interface<UTF_8>::model_info->symbol_table->LookupOrInsertReserved();
2598
// template <WorkingCharacterSet W>
2599
// inline int ParsingEngine<UTF8_Buffer, W>::Parse_Name() {
2600
// int name_pos = AbsPos();
2601
// ScanTo(NameFollow);
2602
// int lgth = AbsPos()-name_pos;
2603
// return Parser_Interface<UTF_8>::model_info->symbol_table->UTF8_Lookup_or_Insert_Name(&((char *)x8data)[buffer_rel_pos-lgth], lgth);
2607
// Parse_Name for UTF8_Buffer input with the UTF_8 working character set.
// The name bytes in x8data are passed directly to the UTF-8 symbol lookup;
// no byteplex conversion is performed here.
inline int ParsingEngine<UTF8_Buffer, UTF_8>::Parse_Name() {
2608
int name_pos = AbsPos();
2610
// Length is the distance from name_pos to the current position.
int lgth = AbsPos()-name_pos;
2611
// The name occupies the lgth code units ending at buffer_rel_pos.
return Parser_Interface<UTF_8>::model_info->symbol_table->UTF8_Lookup_or_Insert_Name(&((char *)x8data)[buffer_rel_pos-lgth], lgth);
2614
// Parses a name token at the current position and returns its symbol table ID.
// Mirrors Parse_Name but uses the Nmtoken variants of the symbol table calls:
// ASCII lookup first, then UTF-8 conversion through the byteplex when the
// ASCII lookup yields 0.
template <class B, WorkingCharacterSet W>
2615
int ParsingEngine<B, W>::Parse_Nmtoken() {
2616
int name_pos = AbsPos();
2618
// Length is the distance from name_pos to the current position.
int lgth = AbsPos()-name_pos;
2619
// The token occupies the lgth code units ending at buffer_rel_pos.
int nameID = Parser_Interface<W>::model_info->symbol_table->ASCII_Lookup_or_Insert_Nmtoken(&((char *) x8data)[buffer_rel_pos-lgth], lgth);
2620
// NOTE(review): 0 presumably signals that the ASCII path did not apply - confirm.
if (nameID != 0) return nameID;
2622
// Fallback: measure the UTF-8 form, reserve space for it, convert into the
// reserved space, then look up or insert the reserved name token.
int u8_lgth = byteplex->UTF8_Length(name_pos, lgth);
2623
char * u8_ptr = Parser_Interface<W>::model_info->symbol_table->ReserveSymbolSpace(u8_lgth);
2624
byteplex->to_UTF8(name_pos, lgth, u8_ptr);
2625
return Parser_Interface<W>::model_info->symbol_table->LookupOrInsertReserved_nmtoken();
2630
// Parse_Nmtoken for EBCDIC single-byte input with the UTF_8 working set.
// The direct table lookup is commented out, so every token takes the
// conversion path: UTF-8 length, reserved symbol space, conversion, lookup.
int ParsingEngine< X8_Buffer<EBCDIC>, UTF_8 >::Parse_Nmtoken() {
2631
int name_pos = AbsPos();
2633
// Length is the distance from name_pos to the current position.
int lgth = AbsPos()-name_pos;
2634
//	int nameID = local_EBCDIC_table->Lookup_or_Insert(GetCodeUnitPtr(name_pos), lgth);
2635
//	if (nameID != 0) return nameID;
2637
// Convert the token to UTF-8 in reserved symbol space, then look it up or insert it.
int u8_lgth = byteplex->UTF8_Length(name_pos, lgth);
2638
char * u8_ptr = Parser_Interface<UTF_8>::model_info->symbol_table->ReserveSymbolSpace(u8_lgth);
2639
byteplex->to_UTF8(name_pos, lgth, u8_ptr);
2640
return Parser_Interface<UTF_8>::model_info->symbol_table->LookupOrInsertReserved_nmtoken();
2644
// int ParsingEngine<UTF8_Buffer, UTF_8>::Parse_Nmtoken() {
2645
// int name_pos = AbsPos();
2646
// ScanTo(NameFollow);
2647
// int lgth = AbsPos()-name_pos;
2648
// return Parser_Interface<UTF_8>::model_info->symbol_table->UTF8_Lookup_or_Insert_Nmtoken(&((char *)x8data)[buffer_rel_pos-lgth], lgth);
2651
// Parses the document body between DocumentStart_action and
// DocumentEnd_action. The path taken is selected at compile time by
// VALIDATION_MODE.
template <class B, WorkingCharacterSet W>
2652
void ParsingEngine<B, W>::Parse_DocumentContent() {
2653
Parser_Interface<W>::DocumentStart_action();
2654
#if (VALIDATION_MODE == ON)
2656
// Validating path: parse against the root content model, then report an
// element-validity error when column 0 of the transition map row for the
// final state is zero.
Parse_ValidContent(Parser_Interface<W>::model_info->rootModel, cur_state);
2657
if (Parser_Interface<W>::model_info->rootModel->transition_map[cur_state][0]==0) {
2658
Validity_Error(vErr_elementvalid);
2661
#if (VALIDATION_MODE == OFF)
2664
// Non-validating path: loop while a comment or processing instruction
// starts at the current position.
while(at_Comment_Start<B::Base>(cur()) || at_PI_Start<B::Base>(cur()) ){
2665
if (at_Comment_Start<B::Base>(cur()))
2672
// Reports a document-level syntax error.
Syntax_Error(NT_document);
2675
Parser_Interface<W>::DocumentEnd_action();
2678
#ifdef MARKUP_PASS_CONTROL
2679
// Test routine as an alternative to MarkupPass.
2680
// Test variant of content parsing (see the comment preceding this
// definition): instead of invoking markup parsers it scans to each
// MarkupStart position, classifies what is found, folds positions or counts
// into four accumulators, and prints them before DocumentEnd_action.
template <class B, WorkingCharacterSet W>
2681
void ParsingEngine<B, W>::ParseContent() {
2684
int charref_code = 0;
2685
int general_ref_code = 0;
2686
DocumentStart_action();
2687
bool is_emptyStartTag = false;
2689
// Remember where the pending text run starts, then scan to the next markup start.
text_or_markup_start = AbsPos();
2690
ScanTo(MarkupStart); /* '<', '&', or ']' for ']]>' test */
2691
/* if (AtChar<B::Base,'<'>(cur())) {
2692
text_if_nonnull_action();
2693
Parse_Markup<B, W>();
2695
// End tag start: OR the absolute position into end_code.
if (at_EndTag_Start<B::Base>(cur())) {
2696
end_code |= AbsPos();
2698
// Any other less-than sign: add the absolute position to start_code.
else if (AtChar<B::Base,'<'>(cur())) {
2699
start_code += AbsPos();
2701
else if (at_CharRef_Start<B::Base>(cur())) {
2704
// Any other ampersand: count one general reference.
else if (AtChar<B::Base,'&'>(cur())) {
2705
general_ref_code += 1;
2707
// End of input terminates the scan.
else if (at_EOF()) break;
2710
// Report the accumulated values.
printf("Start_code: %i\n", start_code);
2711
printf("End_code: %i\n", end_code);
2712
printf("general_ref_code: %i\n", general_ref_code);
2713
printf("charref_code: %i\n", charref_code);
2714
DocumentEnd_action();
2718
#ifdef MARKUP_SORTING
2719
// Little endian codes for [&#/] stream.
2720
// Two-bit markup classification codes. ParseContent uses these values as the
// first index into its MarkupPositions and MarkupCounts arrays.
enum MarkupSortCodes {
2721
StartTagTwoBitCode = 0,
2722
EndTagTwoBitCode = 2,
2728
// Returns the low two bits of the bitstream segment that
// bitstream_segment_from yields for stream at bit_posn, i.e. a value in 0..3.
static inline int GetBitPair(SIMD_type * stream, int bit_posn) {
2729
return bitstream_segment_from(stream, bit_posn) & 3;
2732
// Markup-sorting variant of content parsing: walks the MarkupStart bitstream,
// classifies each markup position with a two-bit code taken from the
// AmpHashSlash stream, and records the absolute position in a per-code array.
template <class B, WorkingCharacterSet W>
2733
void ParsingEngine<B, W>::ParseContent() {
2734
/*vector<int> MarkupPositions[4];*/
2735
// Recorded positions, one row per two-bit code, with a fill count per row.
// NOTE(review): this is a local array of 4 * BUFFER_SIZE ints - confirm the
// stack budget allows it.
int MarkupPositions[4][BUFFER_SIZE];
2736
int MarkupCounts[4];
2739
int charref_code = 0;
2740
int general_ref_code = 0;
2742
DocumentStart_action();
2743
bool is_emptyStartTag = false;
2744
// Start with all four rows empty.
for (int i = 0; i < 4; i++) MarkupCounts[i] = 0;
2745
text_or_markup_start = AbsPos();
2747
// Fetch the MarkupStart bitstream segment beginning at the current position.
unsigned long segment = bitstream_segment_from(buf->item_stream[MarkupStart], buffer_rel_pos);
2748
//printf("buffer_rel_pos = %i, segment = %x\n", buffer_rel_pos, segment);
2750
// Advance by cfzl(segment).
// NOTE(review): cfzl presumably counts the zero bits before the first set bit - confirm.
buffer_rel_pos += cfzl(segment);
2751
text_or_markup_start = AbsPos();
2752
// Classify the markup at this position and append its absolute position to
// the row for that code.
int markup_code = GetBitPair(buf->item_stream[AmpHashSlash], buffer_rel_pos);
2753
MarkupPositions[markup_code][MarkupCounts[markup_code]] = AbsPos();
2754
MarkupCounts[markup_code]++;
2758
// Clear the low three bits of the position, then advance by the number of
// bits in an unsigned long.
buffer_rel_pos = (buffer_rel_pos & -8) + 8*sizeof(unsigned long);
2759
//	printf("buffer_rel_pos = %i, segment = %x\n", buffer_rel_pos, segment);
2761
// Reached the buffer limit: the per-code summaries below are commented out.
if (buffer_rel_pos >= buffer_limit_pos) {
2762
/*		for (int i = 0; i < MarkupCounts[StartTagTwoBitCode]; i++) {
2763
start_code += MarkupPositions[StartTagTwoBitCode][i];
2765
for (int i = 0; i < MarkupCounts[EndTagTwoBitCode]; i++) {
2766
end_code |= MarkupPositions[EndTagTwoBitCode][i];
2768
for (int i = 0; i < MarkupCounts[GeneralRefCode]; i++) {
2769
general_ref_code += 1;
2771
for (int i = 0; i < MarkupCounts[CharRefCode]; i++) {
2774
/*		printf("Start_code: %i\n", start_code);
2775
printf("End_code: %i\n", end_code);
2776
printf("general_ref_code: %i\n", general_ref_code);
2777
printf("charref_code: %i\n", charref_code);*/
2778
// Empty all four rows again.
for (int i = 0; i < 4; i++) MarkupCounts[i] = 0;
2779
// At or past BUFFER_SIZE: adjust the buffer end and finalize the buffer.
if (buffer_rel_pos >= BUFFER_SIZE) {
2780
AdjustBufferEndForIncompleteSequences();
2781
Parser_Interface<W>::FinalizeBuffer_action();
2790
/*	vector<int>::iterator i;
2791
for (i = MarkupPositions[StartTagTwoBitCode].begin(); i != MarkupPositions[StartTagTwoBitCode].end(); i++) {
2794
for (i = MarkupPositions[EndTagTwoBitCode].begin(); i != MarkupPositions[EndTagTwoBitCode].end(); i++) {
2797
for (i = MarkupPositions[GeneralRefCode].begin(); i != MarkupPositions[GeneralRefCode].end(); i++) {
2798
general_ref_code += 1;
2800
for (i = MarkupPositions[CharRefCode].begin(); i != MarkupPositions[CharRefCode].end(); i++) {
2803
printf("Start_code: %i\n", start_code);
2804
printf("End_code: %i\n", end_code);
2805
printf("general_ref_code: %i\n", general_ref_code);
2806
printf("charref_code: %i\n", charref_code);
2807
DocumentEnd_action();