/xmlbench/trunk

To get this branch, use:
bzr branch http://darksoft.org/webbzr/xmlbench/trunk

« back to all changes in this revision

Viewing changes to parse/parabix.20090922/src/engine.c

  • Committer: Suren A. Chilingaryan
  • Date: 2009-09-23 17:13:04 UTC
  • Revision ID: csa@dside.dyndns.org-20090923171304-osvtr4zqb29h11kd
Intel, Tango, Phobos, and RapidXML parsers; Memory benchmark scripts

Show diffs side-by-side

added added

removed removed

Lines of Context:
 
1
/*  engine.c - Parabix XML parsing engine.
 
2
    Copyright (c) 2007, 2008, Robert D. Cameron and Dan Lin.
 
3
    Licensed to the public under the Open Software License 3.0.
 
4
    Licensed to International Characters, Inc., under the Academic
 
5
    Free License 3.0.
 
6
*/
 
7
 
 
8
#include "engine.h"
 
9
#include "byteplex.h"
 
10
#include "xmldecl.h"
 
11
#include "bytelex.h"
 
12
#include "bitlex.h"
 
13
#include "contentmodel.h"
 
14
#include "contentmodel.c"
 
15
#include "xml_error.h"
 
16
 
 
17
#include <assert.h>
 
18
#include <stdlib.h>
 
19
#include <errno.h>
 
20
#include <string.h>
 
21
#include <string>
 
22
#include <iostream>
 
23
using namespace std;
 
24
 
 
25
/* Return a newly allocated, NUL-terminated copy of the first lgth bytes
   found at s.  The storage comes from new[]; the caller owns it. */
inline char * copy_string (unsigned char * s, int lgth){
        char * result = new char[lgth+1];
        result[lgth] = '\0';
        memcpy(result, (char *)s, lgth);
        return result;
}
 
31
 
 
32
/* Return a newly allocated, NUL-terminated string holding the first lgth1
   bytes of s1 followed by the first lgth2 bytes of s2.  The storage comes
   from new[]; the caller owns it. */
inline char * cat_string (char * s1, char * s2, int lgth1, int lgth2){
        const int total = lgth1 + lgth2;
        char * joined = new char[total + 1];
        char * tail = joined + lgth1;   /* where the second piece starts */
        memcpy(joined, s1, lgth1);
        memcpy(tail, s2, lgth2);
        joined[total] = '\0';
        return joined;
}
 
39
 
 
40
 
 
41
template <WorkingCharacterSet W>
 
42
Parser_Interface<W> * Parser_Interface<W>::ParserFactory(const char * filename) {
 
43
 
 
44
        int chars_read;
 
45
        unsigned char signature[4];
 
46
        FILE * infile;
 
47
        infile = fopen(filename, "rb");
 
48
        if (!infile) {
 
49
                fprintf(stderr, "Error: cannot open %s for input.\n", filename);
 
50
                exit(-1);
 
51
        }
 
52
        fread(signature,1,4,infile);
 
53
        Entity_Info * e = new Entity_Info;
 
54
        Model_Info * m = new Model_Info;
 
55
        e->AnalyzeSignature(signature);
 
56
        Byteplex * b = Byteplex::ByteplexFactory(e, infile);
 
57
        b->InitializeBuffer(signature,4);
 
58
        b->DoByteplex();
 
59
        b->PreparePseudoASCII_Stream();
 
60
 
 
61
        if (e->code_unit_base == ASCII) {
 
62
                XML_Decl_Parser<ASCII> decl_parser(b);
 
63
                decl_parser.ReadXMLInfo(*e);
 
64
                if (e->code_unit_size == SingleByte) {
 
65
                        if (!(e->has_encoding_decl) || at_UTF_8(e->encoding))
 
66
                                return new ParsingEngine< UTF8_Buffer, W>(e, m, b, false);
 
67
                        else return new ParsingEngine< X8_Buffer<ASCII>, W>(e, m, b, false);
 
68
                }
 
69
                else if (e->code_unit_size == DoubleByte) {
 
70
                        return new ParsingEngine<U16_Buffer, W>(e, m, b, false);
 
71
                }
 
72
                else if (e->code_unit_size == QuadByte) {
 
73
                        return new ParsingEngine<U32_Buffer, W>(e, m, b, false);
 
74
                }
 
75
        }
 
76
        else /* if (e->code_unit_base == EBCDIC) */ {
 
77
                XML_Decl_Parser<EBCDIC> decl_parser(b);
 
78
                decl_parser.ReadXMLInfo(*e);
 
79
                return new ParsingEngine< X8_Buffer<EBCDIC>, W>(e, m, b, false);
 
80
        }
 
81
}
 
82
 
 
83
template <WorkingCharacterSet W>
 
84
Parser_Interface<W> * Parser_Interface<W>::ParserFactory(const char * filename, Model_Info * m) {
 
85
 
 
86
        int chars_read;
 
87
        unsigned char signature[4];
 
88
        FILE * infile;
 
89
        infile = fopen(filename, "rb");
 
90
        if (!infile) {
 
91
                fprintf(stderr, "Error: cannot open %s for input.\n", filename);
 
92
                exit(-1);
 
93
        }
 
94
        fread(signature,1,4,infile);
 
95
        Entity_Info * e = new Entity_Info;
 
96
        e->AnalyzeSignature(signature);
 
97
        Byteplex * b = Byteplex::ByteplexFactory(e, infile);
 
98
        b->InitializeBuffer(signature,4);
 
99
        b->DoByteplex();
 
100
        b->PreparePseudoASCII_Stream();
 
101
        if (e->code_unit_base == ASCII) {
 
102
                XML_Decl_Parser<ASCII> decl_parser(b);
 
103
                decl_parser.ReadXMLInfo(*e);
 
104
                if (e->code_unit_size == SingleByte) {
 
105
                        return new ParsingEngine< X8_Buffer<ASCII>, W>(e, m, b, true);
 
106
                }
 
107
                else if (e->code_unit_size == DoubleByte) {
 
108
                        return new ParsingEngine<U16_Buffer, W>(e, m, b, true);
 
109
                }
 
110
                else if (e->code_unit_size == QuadByte) {
 
111
                        return new ParsingEngine<U32_Buffer, W>(e, m, b, true);
 
112
                }
 
113
        }
 
114
        else /* if (e->code_unit_base == EBCDIC) */ {
 
115
                XML_Decl_Parser<EBCDIC> decl_parser(b);
 
116
                decl_parser.ReadXMLInfo(*e);
 
117
                return new ParsingEngine< X8_Buffer<EBCDIC>, W>(e, m, b, true);
 
118
        }
 
119
}
 
120
 
 
121
template <WorkingCharacterSet W>
 
122
Parser_Interface<W> * Parser_Interface<W>::ParserFactory(const char * byte_buffer, int byte_count, Entity_Info * e1, Model_Info * m){
 
123
        Entity_Info * e = new Entity_Info;
 
124
        e->BOM_units = 0;
 
125
        e->code_unit_base=e1->code_unit_base;
 
126
        e->code_unit_size=e1->code_unit_size;
 
127
        e->version=e1->version;
 
128
        e->encoding=e1->encoding;
 
129
        e->content_start = 0;
 
130
        Byteplex * b = Byteplex::ByteplexFactory(e, (unsigned char *) byte_buffer, byte_count);
 
131
        b->DoByteplex();
 
132
        b->PreparePseudoASCII_Stream();
 
133
        if (e->code_unit_base == ASCII) {
 
134
                if (e->code_unit_size == SingleByte) {
 
135
                        return new ParsingEngine< X8_Buffer<ASCII>, W>(e, m, b, false);
 
136
                }
 
137
                else if (e->code_unit_size == DoubleByte) {
 
138
                        return new ParsingEngine<U16_Buffer, W>(e, m, b, false);
 
139
                }
 
140
                else if (e->code_unit_size == QuadByte) {
 
141
                        return new ParsingEngine<U32_Buffer, W>(e, m, b, false);
 
142
                }
 
143
        }
 
144
        else /* if (e->code_unit_base == EBCDIC) */ {
 
145
                return new ParsingEngine< X8_Buffer<EBCDIC>, W>(e, m, b, false);
 
146
        }
 
147
}
 
148
 
 
149
template <WorkingCharacterSet W>
 
150
Parser_Interface<W>::~Parser_Interface() {
 
151
}
 
152
 
 
153
 
 
154
template <WorkingCharacterSet W>
 
155
bool Parser_Interface<W>::has_ByteOrderMark() {
 
156
        return entity_Info->BOM_units > 0;
 
157
}
 
158
 
 
159
template <WorkingCharacterSet W>
 
160
XML_version Parser_Interface<W>::get_version() {
 
161
        return entity_Info->version;
 
162
}
 
163
 
 
164
template <WorkingCharacterSet W>
 
165
XML_standalone Parser_Interface<W>::standalone_status() {
 
166
        return entity_Info->standalone;
 
167
}
 
168
 
 
169
template <WorkingCharacterSet W>
 
170
bool Parser_Interface<W>::has_EncodingDecl() {
 
171
        return entity_Info->has_encoding_decl;
 
172
}
 
173
 
 
174
template <WorkingCharacterSet W>
 
175
unsigned char * Parser_Interface<W>::get_Encoding() {
 
176
        return entity_Info->encoding;
 
177
}
 
178
 
 
179
template <class B, WorkingCharacterSet W>
 
180
inline unsigned char * ParsingEngine<B, W>::GetCodeUnitPtr(int pos) {
 
181
        int rel_pos = pos - buffer_base_pos;
 
182
        return &((unsigned char *) (byteplex->src_buffer))[rel_pos * (int) B::Size];
 
183
}
 
184
 
 
185
template <>
 
186
inline unsigned char * ParsingEngine<UTF8_Buffer, UTF_8>::GetCodeUnitPtr(int pos) {
 
187
        int rel_pos = pos - buffer_base_pos;
 
188
        return &((unsigned char *) (x8data))[rel_pos];
 
189
}
 
190
 
 
191
 
 
192
 
 
193
 
 
194
template <class B, WorkingCharacterSet W>
 
195
ParsingEngine<B, W>::ParsingEngine(Entity_Info * e, Model_Info * m, Byteplex * b, bool is_external) : Parser_Interface<W> () {
 
196
        Parser_Interface<W>::entity_Info = e;
 
197
        Parser_Interface<W>::model_info = m;
 
198
        byteplex = b;
 
199
 
 
200
//      m->symbol_table = new Symbol_Table();
 
201
//      m->SimpleEntity("lt", "<");
 
202
//      m->SimpleEntity("gt", ">");
 
203
//      m->SimpleEntity("amp", "&");
 
204
//      m->SimpleEntity("quot", "\"");
 
205
//      m->SimpleEntity("apos", "'");
 
206
        m->symbol_table->version = e->version;
 
207
 
 
208
        StrictWellFormedness=false;
 
209
        LastAttOccurrence.assign(m->globalAttributeCount+1, 0);
 
210
 
 
211
 
 
212
        bitplex = new Bitplex;
 
213
        buf = (LexicalStreamSet *) simd_new(sizeof(LexicalStreamSet)/PACKSIZE);
 
214
 
 
215
  /* Install sentinels for every lexical item stream*/
 
216
#ifdef TEMPLATED_SIMD_LIB
 
217
        BitBlock sentinel_value = simd<1>::constant<1>();
 
218
#endif
 
219
#ifndef TEMPLATED_SIMD_LIB
 
220
        BitBlock sentinel_value = simd_const_1(1);
 
221
#endif
 
222
 
 
223
#ifdef OPTIMIZE_SHORT_SCAN
 
224
        sentinel_value = sisd_sfli(sentinel_value, 8*sizeof(unsigned long));
 
225
#endif
 
226
 
 
227
        for (int j = minLexicalItem; j < LexicalItemCount; j++) {
 
228
                buf->item_stream[j][BUFFER_BLOCKS] = sentinel_value;
 
229
        }
 
230
 
 
231
        buffer_base_pos = 0;
 
232
        buffer_rel_pos = e->content_start;
 
233
        buffer_limit_pos = min(BUFFER_SIZE, byteplex->units_in_buffer);
 
234
        int blocks_in_buffer = (buffer_limit_pos + BLOCKSIZE - 1)/BLOCKSIZE;
 
235
        x8data = byteplex->x8data;
 
236
        lexer = Lexer<B::Base>::LexerFactory(e, buf);
 
237
        bitplex->TransposeToBitStreams(byteplex->x8data, blocks_in_buffer);
 
238
        lexer->AnalyzeBuffer(bitplex->x8basis, buffer_base_pos, buffer_rel_pos, buffer_limit_pos);
 
239
}
 
240
 
 
241
template <class B, WorkingCharacterSet W>
 
242
ParsingEngine<B, W>::~ParsingEngine() {
 
243
  // How do we do this?  Parser_Interface<W>::model_info->~Model_Info();
 
244
  Parser_Interface<W>::entity_Info->~Entity_Info();
 
245
  delete byteplex;
 
246
  delete bitplex;
 
247
  simd_delete((SIMD_type *) buf);
 
248
  delete lexer;
 
249
}
 
250
 
 
251
template <class B, WorkingCharacterSet W>
 
252
void ParsingEngine<B, W>::AdvanceBuffers(){
 
253
#if defined(PAPI) and defined(CODE_CLOCKING) and (CODE_CLOCKING == ADVANCE_BUFFERS)
 
254
        code_clocker->start_interval();
 
255
#endif
 
256
 
 
257
        int advance_amt = text_or_markup_start - buffer_base_pos;
 
258
        advance_amt &= -PACKSIZE; // maintain alignment
 
259
        byteplex->AdvanceInputBuffer(advance_amt);
 
260
        buffer_base_pos += advance_amt;
 
261
        buffer_rel_pos -= advance_amt;
 
262
        buffer_limit_pos = min(BUFFER_SIZE, byteplex->units_in_buffer);
 
263
        int blocks_in_buffer = (buffer_limit_pos + BLOCKSIZE - 1)/BLOCKSIZE;
 
264
#if defined(PAPI) and defined(CODE_CLOCKING) and (CODE_CLOCKING == BYTEPLEX)
 
265
        code_clocker->start_interval();
 
266
#endif
 
267
        byteplex->DoByteplex();
 
268
        byteplex->PreparePseudoASCII_Stream();
 
269
#if defined(PAPI) and defined(CODE_CLOCKING) and (CODE_CLOCKING == BYTEPLEX)
 
270
        code_clocker->end_interval(buffer_limit_pos);
 
271
#endif
 
272
#if defined(PAPI) and defined(CODE_CLOCKING) and (CODE_CLOCKING == BITPLEX)
 
273
        code_clocker->start_interval();
 
274
#endif
 
275
        bitplex->TransposeToBitStreams(byteplex->x8data, blocks_in_buffer);
 
276
#if defined(PAPI) and defined(CODE_CLOCKING) and (CODE_CLOCKING == BITPLEX)
 
277
        code_clocker->end_interval(buffer_limit_pos);
 
278
#endif
 
279
        lexer->AnalyzeBuffer(bitplex->x8basis, buffer_base_pos, buffer_rel_pos, buffer_limit_pos);
 
280
#if defined(PAPI) and defined(CODE_CLOCKING) and (CODE_CLOCKING == ADVANCE_BUFFERS)
 
281
        code_clocker->end_interval(buffer_limit_pos);
 
282
#endif
 
283
 
 
284
}
 
285
 
 
286
template <>
 
287
void ParsingEngine<U16_Buffer, UTF_16>::AdvanceBuffers(){
 
288
#if defined(PAPI) and defined(CODE_CLOCKING) and (CODE_CLOCKING == ADVANCE_BUFFERS)
 
289
        code_clocker->start_interval();
 
290
#endif
 
291
 
 
292
        int advance_amt = text_or_markup_start - buffer_base_pos;
 
293
        advance_amt &= -PACKSIZE; // maintain alignment
 
294
        byteplex->AdvanceInputBuffer(advance_amt);
 
295
        buffer_base_pos += advance_amt;
 
296
        buffer_rel_pos -= advance_amt;
 
297
        buffer_limit_pos = min(BUFFER_SIZE, byteplex->units_in_buffer);
 
298
        int blocks_in_buffer = (buffer_limit_pos + BLOCKSIZE - 1)/BLOCKSIZE;
 
299
#if defined(PAPI) and defined(CODE_CLOCKING) and (CODE_CLOCKING == BYTEPLEX)
 
300
        code_clocker->start_interval();
 
301
#endif
 
302
        byteplex->DoByteplex();
 
303
        if (at_UTF_16(Parser_Interface<UTF_16>::entity_Info->encoding)) ((U16_Buffer *) byteplex)->Validate_UTF16();
 
304
        byteplex->PreparePseudoASCII_Stream();
 
305
#if defined(PAPI) and defined(CODE_CLOCKING) and (CODE_CLOCKING == BYTEPLEX)
 
306
        code_clocker->end_interval(buffer_limit_pos);
 
307
#endif
 
308
#if defined(PAPI) and defined(CODE_CLOCKING) and (CODE_CLOCKING == BITPLEX)
 
309
        code_clocker->start_interval();
 
310
#endif
 
311
        bitplex->TransposeToBitStreams(byteplex->x8data, blocks_in_buffer);
 
312
#if defined(PAPI) and defined(CODE_CLOCKING) and (CODE_CLOCKING == BITPLEX)
 
313
        code_clocker->end_interval(buffer_limit_pos);
 
314
#endif
 
315
        lexer->AnalyzeBuffer(bitplex->x8basis, buffer_base_pos, buffer_rel_pos, buffer_limit_pos);
 
316
#if defined(PAPI) and defined(CODE_CLOCKING) and (CODE_CLOCKING == ADVANCE_BUFFERS)
 
317
        code_clocker->end_interval(buffer_limit_pos);
 
318
#endif
 
319
 
 
320
}
 
321
 
 
322
template <class B, WorkingCharacterSet W>
 
323
inline unsigned char * ParsingEngine<B, W>::cur() const {
 
324
  return &((unsigned char *) x8data)[buffer_rel_pos];
 
325
}
 
326
 
 
327
template <class B, WorkingCharacterSet W>
 
328
inline int ParsingEngine<B, W>::AbsPos() const {
 
329
  return buffer_base_pos + buffer_rel_pos;
 
330
}
 
331
 
 
332
template <class B, WorkingCharacterSet W>
 
333
inline int ParsingEngine<B, W>::LengthFrom(int start_pos) const {
 
334
  return buffer_base_pos + buffer_rel_pos - start_pos;
 
335
}
 
336
 
 
337
 
 
338
 
 
339
template <class B, WorkingCharacterSet W>
 
340
inline int ParsingEngine<B, W>::BufferRelPos() const {
 
341
  return buffer_rel_pos;
 
342
}
 
343
 
 
344
 
 
345
template <class B, WorkingCharacterSet W>
 
346
inline bool ParsingEngine<B, W>::at_EOF() const {
 
347
  return (buffer_rel_pos >= buffer_limit_pos) &&
 
348
         (buffer_limit_pos < BUFFER_SIZE);
 
349
}
 
350
 
 
351
//template <class B, WorkingCharacterSet W>
 
352
//inline void ParsingEngine<B, W>::Advance(int n) {
 
353
//      buffer_rel_pos += n;
 
354
//  if (buffer_rel_pos >= BUFFER_SIZE) {
 
355
//      Parser_Interface<W>::FinalizeBuffer_action();
 
356
//      AdvanceBuffers();
 
357
//  }
 
358
//}
 
359
 
 
360
/*  Advance(n): move the buffer-relative scan position forward by n.  When
    the position reaches or passes BUFFER_SIZE, the FinalizeBuffer_action
    callback is run and the buffers are refilled with AdvanceBuffers().

    This is a macro, so it must be expanded inside a ParsingEngine<B, W>
    member function where buffer_rel_pos, AdvanceBuffers and W are in scope.
    The argument is parenthesized so that any expression may be passed
    safely.  */
#define Advance(n) \
do {\
        buffer_rel_pos += (n); \
        if (buffer_rel_pos >= BUFFER_SIZE) {    \
                Parser_Interface<W>::FinalizeBuffer_action();\
                AdvanceBuffers();\
        }\
} while(0)
 
368
 
 
369
 
 
370
template <class B, WorkingCharacterSet W>
 
371
void ParsingEngine<B, W>::AdjustBufferEndForIncompleteSequences() {
 
372
}
 
373
 
 
374
template <>
 
375
void ParsingEngine<UTF8_Buffer, UTF_8>::AdjustBufferEndForIncompleteSequences() {
 
376
        if (*(cur()-1) >= 0xC0) buffer_rel_pos--;
 
377
        else if (*(cur()-2) >= 0xE0) buffer_rel_pos -= 2;
 
378
        else if (*(cur()-3) >= 0xF0) buffer_rel_pos -= 3;
 
379
}
 
380
 
 
381
template <>
 
382
void ParsingEngine<U16_Buffer, UTF_8>::AdjustBufferEndForIncompleteSequences() {
 
383
        unsigned short last_u16_unit = *(GetCodeUnitPtr(AbsPos()-1));
 
384
        if ((last_u16_unit >= 0xD800) & (last_u16_unit <= 0xDC00)) buffer_rel_pos--;
 
385
}
 
386
 
 
387
template <>
 
388
void ParsingEngine<UTF8_Buffer, UTF_16>::AdjustBufferEndForIncompleteSequences() {
 
389
        if (*(cur()-1) >= 0xC0) buffer_rel_pos--;
 
390
        else if (*(cur()-2) >= 0xE0) buffer_rel_pos -= 2;
 
391
        else if (*(cur()-3) >= 0xF0) buffer_rel_pos -= 3;
 
392
}
 
393
 
 
394
template <>
 
395
void ParsingEngine<U16_Buffer, UTF_16>::AdjustBufferEndForIncompleteSequences() {
 
396
        unsigned short last_u16_unit = *(GetCodeUnitPtr(AbsPos()-1));
 
397
        if ((last_u16_unit >= 0xD800) & (last_u16_unit <= 0xDC00)) buffer_rel_pos--;
 
398
}
 
399
 
 
400
 
 
401
 
 
402
#ifdef OPTIMIZE_SHORT_SCAN
 
403
//
//  Inline ScanTo with unrolled first test that should almost always
//  succeed for short scans.
//
//  ScanTo(item): move buffer_rel_pos forward to the next marked position in
//  buf->item_stream[item].  The segment starting at the current position is
//  tested first; only when it is empty does the scan fall back to
//  bitstream_scan, refilling the buffers each time the scan runs to
//  BUFFER_SIZE.  Must be expanded inside a ParsingEngine<B, W> member.
#define ScanTo(item) \
do {\
        unsigned long first_segment = bitstream_segment_from(buf->item_stream[item], buffer_rel_pos);\
        if (first_segment == 0) {\
                buffer_rel_pos = (buffer_rel_pos & -8) + 8*sizeof(unsigned long);\
                buffer_rel_pos = bitstream_scan(buf->item_stream[item], buffer_rel_pos);\
                while (buffer_rel_pos >= BUFFER_SIZE) {\
                        buffer_rel_pos = BUFFER_SIZE;\
                        AdjustBufferEndForIncompleteSequences();\
                        Parser_Interface<W>::FinalizeBuffer_action();\
                        AdvanceBuffers();\
                        buffer_rel_pos = bitstream_scan(buf->item_stream[item], buffer_rel_pos);\
                }\
        }\
        else buffer_rel_pos += cfzl(first_segment);\
} while(0)
 
422
 
 
423
// The following version seems cleaner, but measured mispredictions are higher
 
424
// #define ScanTo(item) \
 
425
// do {\
 
426
//      unsigned long segment = bitstream_segment_from(buf->item_stream[item], buffer_rel_pos);\
 
427
//      while (unlikely (segment == 0)) {\
 
428
//              buffer_rel_pos = (buffer_rel_pos & -8) + 8*sizeof(unsigned long);\
 
429
//              if (buffer_rel_pos >= BUFFER_SIZE) {\
 
430
//                      buffer_rel_pos = BUFFER_SIZE;\
 
431
//                      AdjustBufferEndForIncompleteSequences();\
 
432
//                      Parser_Interface<W>::FinalizeBuffer_action();\
 
433
//                      AdvanceBuffers();\
 
434
//              }\
 
435
//              segment = bitstream_segment_from(buf->item_stream[item], buffer_rel_pos);\
 
436
//      }\
 
437
//      buffer_rel_pos += cfzl(segment);\
 
438
// } while(0)
 
439
//
 
440
// #define ScanTextTo(item) \
 
441
// do {\
 
442
//      unsigned long segment = bitstream_segment_from(buf->item_stream[item], buffer_rel_pos);\
 
443
//      text_or_markup_start = AbsPos();\
 
444
//      if (segment != 0) buffer_rel_pos += cfzl(segment);\
 
445
//      else {\
 
446
//              buffer_rel_pos = (buffer_rel_pos & -8) + 8*sizeof(unsigned long);\
 
447
//              buffer_rel_pos = bitstream_scan(buf->item_stream[item], buffer_rel_pos);\
 
448
//              while (buffer_rel_pos >= BUFFER_SIZE) {\
 
449
//                      buffer_rel_pos = BUFFER_SIZE;\
 
450
//                      AdjustBufferEndForIncompleteSequences();\
 
451
//                      Text_action(GetCodeUnitPtr(text_or_markup_start), LengthFrom(text_or_markup_start), true);\
 
452
//                      text_or_markup_start = AbsPos();\
 
453
//                      Parser_Interface<W>::FinalizeBuffer_action();\
 
454
//                      AdvanceBuffers();\
 
455
//                      buffer_rel_pos = bitstream_scan(buf->item_stream[item], buffer_rel_pos);\
 
456
//              }\
 
457
//      }\
 
458
// } while(0)
 
459
 
 
460
template <class B, WorkingCharacterSet W>
 
461
inline void ParsingEngine<B, W>::ScanTextTo(int item) {
 
462
        text_or_markup_start = AbsPos();
 
463
        unsigned long segment = bitstream_segment_from(buf->item_stream[item], buffer_rel_pos);
 
464
        if (segment != 0) buffer_rel_pos += cfzl(segment);
 
465
        else {
 
466
                buffer_rel_pos = (buffer_rel_pos & -8) + 8*sizeof(unsigned long);
 
467
                buffer_rel_pos = bitstream_scan(buf->item_stream[item], buffer_rel_pos);
 
468
                while (buffer_rel_pos >= BUFFER_SIZE) {
 
469
                        buffer_rel_pos = BUFFER_SIZE;
 
470
                        AdjustBufferEndForIncompleteSequences();
 
471
                        Text_action(GetCodeUnitPtr(text_or_markup_start), LengthFrom(text_or_markup_start), true);
 
472
                        text_or_markup_start = AbsPos();
 
473
                        Parser_Interface<W>::FinalizeBuffer_action();
 
474
                        AdvanceBuffers();
 
475
                        buffer_rel_pos = bitstream_scan(buf->item_stream[item], buffer_rel_pos);
 
476
                }
 
477
        }
 
478
}
 
479
 
 
480
#endif
 
481
 
 
482
#ifndef OPTIMIZE_SHORT_SCAN
 
483
 
 
484
// #define ScanTo(item) \
 
485
// do {\
 
486
//   buffer_rel_pos = bitstream_scan(buf->item_stream[item], buffer_rel_pos);\
 
487
//   while (buffer_rel_pos >= BUFFER_SIZE) {\
 
488
//      AdjustBufferEndForIncompleteSequences();\
 
489
//      Parser_Interface<W>::FinalizeBuffer_action();\
 
490
//      AdvanceBuffers();\
 
491
//      buffer_rel_pos = bitstream_scan(buf->item_stream[item], buffer_rel_pos);\
 
492
//   }\
 
493
// } while(0)
 
494
 
 
495
 
 
496
template <class B, WorkingCharacterSet W>
 
497
inline void ParsingEngine<B, W>::ScanTo(int item) {
 
498
        buffer_rel_pos = bitstream_scan(buf->item_stream[item], buffer_rel_pos);
 
499
        while (buffer_rel_pos >= BUFFER_SIZE) {
 
500
                AdjustBufferEndForIncompleteSequences();
 
501
                Parser_Interface<W>::FinalizeBuffer_action();
 
502
                AdvanceBuffers();
 
503
                buffer_rel_pos = bitstream_scan(buf->item_stream[item], buffer_rel_pos);
 
504
        }
 
505
}
 
506
 
 
507
template <class B, WorkingCharacterSet W>
 
508
inline void ParsingEngine<B, W>::ScanTextTo(int item) {
 
509
        text_or_markup_start = AbsPos();
 
510
        buffer_rel_pos = bitstream_scan(buf->item_stream[item], buffer_rel_pos);
 
511
        while (buffer_rel_pos >= BUFFER_SIZE) {
 
512
                AdjustBufferEndForIncompleteSequences();
 
513
                Text_action(GetCodeUnitPtr(text_or_markup_start), LengthFrom(text_or_markup_start), true);
 
514
                text_or_markup_start = AbsPos();
 
515
                Parser_Interface<W>::FinalizeBuffer_action();
 
516
                AdvanceBuffers();
 
517
                buffer_rel_pos = bitstream_scan(buf->item_stream[item], buffer_rel_pos);
 
518
        }
 
519
}
 
520
#endif
 
521
 
 
522
template <class B, WorkingCharacterSet W>
 
523
void ParsingEngine<B, W>::WF_Error (XML_Constraint errCode) {
 
524
        printf("Error at position %i in input.\n", AbsPos());
 
525
        ShowConstraintError(errCode);
 
526
        exit(-1);
 
527
//      Error_action(GetCodeUnitPtr(text_or_markup_start), LengthFrom(text_or_markup_start));
 
528
}
 
529
 
 
530
 
 
531
template <class B, WorkingCharacterSet W>
 
532
void ParsingEngine<B, W>::Validity_Error (XML_Constraint errCode) {
 
533
        printf("Error at position %i in input.\n", AbsPos());
 
534
        ShowConstraintError(errCode);
 
535
        exit(-1);
 
536
//      Error_action(GetCodeUnitPtr(text_or_markup_start), LengthFrom(text_or_markup_start));
 
537
}
 
538
 
 
539
template <class B, WorkingCharacterSet W>
 
540
void ParsingEngine<B, W>::Syntax_Error (XML_NonTerminal errNT) {
 
541
        printf("Error at position %i in input.\n", AbsPos());
 
542
        ShowSyntaxError(errNT);
 
543
        exit(-1);
 
544
//      Error_action(GetCodeUnitPtr(text_or_markup_start), LengthFrom(text_or_markup_start));
 
545
}
 
546
 
 
547
 
 
548
/* Parse a comment beginning "<!--" */
 
549
template <class B, WorkingCharacterSet W>
 
550
void ParsingEngine<B, W>::Parse_Comment() {
 
551
 
 
552
        Advance(4); /* Skip "<!--". */
 
553
        ScanTo(Hyphen);
 
554
        while (!at_DoubleHyphen<B::Base>(cur())) {
 
555
                if(at_EOF())
 
556
                        Syntax_Error(NT_CDSect);
 
557
                Advance(2); /* Skip hyphen-nonhyphen pair */
 
558
                ScanTo(Hyphen);
 
559
        }
 
560
        if (at_Comment_End<B::Base>(cur())) {
 
561
                Advance(3); /* Skip "-->". */
 
562
                Comment_action(GetCodeUnitPtr(text_or_markup_start), LengthFrom(text_or_markup_start));
 
563
        }
 
564
        else {
 
565
                Advance(2);  /* "--" */
 
566
                Syntax_Error(NT_Comment);
 
567
        }
 
568
}
 
569
 
 
570
/* Parse an end tag beginning "</" */
 
571
template <class B, WorkingCharacterSet W>
 
572
inline void ParsingEngine<B, W>::Parse_EndTag() {
 
573
        Advance(2); /* Skip "</". */
 
574
        int nameID = Parse_Name();
 
575
        if (AtChar<B::Base,'>'>(cur())) {
 
576
                Advance(1);
 
577
                EndTag_action(GetCodeUnitPtr(text_or_markup_start), LengthFrom(text_or_markup_start));
 
578
        }
 
579
        else {
 
580
                ScanTo(NonWS);
 
581
                if (AtChar<B::Base,'>'>(cur())) {
 
582
                        Advance(1);
 
583
                        EndTag_action(GetCodeUnitPtr(text_or_markup_start), LengthFrom(text_or_markup_start));
 
584
                }
 
585
                else Syntax_Error(NT_ETag);
 
586
        }
 
587
}
 
588
 
 
589
/* Parse a CDATA section beginning "<![CDATA". */
 
590
template <class B, WorkingCharacterSet W>
 
591
void ParsingEngine<B, W>::Parse_CDATA() {
 
592
                Advance(8); /* Skip "<![CDATA". */
 
593
        if (!AtChar<B::Base,'['>(cur())) {
 
594
                Syntax_Error(NT_CDStart);
 
595
        }
 
596
        else {
 
597
                Advance(1);
 
598
                CDATA_start_action(GetCodeUnitPtr(text_or_markup_start));
 
599
                text_or_markup_start = AbsPos();
 
600
                ScanTextTo(CD_End_check);
 
601
                while (!at_CDATA_End<B::Base>(cur())) {
 
602
                        if (at_EOF())
 
603
                                Syntax_Error(NT_CDSect);
 
604
                        Advance(1);
 
605
                        ScanTextTo(CD_End_check);
 
606
                }
 
607
                Text_action(GetCodeUnitPtr(text_or_markup_start), LengthFrom(text_or_markup_start), true);
 
608
                Advance(3); /* Skip "]]>". */
 
609
                CDATA_end_action(GetCodeUnitPtr(AbsPos()));
 
610
        }
 
611
}
 
612
 
 
613
template <class B, WorkingCharacterSet W>
 
614
void ParsingEngine<B, W>::Parse_EntityRef() {
 
615
    Advance(1);  // skip "&"
 
616
        int nameID = Parse_Name();  /* Name delimiter */
 
617
    if (!AtChar<B::Base,';'>(cur())) {
 
618
                Syntax_Error(NT_Reference);
 
619
    }
 
620
        else {
 
621
                Advance(1);
 
622
                Reference_action(GetCodeUnitPtr(text_or_markup_start), LengthFrom(text_or_markup_start));
 
623
 
 
624
                //      The following code will replace Reference_Action.
 
625
                GEntity_info * this_info;
 
626
                Parser_Interface<W> * entity_parser;
 
627
                int entityID = Parser_Interface<W>::model_info->GlobalGEntityTable[nameID];
 
628
                if (entityID == 0)
 
629
                        WF_Error(wfErr_wf_entdeclared);
 
630
                else{
 
631
                        this_info = Parser_Interface<W>::model_info->GEntityData[entityID-1];
 
632
                        if (this_info->is_external){
 
633
 
 
634
                        if (Parser_Interface<W>::entity_Info->standalone != Standalone_no)
 
635
                                WF_Error(wfErr_NoExternalRefs);
 
636
                        else {
 
637
                                        entity_parser = ParserFactory(this_info->systemLiteral, Parser_Interface<W>::model_info);
 
638
                                        entity_parser->Parse_WF_Content();
 
639
                                        if(!entity_parser->at_EOF())
 
640
                                                Syntax_Error(NT_content);
 
641
                                        entity_parser->~Parser_Interface<W>();
 
642
                        }
 
643
                        }
 
644
                        else {
 
645
                                if (this_info->is_simple == true);
 
646
//                                      printf("Entity is %s\n",this_info->ReplacementText);
 
647
                                else{
 
648
//                                      printf("Not a simple text: %s\n",this_info->ReplacementText);
 
649
                                        entity_parser = ParserFactory(this_info->ReplacementText, strlen(this_info->ReplacementText),Parser_Interface<W>::entity_Info, Parser_Interface<W>::model_info);
 
650
                                        entity_parser->Parse_WF_Content();
 
651
                                        if(!entity_parser->at_EOF())
 
652
                                                Syntax_Error(NT_content);
 
653
                                        entity_parser->~Parser_Interface<W>();
 
654
                                }
 
655
                        }
 
656
                }
 
657
 
 
658
        }
 
659
}
 
660
 
 
661
template <class B, WorkingCharacterSet W>
 
662
void ParsingEngine<B, W>::Parse_EntityRef_inMixed(symbol_set_t elems) {
 
663
    Advance(1);  // skip "&"
 
664
        int nameID = Parse_Name();  /* Name delimiter */
 
665
    if (!AtChar<B::Base,';'>(cur())) {
 
666
                Syntax_Error(NT_Reference);
 
667
    }
 
668
        else {
 
669
                Advance(1);
 
670
                Reference_action(GetCodeUnitPtr(text_or_markup_start), LengthFrom(text_or_markup_start));
 
671
 
 
672
                //      The following code will replace Reference_Action.
 
673
                GEntity_info * this_info;
 
674
                Parser_Interface<W> * entity_parser;
 
675
                int entityID = Parser_Interface<W>::model_info->GlobalGEntityTable[nameID];
 
676
                if (entityID == 0)
 
677
                        WF_Error(wfErr_wf_entdeclared);
 
678
                else{
 
679
                        this_info = Parser_Interface<W>::model_info->GEntityData[entityID-1];
 
680
                        if (this_info->is_external){
 
681
 
 
682
                        if (Parser_Interface<W>::entity_Info->standalone != Standalone_no)
 
683
                                WF_Error(wfErr_NoExternalRefs);
 
684
                        else {
 
685
                                        entity_parser = ParserFactory(this_info->systemLiteral, Parser_Interface<W>::model_info);
 
686
                                        entity_parser->Parse_MixedContent(elems);
 
687
                                        if(!entity_parser->at_EOF())
 
688
                                                Syntax_Error(NT_content);
 
689
                                        entity_parser->~Parser_Interface<W>();
 
690
                        }
 
691
                        }
 
692
                        else {
 
693
                                if (this_info->is_simple == true);
 
694
//                                      printf("Entity is %s\n",this_info->ReplacementText);
 
695
                                else{
 
696
//                                      printf("Not a simple text: %s\n",this_info->ReplacementText);
 
697
                                        entity_parser = ParserFactory(this_info->ReplacementText, strlen(this_info->ReplacementText),Parser_Interface<W>::entity_Info, Parser_Interface<W>::model_info);
 
698
                                        entity_parser->Parse_MixedContent(elems);
 
699
                                        if(!entity_parser->at_EOF())
 
700
                                                Syntax_Error(NT_content);
 
701
                                        entity_parser->~Parser_Interface<W>();
 
702
                                }
 
703
                        }
 
704
                }
 
705
 
 
706
        }
 
707
}
 
708
 
 
709
template <class B, WorkingCharacterSet W>
 
710
void ParsingEngine<B, W>::Parse_EntityRef_inAnyContent() {
 
711
    Advance(1);  // skip "&"
 
712
        int nameID = Parse_Name();  /* Name delimiter */
 
713
    if (!AtChar<B::Base,';'>(cur())) {
 
714
                Syntax_Error(NT_Reference);
 
715
    }
 
716
        else {
 
717
                Advance(1);
 
718
                Reference_action(GetCodeUnitPtr(text_or_markup_start), LengthFrom(text_or_markup_start));
 
719
 
 
720
                //      The following code will replace Reference_Action.
 
721
                GEntity_info * this_info;
 
722
                Parser_Interface<W> * entity_parser;
 
723
                int entityID = Parser_Interface<W>::model_info->GlobalGEntityTable[nameID];
 
724
                if (entityID == 0)
 
725
                        WF_Error(wfErr_wf_entdeclared);
 
726
                else{
 
727
                        this_info = Parser_Interface<W>::model_info->GEntityData[entityID-1];
 
728
                        if (this_info->is_external){
 
729
 
 
730
                        if (Parser_Interface<W>::entity_Info->standalone != Standalone_no)
 
731
                                WF_Error(wfErr_NoExternalRefs);
 
732
                        else {
 
733
                                        entity_parser = ParserFactory(this_info->systemLiteral, Parser_Interface<W>::model_info);
 
734
                                        entity_parser->Parse_AnyContent();
 
735
                                        if(!entity_parser->at_EOF())
 
736
                                                Syntax_Error(NT_content);
 
737
                                        entity_parser->~Parser_Interface<W>();
 
738
                        }
 
739
                        }
 
740
                        else {
 
741
                                if (this_info->is_simple == true);
 
742
//                                      printf("Entity is %s\n",this_info->ReplacementText);
 
743
                                else{
 
744
//                                      printf("Not a simple text: %s\n",this_info->ReplacementText);
 
745
                                        entity_parser = ParserFactory(this_info->ReplacementText, strlen(this_info->ReplacementText),Parser_Interface<W>::entity_Info, Parser_Interface<W>::model_info);
 
746
                                        entity_parser->Parse_AnyContent();
 
747
                                        if(!entity_parser->at_EOF())
 
748
                                                Syntax_Error(NT_content);
 
749
                                        entity_parser->~Parser_Interface<W>();
 
750
                                }
 
751
                        }
 
752
                }
 
753
 
 
754
        }
 
755
}
 
756
 
 
757
template <class B, WorkingCharacterSet W>
 
758
void ParsingEngine<B, W>::Parse_ValidEntityRef(CM_RegExp * cre, int & cur_state) {
 
759
    Advance(1);  // skip "&"
 
760
        int nameID = Parse_Name();  /* Name delimiter */
 
761
    if (!AtChar<B::Base,';'>(cur())) {
 
762
                Syntax_Error(NT_Reference);
 
763
    }
 
764
        else {
 
765
                Advance(1);
 
766
                Reference_action(GetCodeUnitPtr(text_or_markup_start), LengthFrom(text_or_markup_start));
 
767
 
 
768
                //      The following code will replace Reference_Action.
 
769
                GEntity_info * this_info;
 
770
                Parser_Interface<W> * entity_parser;
 
771
                int entityID = Parser_Interface<W>::model_info->GlobalGEntityTable[nameID];
 
772
                if (entityID == 0)
 
773
                        WF_Error(wfErr_wf_entdeclared);
 
774
                else{
 
775
                        this_info = Parser_Interface<W>::model_info->GEntityData[entityID-1];
 
776
                        if (this_info->is_external){
 
777
 
 
778
                        if (Parser_Interface<W>::entity_Info->standalone != Standalone_no)
 
779
                                WF_Error(wfErr_NoExternalRefs);
 
780
                        else {
 
781
                                        entity_parser = ParserFactory(this_info->systemLiteral, Parser_Interface<W>::model_info);
 
782
                                        entity_parser->Parse_ValidContent(cre, cur_state);
 
783
                                        if(!entity_parser->at_EOF())
 
784
                                                Syntax_Error(NT_content);
 
785
                                        entity_parser->~Parser_Interface<W>();
 
786
                        }
 
787
                        }
 
788
                        else {
 
789
                                if (this_info->is_simple == true);
 
790
//                                      printf("Entity is %s\n",this_info->ReplacementText);
 
791
                                else{
 
792
//                                      printf("Not a simple text: %s\n",this_info->ReplacementText);
 
793
                                        entity_parser = ParserFactory(this_info->ReplacementText, strlen(this_info->ReplacementText),Parser_Interface<W>::entity_Info, Parser_Interface<W>::model_info);
 
794
                                        entity_parser->Parse_ValidContent(cre, cur_state);
 
795
                                        if(!entity_parser->at_EOF())
 
796
                                                Syntax_Error(NT_content);
 
797
                                        entity_parser->~Parser_Interface<W>();
 
798
                                }
 
799
                        }
 
800
                }
 
801
 
 
802
        }
 
803
}
 
804
 
 
805
template <class B, WorkingCharacterSet W>
 
806
void ParsingEngine<B, W>::Parse_CharRef() {
 
807
        Advance(2);  // skip "&#"
 
808
        int ch_val = 0;
 
809
        if (AtChar<B::Base,'x'>(cur())) {
 
810
                Advance(1);
 
811
                while(at_HexDigit<B::Base>(cur())){
 
812
                        ch_val = HexVal<B::Base>(cur()[0]) + (ch_val<<4);
 
813
                        if (ch_val> 0x10FFFF )
 
814
                                WF_Error(wfErr_wf_Legalchar);
 
815
                        Advance(1);
 
816
                }
 
817
        }
 
818
        else {
 
819
                while(at_Digit<B::Base>(cur())){
 
820
                        ch_val = DigitVal<B::Base>(cur()[0]) + ch_val*10;
 
821
                        if (ch_val> 0x10FFFF )
 
822
                                WF_Error(wfErr_wf_Legalchar);
 
823
                        Advance(1);
 
824
                }
 
825
        }
 
826
        if ((ch_val == 0x0) || ((ch_val | 0x7FF) == 0xDFFF)|| ((ch_val | 0x1) == 0xFFFF))
 
827
                                WF_Error(wfErr_wf_Legalchar);
 
828
                else  if (Parser_Interface<W>::entity_Info->version != XML_1_1)
 
829
                        if (((ch_val < 0x20) && (ch_val != 0x9) && (ch_val != 0xD) && (ch_val != 0xA)))
 
830
                                WF_Error(wfErr_wf_Legalchar);
 
831
 
 
832
        if (!AtChar<B::Base,';'>(cur())) {
 
833
                        Syntax_Error(NT_CharRef);
 
834
        }
 
835
                else {
 
836
                        Advance(1);
 
837
                        Reference_action(GetCodeUnitPtr(text_or_markup_start), LengthFrom(text_or_markup_start));
 
838
                }
 
839
}
 
840
 
 
841
template <class B, WorkingCharacterSet W>
 
842
void ParsingEngine<B, W>::Parse_PI (){
 
843
        int nameID;
 
844
        Advance(2); /* Skip "<?". */
 
845
        int target_start = AbsPos();
 
846
        if (at_XxMmLll<B::Base>(cur())) {
 
847
                nameID = Parse_Name();
 
848
                if (AbsPos() - target_start == 3) Syntax_Error(NT_PI);
 
849
        }
 
850
        else nameID = Parse_Name();
 
851
        PI_Target_action(GetCodeUnitPtr(target_start), LengthFrom(target_start));
 
852
        if (!at_PI_End<B::Base>(cur())) requireWS();
 
853
        ScanTo(QMark);
 
854
        while (!at_PI_End<B::Base>(cur())) {
 
855
                if(at_EOF())
 
856
                        Syntax_Error(NT_PI);
 
857
                Advance(1);
 
858
                ScanTo(QMark);
 
859
        }
 
860
        Advance(2); /* Skip "?>". */
 
861
        PI_action(GetCodeUnitPtr(text_or_markup_start), LengthFrom(text_or_markup_start));
 
862
}
 
863
 
 
864
/* Parse a start or empty element tag. */
 
865
template <class B, WorkingCharacterSet W>
 
866
inline void ParsingEngine<B, W>::Parse_StartTag (){
 
867
        int att_name_start;
 
868
        int att_val_start;
 
869
        int att_name_end, att_val_end;
 
870
        unsigned char quoteCh;
 
871
        Advance(1);
 
872
        int nameID = Parse_Name();  /* Name delimiter: WS, "/" or ">" */
 
873
        ElementName_action(GetCodeUnitPtr(text_or_markup_start+1), LengthFrom(text_or_markup_start+1));
 
874
        /* The following test optimizes the most common case of a
 
875
        start tag with no attributes.  */
 
876
        if (AtChar<B::Base,'>'>(cur())) {
 
877
                Advance(1);
 
878
                StartTag_action(GetCodeUnitPtr(text_or_markup_start), LengthFrom(text_or_markup_start));
 
879
        }
 
880
        else {
 
881
                ScanTo(NonWS);
 
882
                if (AtChar<B::Base,'>'>(cur())) {
 
883
                        Advance(1);
 
884
                        StartTag_action(GetCodeUnitPtr(text_or_markup_start), LengthFrom(text_or_markup_start));
 
885
                }
 
886
                else if (at_EmptyElementDelim<B::Base>(cur())) {
 
887
                        Advance(2);
 
888
                        EmptyElement_action(GetCodeUnitPtr(text_or_markup_start), LengthFrom(text_or_markup_start));
 
889
                }
 
890
                else do {
 
891
                        /* Must be an attribute-value pair or error. */
 
892
                        att_name_start = AbsPos();
 
893
                        int att_nameID = Parse_Name();
 
894
                        att_name_end = AbsPos();
 
895
 
 
896
                        int attID = Parser_Interface<W>::model_info->getOrInsertGlobalAttName(att_nameID);
 
897
                        if (attID >= LastAttOccurrence.size()) LastAttOccurrence.push_back(0);
 
898
                        else {
 
899
                                if (LastAttOccurrence[attID] > text_or_markup_start) {
 
900
                                        WF_Error(wfErr_uniqattspec); /* Duplicate attribute. */
 
901
                                        break;
 
902
                                }
 
903
                        }
 
904
                        LastAttOccurrence[attID] = att_name_start;
 
905
                        /* The following optimized tests handle the frequently occurring
 
906
                        case that there are no blanks on either side of the equals sign.
 
907
                        In many cases, the very first test handles 100% of actual
 
908
                        attribute-value pairs encountered. */
 
909
                        if (at_EqualsQuote<B::Base>(cur())) Advance(1);
 
910
                        else {
 
911
                                ScanTo(NonWS);
 
912
                                if (!AtChar<B::Base,'='>(cur())) {
 
913
                                        Syntax_Error(NT_STag);
 
914
                                        break;
 
915
                                }
 
916
                                Advance(1);
 
917
                                ScanTo(NonWS);
 
918
                                if (!AtQuote<B::Base>(cur())) {
 
919
                                        Syntax_Error(NT_STag);
 
920
                                        break;
 
921
                                }
 
922
                        }
 
923
                        att_val_start = AbsPos()+1;
 
924
                        Parse_AttValue();
 
925
                        att_val_end = AbsPos()-1;
 
926
                        if (at_xmlns<B::Base>(cur()+att_name_start-AbsPos())) {
 
927
                                Namespace_action(GetCodeUnitPtr(att_name_start), att_name_end - att_name_start,
 
928
                                                 GetCodeUnitPtr(att_val_start), att_val_end - att_val_start);
 
929
                        }
 
930
                        else {
 
931
                                AttributeValue_action(GetCodeUnitPtr(att_name_start), att_name_end - att_name_start,
 
932
                                                 GetCodeUnitPtr(att_val_start), att_val_end - att_val_start);
 
933
                        }
 
934
                        /* Now check for end or repeat. Avoid whitespace scan if possible.*/
 
935
                        if (AtChar<B::Base,'>'>(cur())) {
 
936
                                Advance(1);
 
937
                                StartTag_action(GetCodeUnitPtr(text_or_markup_start), LengthFrom(text_or_markup_start));
 
938
                                break;
 
939
                        }
 
940
                        else if (at_EmptyElementDelim<B::Base>(cur())) {
 
941
                                Advance(2);
 
942
                                EmptyElement_action(GetCodeUnitPtr(text_or_markup_start), LengthFrom(text_or_markup_start));
 
943
                                break;
 
944
                        }
 
945
                        ScanTo(NonWS);
 
946
                        if (AtChar<B::Base,'>'>(cur())) {
 
947
                                Advance(1);
 
948
                                StartTag_action(GetCodeUnitPtr(text_or_markup_start), LengthFrom(text_or_markup_start));
 
949
                                break;
 
950
                        }
 
951
                        else if (at_EmptyElementDelim<B::Base>(cur())) {
 
952
                                Advance(2);
 
953
                                EmptyElement_action(GetCodeUnitPtr(text_or_markup_start), LengthFrom(text_or_markup_start));
 
954
                                break;
 
955
                        }
 
956
                        else if (AbsPos() == att_val_end + 1) {
 
957
                                /* No WS following att value */
 
958
                                Syntax_Error(NT_STag);
 
959
                                break;
 
960
                        }
 
961
                } while (1);
 
962
        }
 
963
}
 
964
 
 
965
template <class B, WorkingCharacterSet W>
 
966
inline void ParsingEngine<B, W>::text_if_nonnull_action(bool more){
 
967
        if (AbsPos() > text_or_markup_start) {
 
968
                Text_action(GetCodeUnitPtr(text_or_markup_start), LengthFrom(text_or_markup_start), more);
 
969
                text_or_markup_start = AbsPos();
 
970
        }
 
971
}
 
972
 
 
973
template <class B, WorkingCharacterSet W>
 
974
void ParsingEngine<B, W>::Parse_WF_EndTag(int nameID) {
 
975
        Advance(2);
 
976
        int end_nameID = Parse_Name();
 
977
        if(end_nameID != nameID)
 
978
                WF_Error(wfErr_GIMatch);
 
979
        if (AtChar<B::Base,'>'>(cur())) {
 
980
                Advance(1);
 
981
                Parser_Interface<W>::EndTag_action(GetCodeUnitPtr(text_or_markup_start), LengthFrom(text_or_markup_start));
 
982
        }
 
983
    else {
 
984
                ScanTo(NonWS);
 
985
                if (AtChar<B::Base,'>'>(cur())) {
 
986
                        Advance(1);
 
987
                        Parser_Interface<W>::EndTag_action(GetCodeUnitPtr(text_or_markup_start), LengthFrom(text_or_markup_start));
 
988
                }
 
989
                else Syntax_Error(NT_ETag);
 
990
    }
 
991
}
 
992
 
 
993
// template <>
 
994
// void ParsingEngine<UTF8_Buffer, UTF_8>::Parse_WF_EndTag(int nameID) {
 
995
//      Advance(2); /* Skip "</". */
 
996
//
 
997
//      int name_start = AbsPos();
 
998
// //   ScanTo(NameFollow);
 
999
// //   int lgth = AbsPos()-name_start;
 
1000
//
 
1001
// #if (not defined(OMISSION)) or ((OMISSION != END_TAG_MATCHING)  and (OMISSION != NAME_LOOKUP))
 
1002
//      char * start_elem_name = Parser_Interface<UTF_8>::model_info->symbol_table->Get_UTF8_name(nameID);
 
1003
//      int lgth = Parser_Interface<UTF_8>::model_info->symbol_table->Get_UTF8_lgth(nameID);
 
1004
//      char * end_elem_name = &((char *) x8data)[buffer_rel_pos];
 
1005
//
 
1006
// #ifdef TEMPLATED_SIMD_LIB
 
1007
//      BytePack byte_compare =  simd<8>::eq(sisd_load_unaligned((BytePack *) end_elem_name),
 
1008
//                                                                 sisd_load_unaligned((BytePack *) start_elem_name));
 
1009
// #endif
 
1010
// #ifndef TEMPLATED_SIMD_LIB
 
1011
//      BytePack byte_compare =  simd_eq_8(sisd_load_unaligned((BytePack *) end_elem_name),
 
1012
//                                                                 sisd_load_unaligned((BytePack *) start_elem_name));
 
1013
// #endif
 
1014
//      if (lgth < 16) {
 
1015
//              int expected_bits = ~(-1 << lgth);
 
1016
//          if ((_mm_movemask_epi8(byte_compare) & expected_bits) != expected_bits) {
 
1017
//                      WF_Error(wfErr_GIMatch);
 
1018
//          }
 
1019
//      }
 
1020
//      else {
 
1021
//          /* Must compare with bytes beyond the first 16.  Set up to
 
1022
//             compare 16 bytes at a time, with the first additional compare
 
1023
//             overlapping with the first byte_compare. */
 
1024
//          int pos = (lgth - 1) % PACKSIZE + 1;
 
1025
// #ifdef TEMPLATED_SIMD_LIB
 
1026
//          byte_compare =  simd_or(byte_compare, simd<8>::eq(sisd_load_unaligned((BytePack *) &end_elem_name[pos]),
 
1027
//                                                                                      sisd_load_unaligned((BytePack *) &start_elem_name[pos])));
 
1028
// #endif
 
1029
// #ifndef TEMPLATED_SIMD_LIB
 
1030
//          byte_compare =  simd_or(byte_compare, simd_eq_8(sisd_load_unaligned((BytePack *) &end_elem_name[pos]),
 
1031
//                                                                                      sisd_load_unaligned((BytePack *) &start_elem_name[pos])));
 
1032
// #endif
 
1033
//          pos += 16;
 
1034
//          while (pos < lgth) {
 
1035
//              if (_mm_movemask_epi8(byte_compare) != 0xFFFF) {
 
1036
//                      WF_Error(wfErr_GIMatch);
 
1037
//              }
 
1038
// #ifdef TEMPLATED_SIMD_LIB
 
1039
//              byte_compare =  simd<8>::eq(sisd_load_unaligned((BytePack *) &end_elem_name[pos]),
 
1040
//                                                sisd_load_unaligned((BytePack *) &start_elem_name[pos]));
 
1041
// #endif
 
1042
// #ifndef TEMPLATED_SIMD_LIB
 
1043
//              byte_compare =  simd_eq_8(sisd_load_unaligned((BytePack *) &end_elem_name[pos]),
 
1044
//                                                sisd_load_unaligned((BytePack *) &start_elem_name[pos]));
 
1045
// #endif
 
1046
//              pos += 16;
 
1047
//          }
 
1048
//          if (_mm_movemask_epi8(byte_compare) != 0xFFFF) {
 
1049
//                      WF_Error(wfErr_GIMatch);
 
1050
//          }
 
1051
//      }
 
1052
//      Advance(lgth);
 
1053
//
 
1054
// #endif
 
1055
// #if defined(OMISSION) and ((OMISSION == END_TAG_MATCHING) or (OMISSION == NAME_LOOKUP))
 
1056
//      ScanTo(NameFollow);
 
1057
// #endif
 
1058
// //   for(int i=0; i<lgth; i++) {
 
1059
// //           if (start_elem_name[i] != end_elem_name[i])
 
1060
// //                   WF_Error(wfErr_GIMatch);
 
1061
// //   }
 
1062
// //   if (start_elem_name[lgth] != '\0') WF_Error(wfErr_GIMatch);
 
1063
//
 
1064
//      if (AtChar<ASCII,'>'>(cur())) {
 
1065
//              Advance(1);
 
1066
//              Parser_Interface<UTF_8>::EndTag_action(GetCodeUnitPtr(text_or_markup_start), LengthFrom(text_or_markup_start));
 
1067
//      }
 
1068
//     else {
 
1069
//              ScanTo(NonWS);
 
1070
//              if (AtChar<ASCII,'>'>(cur())) {
 
1071
//                      Advance(1);
 
1072
//                      Parser_Interface<UTF_8>::EndTag_action(GetCodeUnitPtr(text_or_markup_start), LengthFrom(text_or_markup_start));
 
1073
//              }
 
1074
//              else Syntax_Error(NT_ETag);
 
1075
//     }
 
1076
// }
 
1077
 
 
1078
 
 
1079
/* Parse a valid start or empty element tag. */
 
1080
template <class B, WorkingCharacterSet W>
 
1081
int ParsingEngine<B, W>::Parse_WF_StartTag (bool& is_emptyStartTag){
 
1082
        int att_name_start;
 
1083
        int att_val_start;
 
1084
        int att_name_end, att_val_end;
 
1085
        unsigned char quoteCh;
 
1086
        Advance(1);
 
1087
 
 
1088
        #if (not defined(OMISSION)) or (OMISSION != NAME_LOOKUP)
 
1089
        int nameID = Parse_Name();
 
1090
        #endif
 
1091
        #if (defined(OMISSION)) and (OMISSION == NAME_LOOKUP)
 
1092
        ScanTo(NameFollow);
 
1093
        int nameID = 0;
 
1094
        #endif
 
1095
        ElementName_action(GetCodeUnitPtr(text_or_markup_start+1), LengthFrom(text_or_markup_start+1));
 
1096
        /* The following test optimizes the most common case of a
 
1097
        start tag with no attributes.  */
 
1098
        if (AtChar<B::Base,'>'>(cur())) {
 
1099
                Advance(1);
 
1100
                StartTag_action(GetCodeUnitPtr(text_or_markup_start), LengthFrom(text_or_markup_start));
 
1101
        }
 
1102
        else {
 
1103
                ScanTo(NonWS);
 
1104
                if (AtChar<B::Base,'>'>(cur())) {
 
1105
                        Advance(1);
 
1106
                        StartTag_action(GetCodeUnitPtr(text_or_markup_start), LengthFrom(text_or_markup_start));
 
1107
                }
 
1108
                else if (at_EmptyElementDelim<B::Base>(cur())) {
 
1109
                        Advance(2);
 
1110
                        is_emptyStartTag = true;
 
1111
                        EmptyElement_action(GetCodeUnitPtr(text_or_markup_start), LengthFrom(text_or_markup_start));
 
1112
                }
 
1113
                else do {
 
1114
                        /* Must be an attribute-value pair or error. */
 
1115
                        att_name_start = AbsPos();
 
1116
                        #if (not defined(OMISSION)) or (OMISSION != NAME_LOOKUP)
 
1117
                        int att_nameID = Parse_Name();
 
1118
                        #endif
 
1119
                        #if (defined(OMISSION)) and (OMISSION == NAME_LOOKUP)
 
1120
                        ScanTo(NameFollow);
 
1121
                        int att_nameID = 0;
 
1122
                        #endif
 
1123
            att_name_end = AbsPos();
 
1124
                #if (not defined(OMISSION)) or ((OMISSION != ATTRIBUTE_UNIQUENESS) and (OMISSION != NAME_LOOKUP))
 
1125
                        int attID = Parser_Interface<W>::model_info->getOrInsertGlobalAttName(att_nameID);
 
1126
                        if (attID >= LastAttOccurrence.size()) LastAttOccurrence.push_back(0);
 
1127
                        else {
 
1128
                                if (LastAttOccurrence[attID] > text_or_markup_start) {
 
1129
                                        WF_Error(wfErr_uniqattspec); /* Duplicate attribute. */
 
1130
                                        break;
 
1131
                                }
 
1132
                        }
 
1133
                        LastAttOccurrence[attID] = att_name_start;
 
1134
                 #endif
 
1135
                        /* The following optimized tests handle the frequently occurring
 
1136
                        case that there are no blanks on either side of the equals sign.
 
1137
                        In many cases, the very first test handles 100% of actual
 
1138
                        attribute-value pairs encountered. */
 
1139
                        if (at_EqualsQuote<B::Base>(cur())) Advance(1);
 
1140
                        else {
 
1141
                                ScanTo(NonWS);
 
1142
                                if (!AtChar<B::Base,'='>(cur())) {
 
1143
                                        Syntax_Error(NT_STag);
 
1144
                                        break;
 
1145
                                }
 
1146
                                Advance(1);
 
1147
                                ScanTo(NonWS);
 
1148
                                if (!AtQuote<B::Base>(cur())) {
 
1149
                                        Syntax_Error(NT_STag);
 
1150
                                        break;
 
1151
                                }
 
1152
                        }
 
1153
                        att_val_start = AbsPos()+1;
 
1154
                        Parse_AttValue();
 
1155
                        att_val_end = AbsPos()-1;
 
1156
                        if (at_xmlns<B::Base>(cur()+att_name_start-AbsPos())) {
 
1157
                                Namespace_action(GetCodeUnitPtr(att_name_start), att_name_end - att_name_start,
 
1158
                                                 GetCodeUnitPtr(att_val_start), att_val_end - att_val_start);
 
1159
                        }
 
1160
                        else {
 
1161
                                AttributeValue_action(GetCodeUnitPtr(att_name_start), att_name_end - att_name_start,
 
1162
                                                 GetCodeUnitPtr(att_val_start), att_val_end - att_val_start);
 
1163
                        }
 
1164
                        /* Now check for end or repeat. Avoid whitespace scan if possible.*/
 
1165
                        if (AtChar<B::Base,'>'>(cur())) {
 
1166
                                Advance(1);
 
1167
                                StartTag_action(GetCodeUnitPtr(text_or_markup_start), LengthFrom(text_or_markup_start));
 
1168
                                break;
 
1169
                        }
 
1170
                        else if (at_EmptyElementDelim<B::Base>(cur())) {
 
1171
                                Advance(2);
 
1172
                                is_emptyStartTag = true;
 
1173
                                EmptyElement_action(GetCodeUnitPtr(text_or_markup_start), LengthFrom(text_or_markup_start));
 
1174
                                break;
 
1175
                        }
 
1176
                        ScanTo(NonWS);
 
1177
                        if (AtChar<B::Base,'>'>(cur())) {
 
1178
                                Advance(1);
 
1179
                                StartTag_action(GetCodeUnitPtr(text_or_markup_start), LengthFrom(text_or_markup_start));
 
1180
                                break;
 
1181
                        }
 
1182
                        else if (at_EmptyElementDelim<B::Base>(cur())) {
 
1183
                                Advance(2);
 
1184
                                is_emptyStartTag = true;
 
1185
                                EmptyElement_action(GetCodeUnitPtr(text_or_markup_start), LengthFrom(text_or_markup_start));
 
1186
                                break;
 
1187
                        }
 
1188
                        else if (AbsPos() == att_val_end + 1) {
 
1189
                                /* No WS following att value */
 
1190
                                Syntax_Error(NT_STag);
 
1191
                                break;
 
1192
                        }
 
1193
                } while (1);
 
1194
        }
 
1195
        return nameID;
 
1196
}
 
1197
 
 
1198
 
 
1199
 
 
1200
template <class B, WorkingCharacterSet W>
 
1201
void ParsingEngine<B, W>::Parse_WF_Element() {
 
1202
        bool is_emptyStartTag = false;
 
1203
        int nameID = Parse_WF_StartTag(is_emptyStartTag);
 
1204
#ifdef DEBUG
 
1205
        printf("Parse_Element: nameID = %d, is_emptyStartTag=%i\n",nameID, is_emptyStartTag);
 
1206
#endif
 
1207
        if (!is_emptyStartTag) {
 
1208
                Parse_WF_Content();
 
1209
                Parse_WF_EndTag(nameID);
 
1210
        }
 
1211
}
 
1212
 
 
1213
 
 
1214
template <class B, WorkingCharacterSet W>
 
1215
void ParsingEngine<B, W>::Parse_WF_Content() {
 
1216
        do {
 
1217
                text_or_markup_start = AbsPos();
 
1218
                ScanTextTo(MarkupStart); /* '<', '&', or ']' for ']]>' test */
 
1219
                if (at_ElementTag_Start<B::Base>(cur())) {
 
1220
                        text_if_nonnull_action(false);
 
1221
                        Parse_WF_Element();
 
1222
                }
 
1223
                else if (at_EndTag_Start<B::Base>(cur())) {
 
1224
                        text_if_nonnull_action(false);
 
1225
                        return;
 
1226
                }
 
1227
                else if (at_Comment_Start<B::Base>(cur())) {
 
1228
                        text_if_nonnull_action(false);
 
1229
                        Parse_Comment();
 
1230
                }
 
1231
                else if (at_CharRef_Start<B::Base>(cur())) {
 
1232
                        text_if_nonnull_action(true);
 
1233
                        Parse_CharRef();
 
1234
                }
 
1235
                else if (AtChar<B::Base,'&'>(cur())) {
 
1236
                        text_if_nonnull_action(true);
 
1237
                        Parse_EntityRef();
 
1238
                }
 
1239
                else if (at_CDATA_Start<B::Base>(cur())) {
 
1240
                        text_if_nonnull_action(true);
 
1241
                        Parse_CDATA();
 
1242
                }
 
1243
                else if (at_PI_Start<B::Base>(cur())) {
 
1244
                        text_if_nonnull_action(false);
 
1245
                        Parse_PI();
 
1246
                }
 
1247
                else if (at_CDATA_End<B::Base>(cur())) {
 
1248
                        text_if_nonnull_action(true);
 
1249
                        Advance(3);
 
1250
                        Syntax_Error(NT_CharData);
 
1251
                }
 
1252
                else if (at_EOF()) {
 
1253
                        text_if_nonnull_action(false);
 
1254
                        return;
 
1255
                }
 
1256
                else if (AtChar<B::Base,'<'>(cur())) {
 
1257
                        Syntax_Error(NT_markupdecl);
 
1258
                }
 
1259
                else {
 
1260
                        Advance(1);
 
1261
                        continue;
 
1262
                }
 
1263
        } while (1);
 
1264
}
 
1265
 
 
1266
 
 
1267
#ifndef MARKUP_PASS_CONTROL
 
1268
#ifndef MARKUP_SORTING
 
1269
template <class B, WorkingCharacterSet W>
 
1270
void ParsingEngine<B, W>::ParseContent() {
 
1271
        Parser_Interface<W>::DocumentStart_action();
 
1272
        bool is_emptyStartTag = false;
 
1273
        do {
 
1274
                text_or_markup_start = AbsPos();
 
1275
                ScanTextTo(MarkupStart); /* '<', '&', or ']' for ']]>' test */
 
1276
/*              if (AtChar<B::Base,'<'>(cur())) {
 
1277
                        text_if_nonnull_action();
 
1278
                        Parse_Markup<B, W>();
 
1279
                }*/
 
1280
                if (at_ElementTag_Start<B::Base>(cur())) {
 
1281
                        text_if_nonnull_action(false);
 
1282
                        Parse_StartTag();
 
1283
                }
 
1284
                else if (at_EndTag_Start<B::Base>(cur())) {
 
1285
                        text_if_nonnull_action(false);
 
1286
                        Parse_EndTag();
 
1287
                }
 
1288
                else if (at_Comment_Start<B::Base>(cur())) {
 
1289
                        text_if_nonnull_action(false);
 
1290
                        Parse_Comment();
 
1291
                }
 
1292
                else if (at_CharRef_Start<B::Base>(cur())) {
 
1293
                        text_if_nonnull_action(true);
 
1294
                        Parse_CharRef();
 
1295
                }
 
1296
                else if (AtChar<B::Base,'&'>(cur())) {
 
1297
                        text_if_nonnull_action(true);
 
1298
                        Parse_EntityRef();
 
1299
                }
 
1300
                else if (at_CDATA_Start<B::Base>(cur())) {
 
1301
                        text_if_nonnull_action(true);
 
1302
                        Parse_CDATA();
 
1303
                }
 
1304
                else if (at_PI_Start<B::Base>(cur())) {
 
1305
                        text_if_nonnull_action(false);
 
1306
                        Parse_PI();
 
1307
                }
 
1308
                else if (at_CDATA_End<B::Base>(cur())) {
 
1309
                        text_if_nonnull_action(true);
 
1310
                        Advance(3);
 
1311
                        Syntax_Error(NT_CharData);
 
1312
                }
 
1313
                else if (at_EOF()) {
 
1314
                        text_if_nonnull_action(false);
 
1315
                        break;
 
1316
                }
 
1317
                else if (AtChar<B::Base,'<'>(cur())) {
 
1318
                        Syntax_Error(NT_markupdecl);
 
1319
                }
 
1320
                else {
 
1321
                        Advance(1);
 
1322
                        continue;
 
1323
                }
 
1324
        } while (1);
 
1325
        Parser_Interface<W>::DocumentEnd_action();
 
1326
}
 
1327
#endif
 
1328
#endif
 
1329
 
 
1330
template <class B, WorkingCharacterSet W>
 
1331
void ParsingEngine<B, W>::Parse_DocType (){
 
1332
 
 
1333
        int old_abspos, start_pos;
 
1334
        ScanTo(NonWS);
 
1335
        start_pos = AbsPos();
 
1336
 
 
1337
        if (at_DOCTYPE_start<B::Base>(cur()))
 
1338
        Advance(9);
 
1339
        else{
 
1340
//              printf("No Document definition!\n");
 
1341
                return;
 
1342
        }
 
1343
        requireWS();
 
1344
        int nameID = Parse_Name();
 
1345
 
 
1346
        old_abspos = AbsPos();
 
1347
    ScanTo(NonWS);
 
1348
    if(at_SYSTEM<B::Base>(cur())||at_PUBLIC<B::Base>(cur())){
 
1349
        Parser_Interface<W>::model_info->has_external_DTD = true;
 
1350
        if(old_abspos == AbsPos())
 
1351
                Syntax_Error(NT_doctypedecl);
 
1352
        Parse_ExternalID(Parser_Interface<W>::model_info->external_DTD_systemLiteral, Parser_Interface<W>::model_info->external_DTD_pubidLiteral);
 
1353
        Parser_Interface<W> * entity_parser;
 
1354
        entity_parser = ParserFactory(Parser_Interface<W>::model_info->external_DTD_systemLiteral, Parser_Interface<W>::model_info);
 
1355
                entity_parser->Parse_ExtSubsetDecl();
 
1356
                entity_parser->~Parser_Interface<W>();
 
1357
    }
 
1358
    else Parser_Interface<W>::model_info->has_external_DTD = false;
 
1359
    ScanTo(NonWS);
 
1360
 
 
1361
        if (AtChar<B::Base,'['>(cur())){
 
1362
                Advance(1);
 
1363
                Parse_IntSubset();
 
1364
                if (AtChar<B::Base,']'>(cur()))
 
1365
                        Advance(1);
 
1366
                else
 
1367
                Syntax_Error(NT_doctypedecl);
 
1368
                ScanTo(NonWS);
 
1369
        }
 
1370
 
 
1371
        if (AtChar<B::Base,'>'>(cur())){
 
1372
                Advance(1);
 
1373
 
 
1374
                CRE_Seq * rslt = new CRE_Seq();
 
1375
                rslt->subCMs.push_back(new CRE_Name(nameID));
 
1376
                CM_RegExp * cre = new CM_RegExp();
 
1377
                cre->content_re = rslt;
 
1378
 
 
1379
                int id_count = cre->content_re->Set_IDs(0);
 
1380
                cre->content_re->Set_First_Map();
 
1381
                symbol_set_t * transition_map = new symbol_set_t[id_count+1];
 
1382
                cre->content_re->follow_map[0] = id_count+1;
 
1383
 
 
1384
                cre->content_re->Set_Follow_Map(transition_map);
 
1385
                transition_map[0] = cre->content_re->first_map;
 
1386
                if (cre->content_re->matches_empty)
 
1387
                        transition_map[0][0]=id_count+1;
 
1388
 
 
1389
                cre -> transition_map = transition_map;
 
1390
 
 
1391
                Parser_Interface<W>::model_info->rootModel = cre;
 
1392
 
 
1393
                /* Check for notations that were used, but not defined by the end of the DTD. */
 
1394
                #if (VALIDATION_MODE == ON)
 
1395
                hash_map<int, int >::iterator j;
 
1396
                for (j=Parser_Interface<W>::model_info->GlobalNotationTable.begin(); j!=Parser_Interface<W>::model_info->GlobalNotationTable.end(); j++) {
 
1397
                        if (j->second == -1)
 
1398
                                Validity_Error(vErr_notatn);
 
1399
                }
 
1400
                #endif
 
1401
        }
 
1402
        else
 
1403
                Syntax_Error(NT_doctypedecl);
 
1404
}
 
1405
 
 
1406
template <class B, WorkingCharacterSet W>
 
1407
void ParsingEngine<B, W>::Parse_ExternalID (char *& systemLiteral, char *& pubidLiteral){
 
1408
        int quot_start, lgth;
 
1409
        if(at_SYSTEM<B::Base>(cur())){
 
1410
                Advance(6);
 
1411
                pubidLiteral = NULL;
 
1412
                requireWS();
 
1413
                if (!AtQuote<B::Base>(cur())) Syntax_Error(NT_ExternalID);
 
1414
                quot_start = AbsPos()+1;
 
1415
                Parse_SystemLiteral (); /*  SystemLiteral */
 
1416
                lgth = AbsPos() - quot_start - 1;
 
1417
                systemLiteral = copy_string(GetCodeUnitPtr(quot_start),lgth);
 
1418
        }
 
1419
        else if (at_PUBLIC<B::Base>(cur())){
 
1420
                Advance(6);
 
1421
                requireWS();
 
1422
                if (!AtQuote<B::Base>(cur())) Syntax_Error(NT_ExternalID);
 
1423
                quot_start = AbsPos()+1;
 
1424
                Parse_PubidLiteral ();/*  PubidLiteral */
 
1425
                lgth = AbsPos() - quot_start - 1;
 
1426
                pubidLiteral = copy_string(GetCodeUnitPtr(quot_start),lgth);
 
1427
                systemLiteral = NULL;
 
1428
                if (AtChar<B::Base, '>'>(cur())) return;
 
1429
                requireWS();
 
1430
                if (AtQuote<B::Base>(cur())) {
 
1431
                        quot_start = AbsPos()+1;
 
1432
                        Parse_SystemLiteral ();/*  SystemLiteral */
 
1433
                        lgth = AbsPos() - quot_start - 1;
 
1434
                        systemLiteral = copy_string(GetCodeUnitPtr(quot_start),lgth);
 
1435
                }
 
1436
        }
 
1437
        else
 
1438
                Syntax_Error(NT_ExternalID);
 
1439
}
 
1440
 
 
1441
template <class B, WorkingCharacterSet W>
 
1442
void ParsingEngine<B, W>::Parse_SystemLiteral (){
 
1443
        unsigned char quoteCh;
 
1444
        if(AtQuote<B::Base>(cur())){
 
1445
                quoteCh = cur()[0];
 
1446
                Advance(1);
 
1447
        }
 
1448
        ScanTo(Quote);
 
1449
        while (cur()[0] != quoteCh){
 
1450
                if(at_EOF())
 
1451
                        Syntax_Error(NT_SystemLiteral);
 
1452
                Advance(1);
 
1453
                ScanTo(Quote);
 
1454
        }
 
1455
        Advance(1);
 
1456
}
 
1457
 
 
1458
template <class B, WorkingCharacterSet W>
 
1459
void ParsingEngine<B, W>::Parse_PubidLiteral (){
 
1460
        unsigned char quoteCh;
 
1461
        quoteCh = cur()[0];
 
1462
        Advance(1);
 
1463
        while (at_PubidChar<B::Base>(cur()) && (cur()[0] != quoteCh)) {
 
1464
                Advance(1);
 
1465
        }
 
1466
        if (cur()[0] != quoteCh){
 
1467
                Syntax_Error(NT_PubidLiteral);
 
1468
        }
 
1469
        Advance(1);
 
1470
}
 
1471
 
 
1472
template <class B, WorkingCharacterSet W>
 
1473
void ParsingEngine<B, W>::Parse_IntSubset (){
 
1474
 
 
1475
        while(1){
 
1476
                ScanTo(NonWS);
 
1477
                text_or_markup_start = AbsPos();
 
1478
                if (AtChar<B::Base,'%'>(cur()))
 
1479
                        Parse_PEReference();
 
1480
                else if (at_PI_Start<B::Base>(cur())) {
 
1481
                        Parse_PI();
 
1482
                }
 
1483
                else if (at_Comment_Start<B::Base>(cur())) {
 
1484
                        Parse_Comment();
 
1485
                }
 
1486
                else if (AtChar<B::Base,'<'>(cur())){
 
1487
                        Advance(1);
 
1488
                        if(AtChar<B::Base,'!'>(cur())){
 
1489
                                Advance(1);
 
1490
                                if (at_ELEMENT<B::Base>(cur()))
 
1491
                                        Parse_Elementdecl();
 
1492
                                else if (at_ATTLIST<B::Base>(cur()))
 
1493
                                        Parse_AttlistDecl();
 
1494
                                else if (at_ENTITY<B::Base>(cur()))
 
1495
                                        Parse_Entitydecl();
 
1496
                                else if (at_NOTATION<B::Base>(cur()))
 
1497
                                        Parse_Notationdecl();
 
1498
                                else {
 
1499
                                        Syntax_Error(NT_markupdecl);
 
1500
                                }
 
1501
                        }
 
1502
                        else
 
1503
                                Syntax_Error(NT_markupdecl);
 
1504
                }
 
1505
                else if (AtChar<B::Base,']'>(cur())){
 
1506
                        break;
 
1507
                }
 
1508
                else
 
1509
                        Syntax_Error(NT_intSubset);
 
1510
        }
 
1511
}
 
1512
 
 
1513
 
 
1514
template <class B, WorkingCharacterSet W>
 
1515
void ParsingEngine<B, W>::Parse_PEReference (){
 
1516
 
 
1517
        Advance(1); /* Skip "%". */
 
1518
        fprintf(stderr,"Parameter Reference has not been completed yet.\n");
 
1519
        exit(-1);
 
1520
        int nameID = Parse_Name();
 
1521
        if (AtChar<B::Base,';'>(cur())) {
 
1522
                Advance(1);
 
1523
                PEReference_action(GetCodeUnitPtr(text_or_markup_start), LengthFrom(text_or_markup_start));
 
1524
                PEntity_info * this_info;
 
1525
                Parser_Interface<W> * entity_parser;
 
1526
                int entityID = Parser_Interface<W>::model_info->GlobalPEntityTable[nameID];
 
1527
                if (entityID == 0)
 
1528
                        WF_Error(wfErr_wf_entdeclared);
 
1529
                else{
 
1530
                        this_info = Parser_Interface<W>::model_info->PEntityData[entityID-1];
 
1531
                        if (this_info->is_external){
 
1532
 
 
1533
//                      if (Parser_Interface<W>::entity_Info->standalone != Standalone_no)
 
1534
//                              WF_Error(wfErr_NoExternalRefs);
 
1535
//                      else {
 
1536
                                        entity_parser = ParserFactory(this_info->systemLiteral, Parser_Interface<W>::model_info);
 
1537
                                        entity_parser->Parse_WF_Content();
 
1538
                                        if(!entity_parser->at_EOF())
 
1539
                                                Syntax_Error(NT_content);
 
1540
                                        entity_parser->~Parser_Interface<W>();
 
1541
//                      }
 
1542
                        }
 
1543
                        else {
 
1544
                        }
 
1545
                }
 
1546
        }
 
1547
        else
 
1548
                Syntax_Error(NT_PEReference);
 
1549
}
 
1550
 
 
1551
 
 
1552
template <class B, WorkingCharacterSet W>
 
1553
void ParsingEngine<B, W>::Parse_Elementdecl (){
 
1554
 
 
1555
        Advance(7); /* Skip "<!ELEMENT". */
 
1556
 
 
1557
    requireWS();
 
1558
        int nameID = Parse_Name();
 
1559
        int elemID = Parser_Interface<W>::model_info->getOrInsertGlobalElement(nameID);
 
1560
 
 
1561
        requireWS();
 
1562
        ContentModel * cm;
 
1563
        /* Start parsing "contentspec"*/
 
1564
        if (at_EMPTY<B::Base>(cur())) {
 
1565
        Advance(5);
 
1566
        cm = new CM_Empty();
 
1567
        Parser_Interface<W>::model_info->ContentModelData[nameID] = cm;
 
1568
        }
 
1569
    else if (at_ANY<B::Base>(cur())) {
 
1570
        Advance(3);
 
1571
        cm = new CM_Any();
 
1572
        Parser_Interface<W>::model_info->ContentModelData[nameID] = cm;
 
1573
    }
 
1574
    else {
 
1575
        if (AtChar<B::Base,'('>(cur()))
 
1576
                        Advance(1);
 
1577
                ScanTo(NonWS);
 
1578
                if (at_PCDATA<B::Base>(cur())){
 
1579
                        cm = Parse_RemainingMixed();
 
1580
                        Parser_Interface<W>::model_info->ContentModelData[nameID] = cm;
 
1581
                }
 
1582
                else{
 
1583
 
 
1584
                        CM_RegExp * cre = new CM_RegExp;
 
1585
                        cre->content_re = Parse_RemainingChildren();
 
1586
 
 
1587
                        int id_count = cre->content_re->Set_IDs(0);
 
1588
                        cre->content_re->Set_First_Map();
 
1589
                        symbol_set_t * transition_map = new symbol_set_t[id_count+1];
 
1590
                        cre->content_re->follow_map[0] = id_count+1;
 
1591
 
 
1592
                        cre->content_re->Set_Follow_Map(transition_map);
 
1593
                        transition_map[0] = cre->content_re->first_map;
 
1594
 
 
1595
                        if (cre->content_re->matches_empty)
 
1596
                                transition_map[0][0]=id_count+1;
 
1597
 
 
1598
                        cre -> transition_map = transition_map;
 
1599
 
 
1600
                        Parser_Interface<W>::model_info->ContentModelData[nameID] = cre;
 
1601
                        cm = cre;
 
1602
                }
 
1603
    }
 
1604
    ScanTo(NonWS);
 
1605
 
 
1606
        if (AtChar<B::Base,'>'>(cur())) {
 
1607
                Advance(1);
 
1608
        }
 
1609
        else
 
1610
                Syntax_Error(NT_elementdecl);
 
1611
}
 
1612
template <class B, WorkingCharacterSet W>
 
1613
ContentModel * ParsingEngine<B, W>::Parse_RemainingMixed (){
 
1614
        CM_Mixed * r = new CM_Mixed();
 
1615
        Advance(7);  /* Skip "#PCDATA". */
 
1616
 
 
1617
    if (AtChar<B::Base,')'>(cur())){
 
1618
        if (AtChar<B::Base,'*'>(cur())) {
 
1619
                Advance(2);
 
1620
                }
 
1621
                else {
 
1622
                        Advance(1);
 
1623
                }
 
1624
    }
 
1625
    else{
 
1626
        ScanTo(NonWS);
 
1627
        int k = 0;
 
1628
        while (AtChar<B::Base,'|'>(cur())){
 
1629
                        Advance(1);
 
1630
                        ScanTo(NonWS);
 
1631
                        int nameID = Parse_Name();
 
1632
                        r->elements[nameID] = ++k;
 
1633
                        ScanTo(NonWS);
 
1634
                }
 
1635
                if (at_Para_star<B::Base>(cur())) Advance(2);
 
1636
                else {
 
1637
                        Syntax_Error(NT_Mixed);
 
1638
                        exit(-1);
 
1639
        }
 
1640
    }
 
1641
    return r;
 
1642
}
 
1643
 
 
1644
 
 
1645
template <class B, WorkingCharacterSet W>
 
1646
Content_RE * ParsingEngine<B, W>::Parse_RemainingChildren (){
 
1647
        Content_RE * c1 = Parse_Cp();
 
1648
        Content_RE * r = c1;
 
1649
        ScanTo(NonWS);
 
1650
        if(AtChar<B::Base,'|'>(cur())){
 
1651
                CRE_Choice * rslt = new CRE_Choice;
 
1652
                rslt->subCMs.push_back(c1);
 
1653
                Advance(1);
 
1654
                ScanTo(NonWS);
 
1655
                rslt->subCMs.push_back(Parse_Cp());
 
1656
                ScanTo(NonWS);
 
1657
                while(!AtChar<B::Base,')'>(cur())){
 
1658
                        if(AtChar<B::Base,'|'>(cur()))
 
1659
                                Advance(1);
 
1660
                        else
 
1661
                                Syntax_Error(NT_children);
 
1662
                        ScanTo(NonWS);
 
1663
                        rslt->subCMs.push_back(Parse_Cp());
 
1664
                        ScanTo(NonWS);
 
1665
                }
 
1666
                Advance(1);
 
1667
                rslt->Compile();
 
1668
                r = rslt;
 
1669
        }
 
1670
        else if(AtChar<B::Base,','>(cur())){
 
1671
                CRE_Seq * rslt = new CRE_Seq;
 
1672
                rslt->subCMs.push_back(c1);
 
1673
                Advance(1);
 
1674
                ScanTo(NonWS);
 
1675
                rslt->subCMs.push_back(Parse_Cp());
 
1676
                ScanTo(NonWS);
 
1677
                while(!AtChar<B::Base,')'>(cur())){
 
1678
                        if(AtChar<B::Base,','>(cur()))
 
1679
                                Advance(1);
 
1680
                        else
 
1681
                                Syntax_Error(NT_children);
 
1682
                        ScanTo(NonWS);
 
1683
                        rslt->subCMs.push_back(Parse_Cp());
 
1684
                        ScanTo(NonWS);
 
1685
                }
 
1686
                Advance(1);
 
1687
                rslt->Compile();
 
1688
                r = rslt;
 
1689
        }
 
1690
        else if(AtChar<B::Base,')'>(cur())){
 
1691
                Advance(1);
 
1692
        }
 
1693
        else
 
1694
                Syntax_Error(NT_children);
 
1695
 
 
1696
        if (AtChar<B::Base,'?'>(cur())) {
 
1697
                Advance(1);
 
1698
                r = new CRE_Opt(r);
 
1699
        }
 
1700
        else if (AtChar<B::Base,'*'>(cur())) {
 
1701
                Advance(1);
 
1702
                r = new CRE_Star(r);
 
1703
        }
 
1704
        else if (AtChar<B::Base,'+'>(cur())) {
 
1705
                Advance(1);
 
1706
                r = new CRE_Plus(r);
 
1707
        }
 
1708
 
 
1709
        return r;
 
1710
}
 
1711
 
 
1712
template <class B, WorkingCharacterSet W>
 
1713
Content_RE * ParsingEngine<B, W>::Parse_Cp (){
 
1714
        if (AtChar<B::Base,'('>(cur())){
 
1715
                Advance(1);
 
1716
                ScanTo(NonWS);
 
1717
                Parse_RemainingChildren();
 
1718
        }
 
1719
        else{
 
1720
                int nameID = Parse_Name();
 
1721
                CRE_Name * r = new CRE_Name(nameID);
 
1722
 
 
1723
                if (AtChar<B::Base,'?'>(cur())) {
 
1724
                        Advance(1);
 
1725
                        return new CRE_Opt(r);
 
1726
                }
 
1727
                else if (AtChar<B::Base,'*'>(cur())) {
 
1728
                        Advance(1);
 
1729
                        return new CRE_Star(r);
 
1730
                }
 
1731
                else if (AtChar<B::Base,'+'>(cur())) {
 
1732
                        Advance(1);
 
1733
                        return new CRE_Plus(r);
 
1734
                }
 
1735
                else return r;
 
1736
        }
 
1737
}
 
1738
 
 
1739
template <class B, WorkingCharacterSet W>
 
1740
void ParsingEngine<B, W>::Parse_AttlistDecl (){
 
1741
 
 
1742
        int old_abspos;
 
1743
 
 
1744
        int name_start;
 
1745
        int lgth;
 
1746
 
 
1747
        int elemID;
 
1748
        int attID;
 
1749
 
 
1750
        Advance(7); /* Skip "ATTLIST. */
 
1751
        requireWS();
 
1752
 
 
1753
        int nameID = Parse_Name();
 
1754
        elemID = Parser_Interface<W>::model_info->getOrInsertGlobalElement(nameID);
 
1755
 
 
1756
        old_abspos = AbsPos();
 
1757
        ScanTo(NonWS);
 
1758
        while(!AtChar<B::Base,'>'>(cur())) {
 
1759
                if(old_abspos == AbsPos())
 
1760
                Syntax_Error(NT_AttlistDecl);
 
1761
 
 
1762
                int att_nameID = Parse_Name();
 
1763
 
 
1764
                attID = Parser_Interface<W>::model_info->getOrInsertGlobalAttName(att_nameID);
 
1765
                if (attID >= LastAttOccurrence.size()) LastAttOccurrence.push_back(0);
 
1766
        ATT_info * this_info = new ATT_info;
 
1767
        this_info->globalATT_id = attID;
 
1768
        requireWS();
 
1769
        if (at_CDATA<B::Base>(cur())){
 
1770
                Advance(5);
 
1771
                this_info->attType = CDATA_att;
 
1772
        }
 
1773
        else if(at_ID<B::Base>(cur())){
 
1774
                Advance(2);
 
1775
                this_info->attType = ID_att;
 
1776
        }
 
1777
        /* Make sure to check IDREFS before IDREF*/
 
1778
        else if(at_IDREFS<B::Base>(cur())){
 
1779
                Advance(6);
 
1780
                this_info->attType = IDREFS_att;
 
1781
        }
 
1782
        else if(at_IDREF<B::Base>(cur())){
 
1783
                Advance(5);
 
1784
                this_info->attType = IDREF_att;
 
1785
        }
 
1786
        else if(at_ENTITY<B::Base>(cur())){
 
1787
                Advance(6);
 
1788
                this_info->attType = ENTITY_att;
 
1789
        }
 
1790
        else if(at_ENTITIES<B::Base>(cur())){
 
1791
                Advance(8);
 
1792
                this_info->attType = ENTITIES_att;
 
1793
        }
 
1794
        /* Make sure to check NMTOKENS before NMTOKEN*/
 
1795
        else if(at_NMTOKENS<B::Base>(cur())){
 
1796
                Advance(8);
 
1797
                this_info->attType = NMTOKENS_att;
 
1798
        }
 
1799
        else if(at_NMTOKEN<B::Base>(cur())){
 
1800
                Advance(7);
 
1801
                this_info->attType = NMTOKEN_att;
 
1802
        }
 
1803
        else if(at_NOTATION<B::Base>(cur())){ /* NotationType = 'NOTATION' S Enumeration
 
1804
                                                                         when Nmtoken = Name */
 
1805
                Advance(8);
 
1806
                        requireWS();
 
1807
                Parse_Notation(this_info);
 
1808
                this_info->attType = NOTATION_att;
 
1809
        }
 
1810
        else if(AtChar<B::Base,'('>(cur())){
 
1811
                Parse_Enumeration(this_info);
 
1812
                this_info->attType = enumeration_att;
 
1813
        }
 
1814
        else
 
1815
                Syntax_Error(NT_AttlistDecl);
 
1816
        requireWS();
 
1817
        Parse_DefaultDecl(this_info);
 
1818
 
 
1819
                ScanTo(NonWS);
 
1820
                Parser_Interface<W>::model_info->ElementAttributeData[elemID].push_back(this_info);
 
1821
        }
 
1822
 
 
1823
        Advance(1);
 
1824
}
 
1825
 
 
1826
template <class B, WorkingCharacterSet W>
 
1827
void ParsingEngine<B, W>::Parse_Notation (ATT_info * this_info){
 
1828
 
 
1829
        if(AtChar<B::Base,'('>(cur()))
 
1830
                Advance(1);
 
1831
        else
 
1832
                Syntax_Error(NT_NotationType);
 
1833
        ScanTo(NonWS);
 
1834
 
 
1835
    int notn_nameID = Parse_Name();
 
1836
 
 
1837
        /*Notation name is not in the global table!*/
 
1838
        if(Parser_Interface<W>::model_info->GlobalNotationTable[notn_nameID]==0)
 
1839
                Parser_Interface<W>::model_info->GlobalNotationTable[notn_nameID] = -1;
 
1840
 
 
1841
        ScanTo(NonWS);
 
1842
        while(AtChar<B::Base,'|'>(cur())){
 
1843
                Advance(1);
 
1844
                ScanTo(NonWS);
 
1845
                notn_nameID = Parse_Name();
 
1846
 
 
1847
                if(Parser_Interface<W>::model_info->GlobalNotationTable[notn_nameID]==0)
 
1848
//                      Validity_Error(vErr_notatn);
 
1849
                        Parser_Interface<W>::model_info->GlobalNotationTable[notn_nameID] = -1;
 
1850
 
 
1851
                ScanTo(NonWS);
 
1852
        }
 
1853
        if (AtChar<B::Base,')'>(cur()))
 
1854
                Advance(1);
 
1855
        else
 
1856
                Syntax_Error(NT_NotationType);
 
1857
}
 
1858
 
 
1859
template <class B, WorkingCharacterSet W>
 
1860
void ParsingEngine<B, W>::Parse_Enumeration (ATT_info * this_info){
 
1861
 
 
1862
        int enumCount=0;
 
1863
        if(AtChar<B::Base,'('>(cur()))
 
1864
                Advance(1);
 
1865
        else
 
1866
                Syntax_Error(NT_Enumeration);
 
1867
        ScanTo(NonWS);
 
1868
 
 
1869
        int nmtokenID = Parse_Nmtoken();
 
1870
 
 
1871
        this_info->enumValues[nmtokenID]=++(enumCount);
 
1872
 
 
1873
        ScanTo(NonWS);
 
1874
        while(AtChar<B::Base,'|'>(cur())){
 
1875
                Advance(1);
 
1876
                ScanTo(NonWS);
 
1877
                int nmtokenID = Parse_Nmtoken();
 
1878
 
 
1879
                int enumID = this_info->enumValues[nmtokenID];
 
1880
                if(enumID==0){
 
1881
                        this_info->enumValues[nmtokenID]=++(enumCount);
 
1882
                        enumID = enumCount;
 
1883
                }
 
1884
                else if(!StrictWellFormedness){
 
1885
                        Validity_Error(vErr_NoDuplicateTokens);
 
1886
                }
 
1887
                ScanTo(NonWS);
 
1888
        }
 
1889
        if (AtChar<B::Base,')'>(cur()))
 
1890
                Advance(1);
 
1891
        else
 
1892
                Syntax_Error(NT_Enumeration);
 
1893
}
 
1894
 
 
1895
template <class B, WorkingCharacterSet W>
 
1896
void ParsingEngine<B, W>::Parse_DefaultDecl (ATT_info * this_info){
 
1897
        if(at_REQUIRED<B::Base>(cur())){
 
1898
                Advance(9);
 
1899
                this_info->defaultKind = REQUIRED_att;
 
1900
        }
 
1901
        else if(at_IMPLIED<B::Base>(cur())){
 
1902
                Advance(8);
 
1903
                this_info->defaultKind = IMPLIED_att;
 
1904
        }
 
1905
        else {
 
1906
                if(at_FIXED<B::Base>(cur())){
 
1907
                        Advance(6);
 
1908
                        requireWS();
 
1909
                        this_info->defaultKind = FIXED_att;
 
1910
                }
 
1911
                else this_info->defaultKind = DEFAULT_att;
 
1912
                if(AtQuote<B::Base>(cur())){
 
1913
                        int quot_start = AbsPos()+1;
 
1914
                        Parse_AttValue();
 
1915
                        /* need to normalize */
 
1916
                        this_info->defaultValueLgth = AbsPos() - quot_start - 1;
 
1917
 
 
1918
                        this_info->defaultValue = new unsigned char[this_info->defaultValueLgth+1];
 
1919
                        memcpy(this_info->defaultValue, GetCodeUnitPtr(quot_start),this_info->defaultValueLgth);
 
1920
                        this_info->defaultValue[this_info->defaultValueLgth] = '\0';
 
1921
                        }
 
1922
                else
 
1923
                        Syntax_Error(NT_DefaultDecl);
 
1924
        }
 
1925
}
 
1926
 
 
1927
template <class B, WorkingCharacterSet W>
 
1928
void ParsingEngine<B, W>::Parse_Entitydecl (){
 
1929
 
 
1930
        int name_start;
 
1931
        int quot_start;
 
1932
        int lgth;
 
1933
        int old_abspos;
 
1934
        char * s;
 
1935
 
 
1936
        Advance(6); /* Skip "ENTITY. */
 
1937
        requireWS();
 
1938
 
 
1939
        if (AtChar<B::Base,'%'>(cur())){
 
1940
                Advance(1);
 
1941
                requireWS();
 
1942
 
 
1943
                int nameID = Parse_Name();
 
1944
                PEntity_info * this_info = new PEntity_info;
 
1945
                int entityID = Parser_Interface<W>::model_info->GlobalPEntityTable[nameID];
 
1946
                if(entityID==0){
 
1947
                        Parser_Interface<W>::model_info->GlobalPEntityTable[nameID]=++(Parser_Interface<W>::model_info->globalPEntityCount);
 
1948
                        entityID = Parser_Interface<W>::model_info->globalPEntityCount;
 
1949
                        this_info->globalPEntity_id = entityID;
 
1950
                }
 
1951
                else
 
1952
                        printf("Warning: Entity definition already exist!\n");
 
1953
 
 
1954
                requireWS();
 
1955
                if(AtQuote<B::Base>(cur())){
 
1956
                Parse_PEntityValue(this_info);
 
1957
                this_info->is_external = false;
 
1958
        }
 
1959
        else {
 
1960
                Parse_ExternalID(this_info->systemLiteral, this_info->pubidLiteral);
 
1961
                this_info->is_external = true;
 
1962
                if (this_info->systemLiteral == NULL) Syntax_Error(NT_EntityDecl);
 
1963
        }
 
1964
        Parser_Interface<W>::model_info->PEntityData.push_back(this_info);
 
1965
        }
 
1966
        else{
 
1967
                int nameID = Parse_Name();
 
1968
 
 
1969
                GEntity_info * this_info = new GEntity_info();
 
1970
                int entityID = Parser_Interface<W>::model_info->GlobalGEntityTable[nameID];
 
1971
                if(entityID==0){
 
1972
                        Parser_Interface<W>::model_info->GlobalGEntityTable[nameID]=++(Parser_Interface<W>::model_info->globalGEntityCount);
 
1973
                        entityID = Parser_Interface<W>::model_info->globalGEntityCount;
 
1974
                        this_info->globalGEntity_id = entityID;
 
1975
                }
 
1976
                else
 
1977
                        printf("Warning: Entity definition already exists!\n");
 
1978
 
 
1979
                requireWS();
 
1980
 
 
1981
                if(AtQuote<B::Base>(cur())){
 
1982
                Parse_GEntityValue(this_info);
 
1983
                this_info->is_external = false;
 
1984
        }
 
1985
        else {
 
1986
                Parse_ExternalID(this_info->systemLiteral, this_info->pubidLiteral);
 
1987
                this_info->is_external = true;
 
1988
                if (this_info->systemLiteral == NULL) Syntax_Error(NT_EntityDecl);
 
1989
                        old_abspos = AbsPos();
 
1990
                        ScanTo(NonWS);
 
1991
                if(at_NDATA<B::Base>(cur())){
 
1992
                        if(old_abspos == AbsPos())
 
1993
                                Syntax_Error(NT_EntityDecl);
 
1994
                        else
 
1995
                                Advance(5);
 
1996
                        requireWS();
 
1997
                        name_start = AbsPos();
 
1998
                        int nameID = Parse_Name();
 
1999
                        lgth = AbsPos() - name_start;
 
2000
                                this_info->NDataName = copy_string(GetCodeUnitPtr(name_start),lgth);
 
2001
                }
 
2002
                }
 
2003
        Parser_Interface<W>::model_info->GEntityData.push_back(this_info);
 
2004
        }
 
2005
        ScanTo(NonWS);
 
2006
        if (AtChar<B::Base,'>'>(cur())){
 
2007
                Advance(1);
 
2008
        }
 
2009
        else
 
2010
                Syntax_Error(NT_EntityDecl);
 
2011
}
 
2012
 
 
2013
template <class B, WorkingCharacterSet W>
 
2014
void ParsingEngine<B, W>::Parse_Notationdecl (){
 
2015
 
 
2016
        int old_abspos;
 
2017
        Advance(8); /* Skip "NOTATION. */
 
2018
        requireWS();
 
2019
 
 
2020
        int nameID = Parse_Name();
 
2021
 
 
2022
        int notationID = Parser_Interface<W>::model_info->GlobalNotationTable[nameID];
 
2023
        /* notationID == -1: used but not yet defined; == 0: new, > 0 prev. defined */
 
2024
        if(notationID <= 0){
 
2025
                Parser_Interface<W>::model_info->GlobalNotationTable[nameID]=++(Parser_Interface<W>::model_info->globalNotationCount);
 
2026
                notationID = Parser_Interface<W>::model_info->globalNotationCount;
 
2027
        }
 
2028
        else /*Duplicate notation name!*/
 
2029
                Validity_Error(vErr_NoDuplicateTokens);
 
2030
        Notation_info * this_info = new Notation_info;
 
2031
        ScanTo(NonWS);
 
2032
    Parse_ExternalID(this_info->systemLiteral, this_info->pubidLiteral);
 
2033
        ScanTo(NonWS);
 
2034
        if (AtChar<B::Base,'>'>(cur())) {
 
2035
                Advance(1);
 
2036
        }
 
2037
        else
 
2038
                Syntax_Error(NT_NotationDecl);
 
2039
}
 
2040
 
 
2041
template <class B, WorkingCharacterSet W>
 
2042
void ParsingEngine<B, W>::requireWS(){
 
2043
 
 
2044
    int old_abspos = AbsPos();
 
2045
    ScanTo(NonWS);
 
2046
    if(old_abspos == AbsPos())
 
2047
        Syntax_Error(NT_S);
 
2048
}
 
2049
 
 
2050
template <class B, WorkingCharacterSet W>
 
2051
void ParsingEngine<B, W>::Parse_AttValue(){
 
2052
 
 
2053
        int     quoteCh = cur()[0];
 
2054
        Advance(1); /* Skip " or ' */
 
2055
 
 
2056
        ScanTo(Quote);
 
2057
        while (cur()[0] != quoteCh){
 
2058
                if (at_CharRef_Start<B::Base>(cur())){
 
2059
                        Parse_CharRef();
 
2060
                        ScanTo(Quote);
 
2061
                }
 
2062
                else if (AtChar<B::Base,'&'>(cur())){
 
2063
                        Parse_EntityRef();
 
2064
                        ScanTo(Quote);
 
2065
                }
 
2066
                else if (AtQuote<B::Base>(cur())) {
 
2067
                        Advance(1);
 
2068
                        ScanTo(Quote);
 
2069
                }
 
2070
                else /* if (AtChar<B::Base,'<'>(cur())) */
 
2071
                        WF_Error(wfErr_CleanAttrVals);
 
2072
        }
 
2073
        Advance(1);
 
2074
}
 
2075
 
 
2076
template <class B, WorkingCharacterSet W>
 
2077
void ParsingEngine<B, W>::Parse_GEntityValue(GEntity_info * this_info){
 
2078
 
 
2079
        int     quoteCh = cur()[0];
 
2080
        Advance(1); /* Skip " or ' */
 
2081
        this_info->is_simple = true;
 
2082
        int quot_start = AbsPos();
 
2083
        char * replText;
 
2084
        ScanTo(Quote);
 
2085
        replText = copy_string(GetCodeUnitPtr(quot_start),AbsPos()-quot_start);
 
2086
        while (cur()[0] != quoteCh){
 
2087
                if (at_CharRef_Start<B::Base>(cur())){
 
2088
                        strcat (replText,Replace_CharRef());
 
2089
                        quot_start = AbsPos();
 
2090
                        ScanTo(Quote);
 
2091
                }
 
2092
                else if (AtQuote<B::Base>(cur())) {
 
2093
                        quot_start = AbsPos();
 
2094
                        Advance(1);
 
2095
                        ScanTo(Quote);
 
2096
                }
 
2097
                else if (at_EOF()) {
 
2098
                        Syntax_Error(NT_EntityValue);
 
2099
                }
 
2100
                else { /* '<' or '&' found */
 
2101
                        quot_start = AbsPos();
 
2102
                        Advance(1);
 
2103
                        ScanTo(Quote);
 
2104
                        this_info->is_simple = false;
 
2105
                }
 
2106
                replText = cat_string (replText,(char *)GetCodeUnitPtr(quot_start), strlen(replText), AbsPos()-quot_start);
 
2107
        }
 
2108
        this_info->ReplacementText = replText;
 
2109
        Advance(1);
 
2110
}
 
2111
 
 
2112
template <class B, WorkingCharacterSet W>
 
2113
char * ParsingEngine<B, W>::Replace_EntityRef(bool& is_simple){
 
2114
        Advance(1);
 
2115
        int nameID = Parse_Name();
 
2116
        if (AtChar<B::Base,';'>(cur()))
 
2117
                Advance(1);
 
2118
        else
 
2119
                Syntax_Error(NT_EntityValue);
 
2120
        int entityID = Parser_Interface<W>::model_info->GlobalGEntityTable[nameID];
 
2121
        if (entityID == 0)
 
2122
                WF_Error(wfErr_wf_entdeclared);
 
2123
        else{
 
2124
                if (Parser_Interface<W>::model_info->GEntityData[entityID-1]->is_simple == false)
 
2125
                        is_simple = false;
 
2126
                return Parser_Interface<W>::model_info->GEntityData[entityID-1]->ReplacementText;
 
2127
        }
 
2128
 
 
2129
}
 
2130
 
 
2131
template <class B, WorkingCharacterSet W>
 
2132
void ParsingEngine<B, W>::Parse_PEntityValue(PEntity_info * this_info){
 
2133
        fprintf(stderr,"parsing of parameter entity value has not been completed yet.\n");
 
2134
        exit(-1);
 
2135
}
 
2136
 
 
2137
template <class B, WorkingCharacterSet W>
 
2138
char * ParsingEngine<B, W>::Replace_CharRef(){
 
2139
        Advance(2);
 
2140
        fprintf(stderr,"Replacement of Character Reference has not been completed yet.\n");
 
2141
        exit(-1);
 
2142
}
 
2143
 
 
2144
template <class B, WorkingCharacterSet W>
 
2145
void ParsingEngine<B, W>::Parse_Prolog(){
 
2146
        ScanTo(NonWS);
 
2147
        int old_pos = AbsPos();
 
2148
        while (!at_DOCTYPE_start<B::Base>(cur())) {
 
2149
                text_or_markup_start = AbsPos();
 
2150
                if (at_Comment_Start<B::Base>(cur()))
 
2151
                        Parse_Comment();
 
2152
                else if (at_PI_Start<B::Base>(cur()))
 
2153
                                Parse_PI();
 
2154
                else{
 
2155
                        Prolog_action(GetCodeUnitPtr(old_pos), LengthFrom(old_pos));
 
2156
                        return;
 
2157
                }
 
2158
                ScanTo(NonWS);
 
2159
        }
 
2160
        Parse_DocType();
 
2161
        ScanTo(NonWS);
 
2162
        while(at_Comment_Start<B::Base>(cur()) || at_PI_Start<B::Base>(cur()) ){
 
2163
                text_or_markup_start = AbsPos();
 
2164
                if (at_Comment_Start<B::Base>(cur()))
 
2165
                        Parse_Comment();
 
2166
                else
 
2167
                        Parse_PI();
 
2168
                ScanTo(NonWS);
 
2169
        }
 
2170
        Prolog_action(GetCodeUnitPtr(old_pos), LengthFrom(old_pos));
 
2171
}
 
2172
 
 
2173
template <class B, WorkingCharacterSet W>
 
2174
void ParsingEngine<B, W>::Parse_ExtSubsetDecl() {
 
2175
        ScanTo(NonWS);
 
2176
        int start_pos=AbsPos();
 
2177
        while(!at_EOF()){
 
2178
                if(at_condSect_start<B::Base>(cur())){
 
2179
                        Advance(3);
 
2180
                        ScanTo(NonWS);
 
2181
                        if (at_INCLUDE<B::Base>(cur())){
 
2182
                                Advance(7);
 
2183
                                ScanTo(NonWS);
 
2184
                                if(AtChar<B::Base,'['>(cur())){
 
2185
                                        Advance(1);
 
2186
                                        Parse_ExtSubsetDecl();
 
2187
                                        if(at_CDATA_End<B::Base>(cur()))
 
2188
                                                Advance(3);
 
2189
                                        else Syntax_Error(NT_includeSect);
 
2190
                                }
 
2191
                                else Syntax_Error(NT_includeSect);
 
2192
                        }
 
2193
                        else if (at_IGNORE<B::Base>(cur())){
 
2194
                                Advance(6);
 
2195
                                ScanTo(NonWS);
 
2196
                                if(AtChar<B::Base,'['>(cur())){
 
2197
                                        int section_depth=1;
 
2198
                                        Advance(1);
 
2199
                                        while(!at_EOF()){
 
2200
                                                ScanTextTo(MarkupStart);
 
2201
                                                if(at_condSect_start<B::Base>(cur())){
 
2202
                                                        Advance(3);
 
2203
                                                        section_depth++;
 
2204
                                                }
 
2205
                                                else if(at_CDATA_End<B::Base>(cur())){
 
2206
                                                        Advance(3);
 
2207
                                                        section_depth--;
 
2208
                                                }
 
2209
                                                else
 
2210
                                                        Advance(1);
 
2211
                                                if(section_depth==0) return;
 
2212
                                        }
 
2213
                                        Syntax_Error(NT_ignoreSectContents);
 
2214
                                }
 
2215
                                else Syntax_Error(NT_ignoreSect);
 
2216
                        }
 
2217
                        else Syntax_Error(NT_conditionalSect);
 
2218
                }
 
2219
                else if (AtChar<B::Base,'%'>(cur()))
 
2220
                        Parse_PEReference();
 
2221
                else if (at_PI_Start<B::Base>(cur())) {
 
2222
                        Parse_PI();
 
2223
                }
 
2224
                else if (at_Comment_Start<B::Base>(cur())) {
 
2225
                        Parse_Comment();
 
2226
                }
 
2227
                else if (AtChar<B::Base,'<'>(cur())){
 
2228
                        Advance(1);
 
2229
 
 
2230
                        if(AtChar<B::Base,'!'>(cur())){
 
2231
                                Advance(1);
 
2232
                                if(at_ELEMENT<B::Base>(cur()))
 
2233
                                        Parse_Elementdecl();
 
2234
                                else if(at_ATTLIST<B::Base>(cur()))
 
2235
                                        Parse_AttlistDecl();
 
2236
                                else if(at_ENTITY<B::Base>(cur()))
 
2237
                                        Parse_Entitydecl();
 
2238
                                else if(at_NOTATION<B::Base>(cur()))
 
2239
                                        Parse_Notationdecl();
 
2240
                                else{
 
2241
                                        Syntax_Error(NT_markupdecl);
 
2242
                                }
 
2243
                        }
 
2244
                        else
 
2245
                                Syntax_Error(NT_markupdecl);
 
2246
                }
 
2247
                else
 
2248
                        Syntax_Error(NT_extSubsetDecl);
 
2249
                ScanTo(NonWS);
 
2250
        }
 
2251
        ExtSubsetDecl_action(GetCodeUnitPtr(start_pos), LengthFrom(start_pos));
 
2252
}
 
2253
 
 
2254
/* Parse a valid start or empty element tag. */
 
2255
template <class B, WorkingCharacterSet W>
 
2256
inline int ParsingEngine<B, W>::Parse_ValidStartTag (bool& is_emptyStartTag){
 
2257
        int att_name_start;
 
2258
        int att_val_start;
 
2259
        int att_name_end, att_val_end;
 
2260
        unsigned char quoteCh;
 
2261
        Advance(1);
 
2262
 
 
2263
        int nameID = Parse_Name();
 
2264
        int elemID = Parser_Interface<W>::model_info->GlobalElementTable[nameID];
 
2265
        if(elemID==0)
 
2266
                        Validity_Error(vErr_elementvalid);
 
2267
 
 
2268
        ElementName_action(GetCodeUnitPtr(text_or_markup_start+1), LengthFrom(text_or_markup_start+1));
 
2269
        /* The following test optimizes the most common case of a
 
2270
        start tag with no attributes.  */
 
2271
        if (AtChar<B::Base,'>'>(cur())) {
 
2272
                Advance(1);
 
2273
                StartTag_action(GetCodeUnitPtr(text_or_markup_start), LengthFrom(text_or_markup_start));
 
2274
        }
 
2275
        else {
 
2276
                ScanTo(NonWS);
 
2277
                if (AtChar<B::Base,'>'>(cur())) {
 
2278
                        Advance(1);
 
2279
                        StartTag_action(GetCodeUnitPtr(text_or_markup_start), LengthFrom(text_or_markup_start));
 
2280
                }
 
2281
                else if (at_EmptyElementDelim<B::Base>(cur())) {
 
2282
                        Advance(2);
 
2283
                        is_emptyStartTag = true;
 
2284
                        EmptyElement_action(GetCodeUnitPtr(text_or_markup_start), LengthFrom(text_or_markup_start));
 
2285
                }
 
2286
                else do {
 
2287
                        /* Must be an attribute-value pair or error. */
 
2288
                        att_name_start = AbsPos();
 
2289
                        int att_nameID = Parse_Name();
 
2290
                        #if (not defined(OMISSION)) or (OMISSION != ATTRIBUTE_UNIQUENESS)
 
2291
                        int attID = Parser_Interface<W>::model_info->getOrInsertGlobalAttName(att_nameID);
 
2292
                        if (attID >= LastAttOccurrence.size()) LastAttOccurrence.push_back(0);
 
2293
                        else {
 
2294
                                if (LastAttOccurrence[attID] > text_or_markup_start) {
 
2295
                                        WF_Error(wfErr_uniqattspec); /* Duplicate attribute. */
 
2296
                                        break;
 
2297
                                }
 
2298
                        }
 
2299
                        LastAttOccurrence[attID] = att_name_start;
 
2300
                        #endif
 
2301
                        /* The following optimized tests handle the frequently occurring
 
2302
                        case that there are no blanks on either side of the equals sign.
 
2303
                        In many cases, the very first test handles 100% of actual
 
2304
                        attribute-value pairs encountered. */
 
2305
                        if (at_EqualsQuote<B::Base>(cur())) Advance(1);
 
2306
                        else {
 
2307
                                ScanTo(NonWS);
 
2308
                                if (!AtChar<B::Base,'='>(cur())) {
 
2309
                                        Syntax_Error(NT_STag);
 
2310
                                        break;
 
2311
                                }
 
2312
                                Advance(1);
 
2313
                                ScanTo(NonWS);
 
2314
                                if (!AtQuote<B::Base>(cur())) {
 
2315
                                        Syntax_Error(NT_STag);
 
2316
                                        break;
 
2317
                                }
 
2318
                        }
 
2319
                        att_val_start = AbsPos()+1;
 
2320
                        Parse_AttValue();
 
2321
                        att_val_end = AbsPos()-1;
 
2322
                        if (at_xmlns<B::Base>(cur()+att_name_start-AbsPos())) {
 
2323
                                Namespace_action(GetCodeUnitPtr(att_name_start), att_name_end - att_name_start,
 
2324
                                                 GetCodeUnitPtr(att_val_start), att_val_end - att_val_start);
 
2325
                        }
 
2326
                        else {
 
2327
                                AttributeValue_action(GetCodeUnitPtr(att_name_start), att_name_end - att_name_start,
 
2328
                                                 GetCodeUnitPtr(att_val_start), att_val_end - att_val_start);
 
2329
                        }
 
2330
                        /* Now check for end or repeat. Avoid whitespace scan if possible.*/
 
2331
                        if (AtChar<B::Base,'>'>(cur())) {
 
2332
                                Advance(1);
 
2333
                                StartTag_action(GetCodeUnitPtr(text_or_markup_start), LengthFrom(text_or_markup_start));
 
2334
                                break;
 
2335
                        }
 
2336
                        else if (at_EmptyElementDelim<B::Base>(cur())) {
 
2337
                                Advance(2);
 
2338
                                is_emptyStartTag = true;
 
2339
                                EmptyElement_action(GetCodeUnitPtr(text_or_markup_start), LengthFrom(text_or_markup_start));
 
2340
                                break;
 
2341
                        }
 
2342
                        ScanTo(NonWS);
 
2343
                        if (AtChar<B::Base,'>'>(cur())) {
 
2344
                                Advance(1);
 
2345
                                StartTag_action(GetCodeUnitPtr(text_or_markup_start), LengthFrom(text_or_markup_start));
 
2346
                                break;
 
2347
                        }
 
2348
                        else if (at_EmptyElementDelim<B::Base>(cur())) {
 
2349
                                Advance(2);
 
2350
                                is_emptyStartTag = true;
 
2351
                                EmptyElement_action(GetCodeUnitPtr(text_or_markup_start), LengthFrom(text_or_markup_start));
 
2352
                                break;
 
2353
                        }
 
2354
                        else if (AbsPos() == att_val_end + 1) {
 
2355
                                /* No WS following att value */
 
2356
                                Syntax_Error(NT_STag);
 
2357
                                break;
 
2358
                        }
 
2359
                } while (1);
 
2360
        }
 
2361
        return nameID;
 
2362
}
 
2363
 
 
2364
template <class B, WorkingCharacterSet W>
 
2365
int ParsingEngine<B, W>::Parse_ValidElement() {
 
2366
        bool is_emptyStartTag = false;
 
2367
        int nameID = Parse_ValidStartTag(is_emptyStartTag);
 
2368
#ifdef DEBUG
 
2369
        printf("Parse_ValidElement: nameID = %d, name = %s, is_emptyStartTag=%i\n",nameID, Parser_Interface<W>::model_info->symbol_table->Get_UTF8_name(nameID), is_emptyStartTag);
 
2370
#endif
 
2371
        ContentModel * cm = Parser_Interface<W>::model_info->ContentModelData[nameID];
 
2372
        switch (cm->cm_type) {
 
2373
                case cm_Empty:
 
2374
                        if (!is_emptyStartTag) {
 
2375
                                if (at_EndTag_Start<B::Base>(cur())) {
 
2376
                                        Parse_WF_EndTag(nameID);
 
2377
                                }
 
2378
                                else {
 
2379
                                        Validity_Error(vErr_elementvalid);
 
2380
                                }
 
2381
                        }
 
2382
                        break;
 
2383
                case cm_Any:
 
2384
                        if (!is_emptyStartTag) {
 
2385
                                Parse_AnyContent();
 
2386
                                Parse_WF_EndTag(nameID);
 
2387
                        }
 
2388
                        break;
 
2389
                case cm_Mixed:
 
2390
                        if (!is_emptyStartTag) {
 
2391
                                Parse_MixedContent(((CM_Mixed *) cm)->elements);
 
2392
                                Parse_WF_EndTag(nameID);
 
2393
                        }
 
2394
                        break;
 
2395
                case cm_RegExp:
 
2396
                        CM_RegExp * cre = (CM_RegExp *) cm;
 
2397
                        int content_state = 0;
 
2398
                        if (!is_emptyStartTag) {
 
2399
                                Parse_ValidContent(cre, content_state);
 
2400
                                #ifdef DEBUG
 
2401
                                printf("Final content_state = %i, nameID = %i\n", content_state, nameID);
 
2402
                                #endif
 
2403
                                Parse_WF_EndTag(nameID);
 
2404
                        }
 
2405
                        if (cre->transition_map[content_state][0]==0) {
 
2406
                                Validity_Error(vErr_elementvalid);
 
2407
                        }
 
2408
        }
 
2409
        return nameID;
 
2410
}
 
2411
 
 
2412
template <class B, WorkingCharacterSet W>
 
2413
void ParsingEngine<B, W>::Parse_ValidContent(CM_RegExp * cre, int & cur_state) {
 
2414
        do {
 
2415
                ScanTo(NonWS);
 
2416
                /* If non-null report WS  WS_action()? */
 
2417
                text_or_markup_start = AbsPos();
 
2418
                if (at_EndTag_Start<B::Base>(cur())) {
 
2419
                        break;
 
2420
                }
 
2421
                else if (at_ElementTag_Start<B::Base>(cur())) {
 
2422
                        int nameID = Parse_ValidElement();
 
2423
#ifdef DEBUG
 
2424
                        printf("Content model state transition %i", cur_state);
 
2425
#endif
 
2426
                        cur_state = cre->transition_map[cur_state][nameID];
 
2427
#ifdef DEBUG
 
2428
                        printf("-> %i\n", cur_state);
 
2429
#endif
 
2430
                }
 
2431
                else if (at_Comment_Start<B::Base>(cur())) {
 
2432
                        Parse_Comment();
 
2433
                }
 
2434
                else if (at_PI_Start<B::Base>(cur())) {
 
2435
                        Parse_PI();
 
2436
                }
 
2437
                else if (AtChar<B::Base,'&'>(cur())) {
 
2438
                        Parse_ValidEntityRef(cre, cur_state);
 
2439
#ifdef DEBUG
 
2440
                        printf("EntityRef complete, cur_state = %i\n", cur_state);
 
2441
#endif
 
2442
 
 
2443
                }
 
2444
                else if (at_EOF()) {
 
2445
                        break;
 
2446
                }
 
2447
                else if (AtChar<B::Base,'<'>(cur())) {
 
2448
                        Syntax_Error(NT_markupdecl);
 
2449
                }
 
2450
                else {
 
2451
                        Validity_Error(vErr_elementvalid);
 
2452
                }
 
2453
        } while(1);
 
2454
}
 
2455
 
 
2456
 
 
2457
template <class B, WorkingCharacterSet W>
 
2458
void ParsingEngine<B, W>::Parse_AnyContent() {
 
2459
        do {
 
2460
                text_or_markup_start = AbsPos();
 
2461
                ScanTextTo(MarkupStart); /* '<', '&', or ']' for ']]>' test */
 
2462
                if (at_ElementTag_Start<B::Base>(cur())) {
 
2463
                        text_if_nonnull_action(false);
 
2464
                        int nameID = Parse_ValidElement();
 
2465
                }
 
2466
                else if (at_EndTag_Start<B::Base>(cur())) {
 
2467
                        text_if_nonnull_action(false);
 
2468
                        return;
 
2469
                }
 
2470
                else if (at_Comment_Start<B::Base>(cur())) {
 
2471
                        text_if_nonnull_action(false);
 
2472
                        Parse_Comment();
 
2473
                }
 
2474
                else if (at_CharRef_Start<B::Base>(cur())) {
 
2475
                        text_if_nonnull_action(true);
 
2476
                        Parse_CharRef();
 
2477
                }
 
2478
                else if (AtChar<B::Base,'&'>(cur())) {
 
2479
                        text_if_nonnull_action(true);
 
2480
                        Parse_EntityRef_inAnyContent();
 
2481
                }
 
2482
                else if (at_CDATA_Start<B::Base>(cur())) {
 
2483
                        text_if_nonnull_action(true);
 
2484
                        Parse_CDATA();
 
2485
                }
 
2486
                else if (at_PI_Start<B::Base>(cur())) {
 
2487
                        text_if_nonnull_action(false);
 
2488
                        Parse_PI();
 
2489
                }
 
2490
                else if (at_CDATA_End<B::Base>(cur())) {
 
2491
                        text_if_nonnull_action(true);
 
2492
                        Advance(3);
 
2493
                        Syntax_Error(NT_CharData);
 
2494
                }
 
2495
                else if (at_EOF()) {
 
2496
                        text_if_nonnull_action(false);
 
2497
                        return;
 
2498
                }
 
2499
                else if (AtChar<B::Base,'<'>(cur())) {
 
2500
                        Syntax_Error(NT_markupdecl);
 
2501
                }
 
2502
                else {
 
2503
                        Advance(1);
 
2504
                        continue;
 
2505
                }
 
2506
        } while (1);
 
2507
}
 
2508
template <class B, WorkingCharacterSet W>
 
2509
void ParsingEngine<B, W>::Parse_MixedContent(symbol_set_t elems) {
 
2510
        do {
 
2511
                text_or_markup_start = AbsPos();
 
2512
                ScanTextTo(MarkupStart); /* '<', '&', or ']' for ']]>' test */
 
2513
/*              if (AtChar<B::Base,'<'>(cur())) {
 
2514
                        text_if_nonnull_action();
 
2515
                        Parse_Markup<B, W>();
 
2516
                }*/
 
2517
                if (at_ElementTag_Start<B::Base>(cur())) {
 
2518
                        text_if_nonnull_action(false);
 
2519
                        int nameID = Parse_ValidElement();
 
2520
                        if (elems[nameID] == 0) {
 
2521
                                Validity_Error(vErr_elementvalid);
 
2522
                        }
 
2523
                }
 
2524
                else if (at_EndTag_Start<B::Base>(cur())) {
 
2525
                        text_if_nonnull_action(false);
 
2526
                        return;
 
2527
                }
 
2528
                else if (at_Comment_Start<B::Base>(cur())) {
 
2529
                        text_if_nonnull_action(false);
 
2530
                        Parse_Comment();
 
2531
                }
 
2532
                else if (at_CharRef_Start<B::Base>(cur())) {
 
2533
                        text_if_nonnull_action(true);
 
2534
                        Parse_CharRef();
 
2535
                }
 
2536
                else if (AtChar<B::Base,'&'>(cur())) {
 
2537
                        text_if_nonnull_action(true);
 
2538
                        Parse_EntityRef_inMixed(elems);
 
2539
                }
 
2540
                else if (at_CDATA_Start<B::Base>(cur())) {
 
2541
                        text_if_nonnull_action(true);
 
2542
                        Parse_CDATA();
 
2543
                }
 
2544
                else if (at_PI_Start<B::Base>(cur())) {
 
2545
                        text_if_nonnull_action(false);
 
2546
                        Parse_PI();
 
2547
                }
 
2548
                else if (at_CDATA_End<B::Base>(cur())) {
 
2549
                        text_if_nonnull_action(true);
 
2550
                        Advance(3);
 
2551
                        Syntax_Error(NT_CharData);
 
2552
                }
 
2553
                else if (at_EOF()) {
 
2554
                        text_if_nonnull_action(false);
 
2555
                        return;
 
2556
                }
 
2557
                else if (AtChar<B::Base,'<'>(cur())) {
 
2558
                        Syntax_Error(NT_markupdecl);
 
2559
                }
 
2560
                else {
 
2561
                        Advance(1);
 
2562
                        continue;
 
2563
                }
 
2564
        } while (1);
 
2565
}
 
2566
 
 
2567
 
 
2568
template <class B, WorkingCharacterSet W>
 
2569
int ParsingEngine<B, W>::Parse_Name() {
 
2570
        int name_pos = AbsPos();
 
2571
        ScanTo(NameFollow);
 
2572
        int lgth = AbsPos()-name_pos;
 
2573
        int nameID = Parser_Interface<W>::model_info->symbol_table->ASCII_Lookup_or_Insert_Name(&((char *) x8data)[buffer_rel_pos-lgth], lgth);
 
2574
        if (nameID != 0) return nameID;
 
2575
        else {
 
2576
                int u8_lgth = byteplex->UTF8_Length(name_pos, lgth);
 
2577
                char * u8_ptr = Parser_Interface<W>::model_info->symbol_table->ReserveSymbolSpace(u8_lgth);
 
2578
                byteplex->to_UTF8(name_pos, lgth, u8_ptr);
 
2579
                return Parser_Interface<W>::model_info->symbol_table->LookupOrInsertReserved();
 
2580
        }
 
2581
}
 
2582
 
 
2583
// template <>
 
2584
// int ParsingEngine< X8_Buffer<EBCDIC>, UTF_8 >::Parse_Name() {
 
2585
//      int name_pos = AbsPos();
 
2586
//      ScanTo(NameFollow);
 
2587
//      int lgth = AbsPos()-name_pos;
 
2588
// //   int nameID = local_EBCDIC_table->Lookup_or_Insert(GetCodeUnitPtr(name_pos), lgth);
 
2589
// //   if (nameID != 0) return nameID;
 
2590
// //   else {
 
2591
//              int u8_lgth = byteplex->UTF8_Length(name_pos, lgth);
 
2592
//              char * u8_ptr = Parser_Interface<UTF_8>::model_info->symbol_table->ReserveSymbolSpace(u8_lgth);
 
2593
//              byteplex->to_UTF8(name_pos, lgth, u8_ptr);
 
2594
//              return Parser_Interface<UTF_8>::model_info->symbol_table->LookupOrInsertReserved();
 
2595
// //   }
 
2596
// }
 
2597
 
 
2598
// template <WorkingCharacterSet W>
 
2599
// inline int ParsingEngine<UTF8_Buffer, W>::Parse_Name() {
 
2600
//      int name_pos = AbsPos();
 
2601
//      ScanTo(NameFollow);
 
2602
//      int lgth = AbsPos()-name_pos;
 
2603
//      return Parser_Interface<UTF_8>::model_info->symbol_table->UTF8_Lookup_or_Insert_Name(&((char *)x8data)[buffer_rel_pos-lgth], lgth);
 
2604
// }
 
2605
 
 
2606
template <>
 
2607
inline int ParsingEngine<UTF8_Buffer, UTF_8>::Parse_Name() {
 
2608
        int name_pos = AbsPos();
 
2609
        ScanTo(NameFollow);
 
2610
        int lgth = AbsPos()-name_pos;
 
2611
        return Parser_Interface<UTF_8>::model_info->symbol_table->UTF8_Lookup_or_Insert_Name(&((char *)x8data)[buffer_rel_pos-lgth], lgth);
 
2612
}
 
2613
 
 
2614
template <class B, WorkingCharacterSet W>
 
2615
int ParsingEngine<B, W>::Parse_Nmtoken() {
 
2616
        int name_pos = AbsPos();
 
2617
        ScanTo(NameFollow);
 
2618
        int lgth = AbsPos()-name_pos;
 
2619
        int nameID = Parser_Interface<W>::model_info->symbol_table->ASCII_Lookup_or_Insert_Nmtoken(&((char *) x8data)[buffer_rel_pos-lgth], lgth);
 
2620
        if (nameID != 0) return nameID;
 
2621
        else {
 
2622
                int u8_lgth = byteplex->UTF8_Length(name_pos, lgth);
 
2623
                char * u8_ptr = Parser_Interface<W>::model_info->symbol_table->ReserveSymbolSpace(u8_lgth);
 
2624
                byteplex->to_UTF8(name_pos, lgth, u8_ptr);
 
2625
                return Parser_Interface<W>::model_info->symbol_table->LookupOrInsertReserved_nmtoken();
 
2626
        }
 
2627
}
 
2628
 
 
2629
/*template <>
 
2630
int ParsingEngine< X8_Buffer<EBCDIC>, UTF_8 >::Parse_Nmtoken() {
 
2631
        int name_pos = AbsPos();
 
2632
        ScanTo(NameFollow);
 
2633
        int lgth = AbsPos()-name_pos;
 
2634
//      int nameID = local_EBCDIC_table->Lookup_or_Insert(GetCodeUnitPtr(name_pos), lgth);
 
2635
//      if (nameID != 0) return nameID;
 
2636
//      else {
 
2637
                int u8_lgth = byteplex->UTF8_Length(name_pos, lgth);
 
2638
                char * u8_ptr = Parser_Interface<UTF_8>::model_info->symbol_table->ReserveSymbolSpace(u8_lgth);
 
2639
                byteplex->to_UTF8(name_pos, lgth, u8_ptr);
 
2640
                return Parser_Interface<UTF_8>::model_info->symbol_table->LookupOrInsertReserved_nmtoken();
 
2641
//      }
 
2642
}*/
 
2643
// template <>
 
2644
// int ParsingEngine<UTF8_Buffer, UTF_8>::Parse_Nmtoken() {
 
2645
//      int name_pos = AbsPos();
 
2646
//      ScanTo(NameFollow);
 
2647
//      int lgth = AbsPos()-name_pos;
 
2648
//      return Parser_Interface<UTF_8>::model_info->symbol_table->UTF8_Lookup_or_Insert_Nmtoken(&((char *)x8data)[buffer_rel_pos-lgth], lgth);
 
2649
// }
 
2650
 
 
2651
template <class B, WorkingCharacterSet W>
 
2652
void ParsingEngine<B, W>::Parse_DocumentContent() {
 
2653
        Parser_Interface<W>::DocumentStart_action();
 
2654
#if (VALIDATION_MODE == ON)
 
2655
        int cur_state = 0;
 
2656
        Parse_ValidContent(Parser_Interface<W>::model_info->rootModel, cur_state);
 
2657
        if (Parser_Interface<W>::model_info->rootModel->transition_map[cur_state][0]==0) {
 
2658
                Validity_Error(vErr_elementvalid);
 
2659
        }
 
2660
#endif
 
2661
#if (VALIDATION_MODE == OFF)
 
2662
        Parse_WF_Element();
 
2663
        ScanTo(NonWS);
 
2664
        while(at_Comment_Start<B::Base>(cur()) || at_PI_Start<B::Base>(cur()) ){
 
2665
                if (at_Comment_Start<B::Base>(cur()))
 
2666
                        Parse_Comment();
 
2667
                else
 
2668
                        Parse_PI();
 
2669
                ScanTo(NonWS);
 
2670
        }
 
2671
        if (!at_EOF()) {
 
2672
                Syntax_Error(NT_document);
 
2673
        }
 
2674
#endif
 
2675
        Parser_Interface<W>::DocumentEnd_action();
 
2676
}
 
2677
 
 
2678
#ifdef MARKUP_PASS_CONTROL
 
2679
// Test routine as an alternative to MarkupPass.
 
2680
template <class B, WorkingCharacterSet W>
 
2681
void ParsingEngine<B, W>::ParseContent() {
 
2682
        int start_code = 0;
 
2683
        int end_code = 0;
 
2684
        int charref_code = 0;
 
2685
        int general_ref_code = 0;
 
2686
        DocumentStart_action();
 
2687
        bool is_emptyStartTag = false;
 
2688
        do {
 
2689
                text_or_markup_start = AbsPos();
 
2690
                ScanTo(MarkupStart); /* '<', '&', or ']' for 0b11']]>' test */
 
2691
/*              if (AtChar<B::Base,'<'>(cur())) {
 
2692
                        text_if_nonnull_action();
 
2693
                        Parse_Markup<B, W>();
 
2694
                }*/
 
2695
                if (at_EndTag_Start<B::Base>(cur())) {
 
2696
                        end_code |= AbsPos();
 
2697
                }
 
2698
                else if (AtChar<B::Base,'<'>(cur())) {
 
2699
                        start_code += AbsPos();
 
2700
                }
 
2701
                else if (at_CharRef_Start<B::Base>(cur())) {
 
2702
                        charref_code += 1;
 
2703
                }
 
2704
                else  if (AtChar<B::Base,'&'>(cur())) {
 
2705
                        general_ref_code += 1;
 
2706
                }
 
2707
                else if (at_EOF()) break;
 
2708
                Advance(1);
 
2709
        } while (1);
 
2710
        printf("Start_code: %i\n", start_code);
 
2711
        printf("End_code: %i\n", end_code);
 
2712
        printf("general_ref_code: %i\n", general_ref_code);
 
2713
        printf("charref_code: %i\n", charref_code);
 
2714
        DocumentEnd_action();
 
2715
}
 
2716
#endif
 
2717
 
 
2718
#ifdef MARKUP_SORTING
 
2719
// Little endian two-bit codes for the [&#/] stream, listed in numeric
// order.  ParseContent reads the pair at each markup start position
// with GetBitPair and uses it as a row index, so the values must stay
// within 0..3.
// NOTE(review): presumably bit 0 is set when the markup start character
// is itself in [&#/] (i.e. '&') and bit 1 when the following character
// is ('/' after '<', '#' after '&') -- confirm against the bit lexer.
enum MarkupSortCodes {
  StartTagTwoBitCode = 0,  // presumably '<' followed by neither '/' nor '#'
  GeneralRefCode = 1,      // presumably '&' not followed by '#'
  EndTagTwoBitCode = 2,    // presumably "</"
  CharRefCode = 3          // presumably "&#"
};
 
2726
 
 
2727
 
 
2728
static inline int GetBitPair(SIMD_type * stream, int bit_posn) {
 
2729
        return bitstream_segment_from(stream, bit_posn) & 3;
 
2730
}
 
2731
 
 
2732
template <class B, WorkingCharacterSet W>
 
2733
void ParsingEngine<B, W>::ParseContent() {
 
2734
/*vector<int> MarkupPositions[4];*/
 
2735
int MarkupPositions[4][BUFFER_SIZE];
 
2736
int MarkupCounts[4];
 
2737
        int start_code = 0;
 
2738
        int end_code = 0;
 
2739
        int charref_code = 0;
 
2740
        int general_ref_code = 0;
 
2741
 
 
2742
        DocumentStart_action();
 
2743
        bool is_emptyStartTag = false;
 
2744
                for (int i = 0; i < 4; i++) MarkupCounts[i] = 0;
 
2745
                text_or_markup_start = AbsPos();
 
2746
        do {
 
2747
                unsigned long segment = bitstream_segment_from(buf->item_stream[MarkupStart], buffer_rel_pos);
 
2748
//printf("buffer_rel_pos = %i, segment = %x\n", buffer_rel_pos, segment);
 
2749
                if (segment != 0) {
 
2750
                        buffer_rel_pos += cfzl(segment);
 
2751
                text_or_markup_start = AbsPos();
 
2752
                        int markup_code = GetBitPair(buf->item_stream[AmpHashSlash], buffer_rel_pos);
 
2753
                        MarkupPositions[markup_code][MarkupCounts[markup_code]] = AbsPos();
 
2754
                        MarkupCounts[markup_code]++;
 
2755
                        Advance(1);
 
2756
                }
 
2757
                else {
 
2758
                        buffer_rel_pos = (buffer_rel_pos & -8) + 8*sizeof(unsigned long);
 
2759
// printf("buffer_rel_pos = %i, segment = %x\n", buffer_rel_pos, segment);
 
2760
 
 
2761
                        if (buffer_rel_pos >= buffer_limit_pos) {
 
2762
/*                              for (int i = 0; i < MarkupCounts[StartTagTwoBitCode]; i++) {
 
2763
                                        start_code += MarkupPositions[StartTagTwoBitCode][i];
 
2764
                                }
 
2765
                                for (int i = 0; i < MarkupCounts[EndTagTwoBitCode]; i++) {
 
2766
                                        end_code |= MarkupPositions[EndTagTwoBitCode][i];
 
2767
                                }
 
2768
                                for (int i = 0; i < MarkupCounts[GeneralRefCode]; i++) {
 
2769
                                        general_ref_code += 1;
 
2770
                                }
 
2771
                                for (int i = 0; i < MarkupCounts[CharRefCode]; i++) {
 
2772
                                        charref_code += 1;
 
2773
                                }*/
 
2774
/*      printf("Start_code: %i\n", start_code);
 
2775
        printf("End_code: %i\n", end_code);
 
2776
        printf("general_ref_code: %i\n", general_ref_code);
 
2777
        printf("charref_code: %i\n", charref_code);*/
 
2778
                                for (int i = 0; i < 4; i++) MarkupCounts[i] = 0;
 
2779
                                if (buffer_rel_pos >= BUFFER_SIZE) {
 
2780
                                        AdjustBufferEndForIncompleteSequences();
 
2781
                                        Parser_Interface<W>::FinalizeBuffer_action();
 
2782
                                        AdvanceBuffers();
 
2783
                                }
 
2784
                                else break;
 
2785
                        }
 
2786
 
 
2787
                }
 
2788
 
 
2789
        } while (1);
 
2790
/*      vector<int>::iterator i;
 
2791
        for (i = MarkupPositions[StartTagTwoBitCode].begin(); i != MarkupPositions[StartTagTwoBitCode].end(); i++) {
 
2792
                start_code += *i;
 
2793
        }
 
2794
        for (i = MarkupPositions[EndTagTwoBitCode].begin(); i != MarkupPositions[EndTagTwoBitCode].end(); i++) {
 
2795
                end_code |= *i;
 
2796
        }
 
2797
        for (i = MarkupPositions[GeneralRefCode].begin(); i != MarkupPositions[GeneralRefCode].end(); i++) {
 
2798
                general_ref_code += 1;
 
2799
        }
 
2800
        for (i = MarkupPositions[CharRefCode].begin(); i != MarkupPositions[CharRefCode].end(); i++) {
 
2801
                charref_code += 1;
 
2802
        }*/
 
2803
        printf("Start_code: %i\n", start_code);
 
2804
        printf("End_code: %i\n", end_code);
 
2805
        printf("general_ref_code: %i\n", general_ref_code);
 
2806
        printf("charref_code: %i\n", charref_code);
 
2807
        DocumentEnd_action();
 
2808
}
 
2809
 
 
2810
#endif
 
2811
 
 
2812
 
 
2813
 
 
2814