/xmlbench/trunk

To get this branch, use:
bzr branch http://darksoft.org/webbzr/xmlbench/trunk

« back to all changes in this revision

Viewing changes to parse/parabix.20090211/src/engine.c

  • Committer: Suren A. Chilingaryan
  • Date: 2009-02-16 09:27:17 UTC
  • Revision ID: csa@dside.dyndns.org-20090216092717-wipyvaaw2srxhgns
Initial import

Show diffs side-by-side

added added

removed removed

Lines of Context:
 
1
/*  engine.c - Parabix XML parsing engine.
 
2
    Copyright (c) 2007, 2008, Robert D. Cameron and Dan Lin.
 
3
    Licensed to the public under the Open Software License 3.0.
 
4
    Licensed to International Characters, Inc., under the Academic
 
5
    Free License 3.0.
 
6
*/
 
7
 
 
8
#include "engine.h"
 
9
#include "byteplex.h"
 
10
#include "xmldecl.h"
 
11
#include "bytelex.h"
 
12
#include "bitlex.h"
 
13
#include "contentmodel.h"
 
14
#include "contentmodel.c"
 
15
#include "xml_error.h"
 
16
 
 
17
#include <assert.h>
 
18
#include <stdlib.h>
 
19
#include <errno.h>
 
20
#include <string.h>
 
21
#include <string>
 
22
#include <iostream>
 
23
using namespace std;
 
24
        
 
25
/* Return a newly allocated, NUL-terminated copy of the first lgth bytes
   found at s.  The storage comes from new char[], so it has to be released
   with delete[]. */
inline char * copy_string (unsigned char * s, int lgth){
    char * duplicate = new char[lgth+1];
    duplicate[lgth] = '\0';
    memcpy(duplicate, (char *) s, lgth);
    return duplicate;
}
 
31
 
 
32
/* Return a newly allocated, NUL-terminated string made of the first lgth1
   bytes of s1 followed by the first lgth2 bytes of s2.  The storage comes
   from new char[], so it has to be released with delete[]. */
inline char * cat_string (char * s1, char * s2, int lgth1, int lgth2){
    const int total = lgth1 + lgth2;
    char * joined = new char[total + 1];
    char * tail = joined + lgth1;
    memcpy(joined, s1, lgth1);
    memcpy(tail, s2, lgth2);
    joined[total] = '\0';
    return joined;
}
 
39
        
 
40
        
 
41
template <WorkingCharacterSet W>        
 
42
Parser_Interface<W> * Parser_Interface<W>::ParserFactory(char * filename) {
 
43
        
 
44
        int chars_read;
 
45
        unsigned char signature[4];
 
46
        FILE * infile;
 
47
        infile = fopen(filename, "rb");
 
48
        if (!infile) {
 
49
                fprintf(stderr, "Error: cannot open %s for input.\n", filename);
 
50
                exit(-1);
 
51
        }
 
52
        fread(signature,1,4,infile);
 
53
        Entity_Info * e = new Entity_Info;
 
54
        Model_Info * m = new Model_Info;
 
55
        e->AnalyzeSignature(signature);
 
56
        Byteplex * b = Byteplex::ByteplexFactory(e, infile);
 
57
        b->InitializeBuffer(signature,4);
 
58
        b->DoByteplex();
 
59
        b->PreparePseudoASCII_Stream();
 
60
        
 
61
        if (e->code_unit_base == ASCII) {
 
62
                XML_Decl_Parser<ASCII> decl_parser(b);
 
63
                decl_parser.ReadXMLInfo(*e);
 
64
                if (e->code_unit_size == SingleByte) {
 
65
                        if (!(e->has_encoding_decl) || at_UTF_8(e->encoding))
 
66
                                return new ParsingEngine< UTF8_Buffer, W>(e, m, b, false);              
 
67
                        else return new ParsingEngine< X8_Buffer<ASCII>, W>(e, m, b, false);
 
68
                }
 
69
                else if (e->code_unit_size == DoubleByte) {
 
70
                        return new ParsingEngine<U16_Buffer, W>(e, m, b, false);
 
71
                }
 
72
                else if (e->code_unit_size == QuadByte) {
 
73
                        return new ParsingEngine<U32_Buffer, W>(e, m, b, false);
 
74
                }
 
75
        }
 
76
        else /* if (e->code_unit_base == EBCDIC) */ {
 
77
                XML_Decl_Parser<EBCDIC> decl_parser(b);
 
78
                decl_parser.ReadXMLInfo(*e);
 
79
                return new ParsingEngine< X8_Buffer<EBCDIC>, W>(e, m, b, false);
 
80
        }       
 
81
}
 
82
 
 
83
template <WorkingCharacterSet W>
 
84
Parser_Interface<W> * Parser_Interface<W>::ParserFactory(char * filename, Model_Info * m) {
 
85
        
 
86
        int chars_read;
 
87
        unsigned char signature[4];
 
88
        FILE * infile;
 
89
        infile = fopen(filename, "rb");
 
90
        if (!infile) {
 
91
                fprintf(stderr, "Error: cannot open %s for input.\n", filename);
 
92
                exit(-1);
 
93
        }
 
94
        fread(signature,1,4,infile);
 
95
        Entity_Info * e = new Entity_Info;
 
96
        e->AnalyzeSignature(signature);
 
97
        Byteplex * b = Byteplex::ByteplexFactory(e, infile);
 
98
        b->InitializeBuffer(signature,4);
 
99
        b->DoByteplex();
 
100
        b->PreparePseudoASCII_Stream();
 
101
        if (e->code_unit_base == ASCII) {
 
102
                XML_Decl_Parser<ASCII> decl_parser(b);
 
103
                decl_parser.ReadXMLInfo(*e);
 
104
                if (e->code_unit_size == SingleByte) {
 
105
                        return new ParsingEngine< X8_Buffer<ASCII>, W>(e, m, b, true);
 
106
                }
 
107
                else if (e->code_unit_size == DoubleByte) {
 
108
                        return new ParsingEngine<U16_Buffer, W>(e, m, b, true);
 
109
                }
 
110
                else if (e->code_unit_size == QuadByte) {
 
111
                        return new ParsingEngine<U32_Buffer, W>(e, m, b, true);
 
112
                }
 
113
        }
 
114
        else /* if (e->code_unit_base == EBCDIC) */ {
 
115
                XML_Decl_Parser<EBCDIC> decl_parser(b);
 
116
                decl_parser.ReadXMLInfo(*e);
 
117
                return new ParsingEngine< X8_Buffer<EBCDIC>, W>(e, m, b, true);
 
118
        }       
 
119
}
 
120
 
 
121
template <WorkingCharacterSet W>
 
122
Parser_Interface<W> * Parser_Interface<W>::ParserFactory(char * byte_buffer, int byte_count, Entity_Info * e1, Model_Info * m){
 
123
        Entity_Info * e = new Entity_Info;
 
124
        e->BOM_units = 0;
 
125
        e->code_unit_base=e1->code_unit_base;
 
126
        e->code_unit_size=e1->code_unit_size;
 
127
        e->version=e1->version;
 
128
        e->encoding=e1->encoding;
 
129
        e->content_start = 0;
 
130
        Byteplex * b = Byteplex::ByteplexFactory(e, (unsigned char *) byte_buffer, byte_count);
 
131
        b->DoByteplex();
 
132
        b->PreparePseudoASCII_Stream();
 
133
        if (e->code_unit_base == ASCII) {
 
134
                XML_Decl_Parser<ASCII> decl_parser(b);
 
135
                decl_parser.ReadXMLInfo(*e);
 
136
                if (e->code_unit_size == SingleByte) {
 
137
                    puts("ASCII");
 
138
                        return new ParsingEngine< X8_Buffer<ASCII>, W>(e, m, b, false);
 
139
                }
 
140
                else if (e->code_unit_size == DoubleByte) {
 
141
                        return new ParsingEngine<U16_Buffer, W>(e, m, b, false);
 
142
                }
 
143
                else if (e->code_unit_size == QuadByte) {
 
144
                        return new ParsingEngine<U32_Buffer, W>(e, m, b, false);
 
145
                }
 
146
        }
 
147
        else /* if (e->code_unit_base == EBCDIC) */ {
 
148
                return new ParsingEngine< X8_Buffer<EBCDIC>, W>(e, m, b, false);
 
149
        }       
 
150
}
 
151
 
 
152
template <WorkingCharacterSet W>
 
153
Parser_Interface<W>::~Parser_Interface() {
 
154
}
 
155
 
 
156
 
 
157
template <WorkingCharacterSet W>
 
158
bool Parser_Interface<W>::has_ByteOrderMark() {
 
159
        return entity_Info->BOM_units > 0;
 
160
}
 
161
 
 
162
template <WorkingCharacterSet W>
 
163
XML_version Parser_Interface<W>::get_version() {
 
164
        return entity_Info->version;
 
165
}
 
166
 
 
167
template <WorkingCharacterSet W>
 
168
XML_standalone Parser_Interface<W>::standalone_status() {
 
169
        return entity_Info->standalone;
 
170
}
 
171
 
 
172
template <WorkingCharacterSet W>
 
173
bool Parser_Interface<W>::has_EncodingDecl() {
 
174
        return entity_Info->has_encoding_decl;
 
175
}
 
176
 
 
177
template <WorkingCharacterSet W>
 
178
unsigned char * Parser_Interface<W>::get_Encoding() {
 
179
        return entity_Info->encoding;
 
180
}
 
181
 
 
182
template <class B, WorkingCharacterSet W>
 
183
inline unsigned char * ParsingEngine<B, W>::GetCodeUnitPtr(int pos) {
 
184
        int rel_pos = pos - buffer_base_pos;
 
185
        return &((unsigned char *) (byteplex->src_buffer))[rel_pos * (int) B::Size];
 
186
}
 
187
 
 
188
template <>
 
189
inline unsigned char * ParsingEngine<UTF8_Buffer, UTF_8>::GetCodeUnitPtr(int pos) {
 
190
        int rel_pos = pos - buffer_base_pos;
 
191
        return &((unsigned char *) (x8data))[rel_pos];
 
192
}
 
193
 
 
194
 
 
195
 
 
196
 
 
197
template <class B, WorkingCharacterSet W>
 
198
ParsingEngine<B, W>::ParsingEngine(Entity_Info * e, Model_Info * m, Byteplex * b, bool is_external) : Parser_Interface<W> () {
 
199
        Parser_Interface<W>::entity_Info = e;
 
200
        Parser_Interface<W>::model_info = m;
 
201
        byteplex = b;
 
202
 
 
203
//      m->symbol_table = new Symbol_Table();
 
204
//      m->SimpleEntity("lt", "<"); 
 
205
//      m->SimpleEntity("gt", ">"); 
 
206
//      m->SimpleEntity("amp", "&"); 
 
207
//      m->SimpleEntity("quot", "\""); 
 
208
//      m->SimpleEntity("apos", "'");   
 
209
        m->symbol_table->version = e->version;
 
210
 
 
211
        StrictWellFormedness=false;
 
212
        LastAttOccurrence.assign(m->globalAttributeCount+1, 0);
 
213
        
 
214
        
 
215
        bitplex = new Bitplex;
 
216
        buf = (LexicalStreamSet *) simd_new(sizeof(LexicalStreamSet)/PACKSIZE);
 
217
 
 
218
  /* Install sentinels for every lexical item stream*/
 
219
#ifdef TEMPLATED_SIMD_LIB
 
220
        BitBlock sentinel_value = simd<1>::constant<1>();
 
221
#endif
 
222
#ifndef TEMPLATED_SIMD_LIB
 
223
        BitBlock sentinel_value = simd_const_1(1);
 
224
#endif
 
225
 
 
226
#ifdef OPTIMIZE_SHORT_SCAN
 
227
        sentinel_value = sisd_sfli(sentinel_value, 8*sizeof(unsigned long));
 
228
#endif
 
229
 
 
230
        for (int j = minLexicalItem; j < LexicalItemCount; j++) {
 
231
                buf->item_stream[j][BUFFER_BLOCKS] = sentinel_value;
 
232
        }
 
233
 
 
234
        buffer_base_pos = 0;
 
235
        buffer_rel_pos = e->content_start;
 
236
        buffer_limit_pos = min(BUFFER_SIZE, byteplex->units_in_buffer);
 
237
        int blocks_in_buffer = (buffer_limit_pos + BLOCKSIZE - 1)/BLOCKSIZE;
 
238
        x8data = byteplex->x8data;
 
239
        lexer = Lexer<B::Base>::LexerFactory(e, buf);
 
240
        bitplex->TransposeToBitStreams(byteplex->x8data, blocks_in_buffer);
 
241
        lexer->AnalyzeBuffer(bitplex->x8basis, buffer_base_pos, buffer_rel_pos, buffer_limit_pos);
 
242
}
 
243
 
 
244
template <class B, WorkingCharacterSet W>
 
245
ParsingEngine<B, W>::~ParsingEngine() {
 
246
  // How do we do this?  Parser_Interface<W>::model_info->~Model_Info();
 
247
  Parser_Interface<W>::entity_Info->~Entity_Info();
 
248
  byteplex->~Byteplex();
 
249
  bitplex->~Bitplex();
 
250
  simd_delete((SIMD_type *) buf);
 
251
  lexer->~Lexer_Interface();
 
252
}
 
253
 
 
254
template <class B, WorkingCharacterSet W>
 
255
void ParsingEngine<B, W>::AdvanceBuffers(){
 
256
#if defined(PAPI) and defined(CODE_CLOCKING) and (CODE_CLOCKING == ADVANCE_BUFFERS)
 
257
        code_clocker->cc_start_interval();
 
258
#endif
 
259
 
 
260
        int advance_amt = text_or_markup_start - buffer_base_pos;
 
261
        advance_amt &= -PACKSIZE; // maintain alignment
 
262
        byteplex->AdvanceInputBuffer(advance_amt);
 
263
        buffer_base_pos += advance_amt;
 
264
        buffer_rel_pos -= advance_amt;
 
265
        buffer_limit_pos = min(BUFFER_SIZE, byteplex->units_in_buffer);
 
266
        int blocks_in_buffer = (buffer_limit_pos + BLOCKSIZE - 1)/BLOCKSIZE;
 
267
#if defined(PAPI) and defined(CODE_CLOCKING) and (CODE_CLOCKING == BYTEPLEX)
 
268
        code_clocker->cc_start_interval();
 
269
#endif
 
270
        byteplex->DoByteplex();
 
271
        byteplex->PreparePseudoASCII_Stream();
 
272
#if defined(PAPI) and defined(CODE_CLOCKING) and (CODE_CLOCKING == BYTEPLEX)
 
273
        code_clocker->cc_end_interval(buffer_limit_pos);
 
274
#endif
 
275
#if defined(PAPI) and defined(CODE_CLOCKING) and (CODE_CLOCKING == BITPLEX)
 
276
        code_clocker->cc_start_interval();
 
277
#endif
 
278
        bitplex->TransposeToBitStreams(byteplex->x8data, blocks_in_buffer);
 
279
#if defined(PAPI) and defined(CODE_CLOCKING) and (CODE_CLOCKING == BITPLEX)
 
280
        code_clocker->cc_end_interval(buffer_limit_pos);
 
281
#endif
 
282
        lexer->AnalyzeBuffer(bitplex->x8basis, buffer_base_pos, buffer_rel_pos, buffer_limit_pos);
 
283
#if defined(PAPI) and defined(CODE_CLOCKING) and (CODE_CLOCKING == ADVANCE_BUFFERS)
 
284
        code_clocker->cc_end_interval(buffer_limit_pos);
 
285
#endif
 
286
 
 
287
}
 
288
 
 
289
template <>
 
290
void ParsingEngine<U16_Buffer, UTF_16>::AdvanceBuffers(){
 
291
#if defined(PAPI) and defined(CODE_CLOCKING) and (CODE_CLOCKING == ADVANCE_BUFFERS)
 
292
        code_clocker->cc_start_interval();
 
293
#endif
 
294
 
 
295
        int advance_amt = text_or_markup_start - buffer_base_pos;
 
296
        advance_amt &= -PACKSIZE; // maintain alignment
 
297
        byteplex->AdvanceInputBuffer(advance_amt);
 
298
        buffer_base_pos += advance_amt;
 
299
        buffer_rel_pos -= advance_amt;
 
300
        buffer_limit_pos = min(BUFFER_SIZE, byteplex->units_in_buffer);
 
301
        int blocks_in_buffer = (buffer_limit_pos + BLOCKSIZE - 1)/BLOCKSIZE;
 
302
#if defined(PAPI) and defined(CODE_CLOCKING) and (CODE_CLOCKING == BYTEPLEX)
 
303
        code_clocker->cc_start_interval();
 
304
#endif
 
305
        byteplex->DoByteplex();
 
306
        if (at_UTF_16(Parser_Interface<UTF_16>::entity_Info->encoding)) ((U16_Buffer *) byteplex)->Validate_UTF16();
 
307
        byteplex->PreparePseudoASCII_Stream();
 
308
#if defined(PAPI) and defined(CODE_CLOCKING) and (CODE_CLOCKING == BYTEPLEX)
 
309
        code_clocker->cc_end_interval(buffer_limit_pos);
 
310
#endif
 
311
#if defined(PAPI) and defined(CODE_CLOCKING) and (CODE_CLOCKING == BITPLEX)
 
312
        code_clocker->cc_start_interval();
 
313
#endif
 
314
        bitplex->TransposeToBitStreams(byteplex->x8data, blocks_in_buffer);
 
315
#if defined(PAPI) and defined(CODE_CLOCKING) and (CODE_CLOCKING == BITPLEX)
 
316
        code_clocker->cc_end_interval(buffer_limit_pos);
 
317
#endif
 
318
        lexer->AnalyzeBuffer(bitplex->x8basis, buffer_base_pos, buffer_rel_pos, buffer_limit_pos);
 
319
#if defined(PAPI) and defined(CODE_CLOCKING) and (CODE_CLOCKING == ADVANCE_BUFFERS)
 
320
        code_clocker->cc_end_interval(buffer_limit_pos);
 
321
#endif
 
322
 
 
323
}
 
324
 
 
325
template <class B, WorkingCharacterSet W>
 
326
inline unsigned char * ParsingEngine<B, W>::cur() const {
 
327
  return &((unsigned char *) x8data)[buffer_rel_pos];
 
328
}
 
329
 
 
330
template <class B, WorkingCharacterSet W>
 
331
inline int ParsingEngine<B, W>::AbsPos() const {
 
332
  return buffer_base_pos + buffer_rel_pos;
 
333
}
 
334
 
 
335
template <class B, WorkingCharacterSet W>
 
336
inline int ParsingEngine<B, W>::LengthFrom(int start_pos) const {
 
337
  return buffer_base_pos + buffer_rel_pos - start_pos;
 
338
}
 
339
 
 
340
 
 
341
 
 
342
template <class B, WorkingCharacterSet W>
 
343
inline int ParsingEngine<B, W>::BufferRelPos() const {
 
344
  return buffer_rel_pos;
 
345
}
 
346
 
 
347
 
 
348
template <class B, WorkingCharacterSet W>
 
349
inline bool ParsingEngine<B, W>::at_EOF() const {
 
350
  return (buffer_rel_pos >= buffer_limit_pos) && 
 
351
         (buffer_limit_pos < BUFFER_SIZE);
 
352
}
 
353
 
 
354
//template <class B, WorkingCharacterSet W>
 
355
//inline void ParsingEngine<B, W>::Advance(int n) {
 
356
//      buffer_rel_pos += n;
 
357
//  if (buffer_rel_pos >= BUFFER_SIZE) {        
 
358
//      Parser_Interface<W>::FinalizeBuffer_action();
 
359
//      AdvanceBuffers();
 
360
//  }
 
361
//}
 
362
 
 
363
/* Move the buffer-relative position forward by n; once it reaches
   BUFFER_SIZE, run FinalizeBuffer_action and advance the buffers.  This is
   a macro, so it can only be expanded inside ParsingEngine<B, W> members
   where W and buffer_rel_pos are in scope. */
#define Advance(n) \
do {\
    buffer_rel_pos += (n); \
    if (buffer_rel_pos >= BUFFER_SIZE) { \
        Parser_Interface<W>::FinalizeBuffer_action();\
        AdvanceBuffers();\
    }\
} while(0)
 
371
 
 
372
 
 
373
template <class B, WorkingCharacterSet W> 
 
374
void ParsingEngine<B, W>::AdjustBufferEndForIncompleteSequences() {
 
375
}
 
376
 
 
377
template <> 
 
378
void ParsingEngine<UTF8_Buffer, UTF_8>::AdjustBufferEndForIncompleteSequences() {
 
379
        if (*(cur()-1) >= 0xC0) buffer_rel_pos--;
 
380
        else if (*(cur()-2) >= 0xE0) buffer_rel_pos -= 2;
 
381
        else if (*(cur()-3) >= 0xF0) buffer_rel_pos -= 3;
 
382
}
 
383
 
 
384
template <> 
 
385
void ParsingEngine<U16_Buffer, UTF_8>::AdjustBufferEndForIncompleteSequences() {
 
386
        unsigned short last_u16_unit = *(GetCodeUnitPtr(AbsPos()-1));
 
387
        if ((last_u16_unit >= 0xD800) & (last_u16_unit <= 0xDC00)) buffer_rel_pos--;
 
388
}
 
389
 
 
390
template <> 
 
391
void ParsingEngine<UTF8_Buffer, UTF_16>::AdjustBufferEndForIncompleteSequences() {
 
392
        if (*(cur()-1) >= 0xC0) buffer_rel_pos--;
 
393
        else if (*(cur()-2) >= 0xE0) buffer_rel_pos -= 2;
 
394
        else if (*(cur()-3) >= 0xF0) buffer_rel_pos -= 3;
 
395
}
 
396
 
 
397
template <> 
 
398
void ParsingEngine<U16_Buffer, UTF_16>::AdjustBufferEndForIncompleteSequences() {
 
399
        unsigned short last_u16_unit = *(GetCodeUnitPtr(AbsPos()-1));
 
400
        if ((last_u16_unit >= 0xD800) & (last_u16_unit <= 0xDC00)) buffer_rel_pos--;
 
401
}
 
402
 
 
403
 
 
404
 
 
405
#ifdef OPTIMIZE_SHORT_SCAN
 
406
//
 
407
//  Inline ScanTo with unrolled first test that should almost always 
 
408
//  succeed for short scans.
 
409
/* Scan forward to the next position marked in the given item stream.
   The first test looks only at the segment starting at the current
   position; when that segment is empty the general bitstream_scan is
   used, advancing the buffers for as long as the scan runs to
   BUFFER_SIZE or beyond.  The branch order is kept as-is on purpose. */
#define ScanTo(item) \
do {\
    unsigned long first_segment = bitstream_segment_from(buf->item_stream[item], buffer_rel_pos);\
    if (first_segment != 0) buffer_rel_pos += cfzl(first_segment);\
    else {\
        buffer_rel_pos = (buffer_rel_pos & -8) + 8*sizeof(unsigned long);\
        buffer_rel_pos = bitstream_scan(buf->item_stream[item], buffer_rel_pos);\
        while (buffer_rel_pos >= BUFFER_SIZE) {\
            buffer_rel_pos = BUFFER_SIZE;\
            AdjustBufferEndForIncompleteSequences();\
            Parser_Interface<W>::FinalizeBuffer_action();\
            AdvanceBuffers();\
            buffer_rel_pos = bitstream_scan(buf->item_stream[item], buffer_rel_pos);\
        }\
    }\
} while(0)
 
425
 
 
426
// The following version seems cleaner, but measured mispredictions are higher
 
427
// #define ScanTo(item) \
 
428
// do {\
 
429
//      unsigned long segment = bitstream_segment_from(buf->item_stream[item], buffer_rel_pos);\
 
430
//      while (unlikely (segment == 0)) {\
 
431
//              buffer_rel_pos = (buffer_rel_pos & -8) + 8*sizeof(unsigned long);\
 
432
//              if (buffer_rel_pos >= BUFFER_SIZE) {\
 
433
//                      buffer_rel_pos = BUFFER_SIZE;\
 
434
//                      AdjustBufferEndForIncompleteSequences();\
 
435
//                      Parser_Interface<W>::FinalizeBuffer_action();\
 
436
//                      AdvanceBuffers();\
 
437
//              }\
 
438
//              segment = bitstream_segment_from(buf->item_stream[item], buffer_rel_pos);\
 
439
//      }\
 
440
//      buffer_rel_pos += cfzl(segment);\
 
441
// } while(0)
 
442
// 
 
443
// #define ScanTextTo(item) \
 
444
// do {\
 
445
//      unsigned long segment = bitstream_segment_from(buf->item_stream[item], buffer_rel_pos);\
 
446
//      text_or_markup_start = AbsPos();\
 
447
//      if (segment != 0) buffer_rel_pos += cfzl(segment);\
 
448
//      else {\
 
449
//              buffer_rel_pos = (buffer_rel_pos & -8) + 8*sizeof(unsigned long);\
 
450
//              buffer_rel_pos = bitstream_scan(buf->item_stream[item], buffer_rel_pos);\
 
451
//              while (buffer_rel_pos >= BUFFER_SIZE) {\
 
452
//                      buffer_rel_pos = BUFFER_SIZE;\
 
453
//                      AdjustBufferEndForIncompleteSequences();\
 
454
//                      Text_action(GetCodeUnitPtr(text_or_markup_start), LengthFrom(text_or_markup_start), true);\
 
455
//                      text_or_markup_start = AbsPos();\
 
456
//                      Parser_Interface<W>::FinalizeBuffer_action();\
 
457
//                      AdvanceBuffers();\
 
458
//                      buffer_rel_pos = bitstream_scan(buf->item_stream[item], buffer_rel_pos);\
 
459
//              }\
 
460
//      }\
 
461
// } while(0)
 
462
 
 
463
template <class B, WorkingCharacterSet W>
 
464
inline void ParsingEngine<B, W>::ScanTextTo(int item) {
 
465
        text_or_markup_start = AbsPos();
 
466
        unsigned long segment = bitstream_segment_from(buf->item_stream[item], buffer_rel_pos);
 
467
        if (segment != 0) buffer_rel_pos += cfzl(segment);
 
468
        else {
 
469
                buffer_rel_pos = (buffer_rel_pos & -8) + 8*sizeof(unsigned long);
 
470
                buffer_rel_pos = bitstream_scan(buf->item_stream[item], buffer_rel_pos);
 
471
                while (buffer_rel_pos >= BUFFER_SIZE) {
 
472
                        buffer_rel_pos = BUFFER_SIZE;
 
473
                        AdjustBufferEndForIncompleteSequences();
 
474
                        Text_action(GetCodeUnitPtr(text_or_markup_start), LengthFrom(text_or_markup_start), true);
 
475
                        text_or_markup_start = AbsPos();
 
476
                        Parser_Interface<W>::FinalizeBuffer_action();
 
477
                        AdvanceBuffers();
 
478
                        buffer_rel_pos = bitstream_scan(buf->item_stream[item], buffer_rel_pos);
 
479
                }
 
480
        }
 
481
}
 
482
 
 
483
#endif
 
484
 
 
485
#ifndef OPTIMIZE_SHORT_SCAN
 
486
 
 
487
// #define ScanTo(item) \
 
488
// do {\
 
489
//   buffer_rel_pos = bitstream_scan(buf->item_stream[item], buffer_rel_pos);\
 
490
//   while (buffer_rel_pos >= BUFFER_SIZE) {\
 
491
//      AdjustBufferEndForIncompleteSequences();\
 
492
//      Parser_Interface<W>::FinalizeBuffer_action();\
 
493
//      AdvanceBuffers();\
 
494
//      buffer_rel_pos = bitstream_scan(buf->item_stream[item], buffer_rel_pos);\
 
495
//   }\
 
496
// } while(0)
 
497
 
 
498
 
 
499
template <class B, WorkingCharacterSet W>
 
500
inline void ParsingEngine<B, W>::ScanTo(int item) {
 
501
        buffer_rel_pos = bitstream_scan(buf->item_stream[item], buffer_rel_pos);
 
502
        while (buffer_rel_pos >= BUFFER_SIZE) {
 
503
                AdjustBufferEndForIncompleteSequences();
 
504
                Parser_Interface<W>::FinalizeBuffer_action();
 
505
                AdvanceBuffers();
 
506
                buffer_rel_pos = bitstream_scan(buf->item_stream[item], buffer_rel_pos);
 
507
        }
 
508
}
 
509
 
 
510
template <class B, WorkingCharacterSet W>
 
511
inline void ParsingEngine<B, W>::ScanTextTo(int item) {
 
512
        text_or_markup_start = AbsPos();
 
513
        buffer_rel_pos = bitstream_scan(buf->item_stream[item], buffer_rel_pos);
 
514
        while (buffer_rel_pos >= BUFFER_SIZE) {
 
515
                AdjustBufferEndForIncompleteSequences();
 
516
                Text_action(GetCodeUnitPtr(text_or_markup_start), LengthFrom(text_or_markup_start), true);
 
517
                text_or_markup_start = AbsPos();
 
518
                Parser_Interface<W>::FinalizeBuffer_action();
 
519
                AdvanceBuffers();
 
520
                buffer_rel_pos = bitstream_scan(buf->item_stream[item], buffer_rel_pos);
 
521
        }
 
522
}
 
523
#endif
 
524
 
 
525
template <class B, WorkingCharacterSet W>
 
526
void ParsingEngine<B, W>::WF_Error (XML_Constraint errCode) {
 
527
        printf("Error at position %i in input.\n", AbsPos());
 
528
        ShowConstraintError(errCode);
 
529
        exit(-1);
 
530
//      Error_action(GetCodeUnitPtr(text_or_markup_start), LengthFrom(text_or_markup_start));
 
531
}
 
532
        
 
533
 
 
534
template <class B, WorkingCharacterSet W>
 
535
void ParsingEngine<B, W>::Validity_Error (XML_Constraint errCode) {
 
536
        printf("Error at position %i in input.\n", AbsPos());
 
537
        ShowConstraintError(errCode);
 
538
        exit(-1);
 
539
//      Error_action(GetCodeUnitPtr(text_or_markup_start), LengthFrom(text_or_markup_start));
 
540
}
 
541
        
 
542
template <class B, WorkingCharacterSet W>
 
543
void ParsingEngine<B, W>::Syntax_Error (XML_NonTerminal errNT) {
 
544
        printf("Error at position %i in input.\n", AbsPos());
 
545
        ShowSyntaxError(errNT);
 
546
        exit(-1);
 
547
//      Error_action(GetCodeUnitPtr(text_or_markup_start), LengthFrom(text_or_markup_start));
 
548
}
 
549
        
 
550
 
 
551
/* Parse a comment beginning "<!--" */
 
552
template <class B, WorkingCharacterSet W>
 
553
void ParsingEngine<B, W>::Parse_Comment() {
 
554
 
 
555
        Advance(4); /* Skip "<!--". */
 
556
        ScanTo(Hyphen);
 
557
        while (!at_DoubleHyphen<B::Base>(cur())) {
 
558
                if(at_EOF())
 
559
                        Syntax_Error(NT_CDSect);
 
560
                Advance(2); /* Skip hyphen-nonhyphen pair */
 
561
                ScanTo(Hyphen); 
 
562
        }
 
563
        if (at_Comment_End<B::Base>(cur())) {
 
564
                Advance(3); /* Skip "-->". */
 
565
                Comment_action(GetCodeUnitPtr(text_or_markup_start), LengthFrom(text_or_markup_start));
 
566
        }
 
567
        else {
 
568
                Advance(2);  /* "--" */
 
569
                Syntax_Error(NT_Comment);
 
570
        }
 
571
}
 
572
 
 
573
/* Parse an end tag beginning "</" */
 
574
template <class B, WorkingCharacterSet W>
 
575
inline void ParsingEngine<B, W>::Parse_EndTag() {
 
576
        Advance(2); /* Skip "</". */
 
577
        int nameID = Parse_Name();
 
578
        if (AtChar<B::Base,'>'>(cur())) {
 
579
                Advance(1);
 
580
                EndTag_action(GetCodeUnitPtr(text_or_markup_start), LengthFrom(text_or_markup_start));
 
581
        }
 
582
        else {
 
583
                ScanTo(NonWS);
 
584
                if (AtChar<B::Base,'>'>(cur())) {
 
585
                        Advance(1);
 
586
                        EndTag_action(GetCodeUnitPtr(text_or_markup_start), LengthFrom(text_or_markup_start));
 
587
                }
 
588
                else Syntax_Error(NT_ETag);
 
589
        }
 
590
}
 
591
 
 
592
/* Parse a CDATA section beginning "<![CDATA". */
 
593
template <class B, WorkingCharacterSet W>
 
594
void ParsingEngine<B, W>::Parse_CDATA() {
 
595
                Advance(8); /* Skip "<![CDATA". */
 
596
        if (!AtChar<B::Base,'['>(cur())) {
 
597
                Syntax_Error(NT_CDStart);
 
598
        }
 
599
        else {  
 
600
                Advance(1);
 
601
                CDATA_start_action(GetCodeUnitPtr(text_or_markup_start));
 
602
                text_or_markup_start = AbsPos();
 
603
                ScanTextTo(CD_End_check);
 
604
                while (!at_CDATA_End<B::Base>(cur())) {
 
605
                        if (at_EOF())
 
606
                                Syntax_Error(NT_CDSect);
 
607
                        Advance(1);
 
608
                        ScanTextTo(CD_End_check);
 
609
                }
 
610
                Text_action(GetCodeUnitPtr(text_or_markup_start), LengthFrom(text_or_markup_start), true);
 
611
                Advance(3); /* Skip "]]>". */
 
612
                CDATA_end_action(GetCodeUnitPtr(AbsPos()));
 
613
        }
 
614
}
 
615
 
 
616
template <class B, WorkingCharacterSet W>
 
617
void ParsingEngine<B, W>::Parse_EntityRef() {
 
618
    Advance(1);  // skip "&"
 
619
        int nameID = Parse_Name();  /* Name delimiter */
 
620
    if (!AtChar<B::Base,';'>(cur())) {
 
621
                Syntax_Error(NT_Reference);
 
622
    }
 
623
        else {
 
624
                Advance(1);
 
625
                Reference_action(GetCodeUnitPtr(text_or_markup_start), LengthFrom(text_or_markup_start));
 
626
                
 
627
                //      The following code will replace Reference_Action.
 
628
                GEntity_info * this_info;
 
629
                Parser_Interface<W> * entity_parser;
 
630
                int entityID = Parser_Interface<W>::model_info->GlobalGEntityTable[nameID]; 
 
631
                if (entityID == 0)
 
632
                        WF_Error(wfErr_wf_entdeclared);
 
633
                else{
 
634
                        this_info = Parser_Interface<W>::model_info->GEntityData[entityID-1];
 
635
                        if (this_info->is_external){
 
636
                                
 
637
                        if (Parser_Interface<W>::entity_Info->standalone != Standalone_no)
 
638
                                WF_Error(wfErr_NoExternalRefs);
 
639
                        else {
 
640
                                        entity_parser = ParserFactory(this_info->systemLiteral, Parser_Interface<W>::model_info);
 
641
                                        entity_parser->Parse_WF_Content();
 
642
                                        if(!entity_parser->at_EOF())
 
643
                                                Syntax_Error(NT_content);
 
644
                                        entity_parser->~Parser_Interface<W>();
 
645
                        }
 
646
                        }
 
647
                        else {
 
648
                                if (this_info->is_simple == true);
 
649
//                                      printf("Entity is %s\n",this_info->ReplacementText);
 
650
                                else{
 
651
//                                      printf("Not a simple text: %s\n",this_info->ReplacementText);
 
652
                                        entity_parser = ParserFactory(this_info->ReplacementText, strlen(this_info->ReplacementText),Parser_Interface<W>::entity_Info, Parser_Interface<W>::model_info);
 
653
                                        entity_parser->Parse_WF_Content();
 
654
                                        if(!entity_parser->at_EOF())
 
655
                                                Syntax_Error(NT_content);
 
656
                                        entity_parser->~Parser_Interface<W>();
 
657
                                }
 
658
                        }
 
659
                }
 
660
                
 
661
        }
 
662
}
 
663
 
 
664
template <class B, WorkingCharacterSet W>
 
665
void ParsingEngine<B, W>::Parse_EntityRef_inMixed(symbol_set_t elems) {
 
666
    Advance(1);  // skip "&"
 
667
        int nameID = Parse_Name();  /* Name delimiter */
 
668
    if (!AtChar<B::Base,';'>(cur())) {
 
669
                Syntax_Error(NT_Reference);
 
670
    }
 
671
        else {
 
672
                Advance(1);
 
673
                Reference_action(GetCodeUnitPtr(text_or_markup_start), LengthFrom(text_or_markup_start));
 
674
                
 
675
                //      The following code will replace Reference_Action.
 
676
                GEntity_info * this_info;
 
677
                Parser_Interface<W> * entity_parser;
 
678
                int entityID = Parser_Interface<W>::model_info->GlobalGEntityTable[nameID]; 
 
679
                if (entityID == 0)
 
680
                        WF_Error(wfErr_wf_entdeclared);
 
681
                else{
 
682
                        this_info = Parser_Interface<W>::model_info->GEntityData[entityID-1];
 
683
                        if (this_info->is_external){
 
684
                                
 
685
                        if (Parser_Interface<W>::entity_Info->standalone != Standalone_no)
 
686
                                WF_Error(wfErr_NoExternalRefs);
 
687
                        else {
 
688
                                        entity_parser = ParserFactory(this_info->systemLiteral, Parser_Interface<W>::model_info);
 
689
                                        entity_parser->Parse_MixedContent(elems);
 
690
                                        if(!entity_parser->at_EOF())
 
691
                                                Syntax_Error(NT_content);
 
692
                                        entity_parser->~Parser_Interface<W>();
 
693
                        }
 
694
                        }
 
695
                        else {
 
696
                                if (this_info->is_simple == true);
 
697
//                                      printf("Entity is %s\n",this_info->ReplacementText);
 
698
                                else{
 
699
//                                      printf("Not a simple text: %s\n",this_info->ReplacementText);
 
700
                                        entity_parser = ParserFactory(this_info->ReplacementText, strlen(this_info->ReplacementText),Parser_Interface<W>::entity_Info, Parser_Interface<W>::model_info);
 
701
                                        entity_parser->Parse_MixedContent(elems);
 
702
                                        if(!entity_parser->at_EOF())
 
703
                                                Syntax_Error(NT_content);
 
704
                                        entity_parser->~Parser_Interface<W>();
 
705
                                }
 
706
                        }
 
707
                }
 
708
                
 
709
        }
 
710
}
 
711
 
 
712
template <class B, WorkingCharacterSet W>
 
713
void ParsingEngine<B, W>::Parse_EntityRef_inAnyContent() {
 
714
    Advance(1);  // skip "&"
 
715
        int nameID = Parse_Name();  /* Name delimiter */
 
716
    if (!AtChar<B::Base,';'>(cur())) {
 
717
                Syntax_Error(NT_Reference);
 
718
    }
 
719
        else {
 
720
                Advance(1);
 
721
                Reference_action(GetCodeUnitPtr(text_or_markup_start), LengthFrom(text_or_markup_start));
 
722
                
 
723
                //      The following code will replace Reference_Action.
 
724
                GEntity_info * this_info;
 
725
                Parser_Interface<W> * entity_parser;
 
726
                int entityID = Parser_Interface<W>::model_info->GlobalGEntityTable[nameID]; 
 
727
                if (entityID == 0)
 
728
                        WF_Error(wfErr_wf_entdeclared);
 
729
                else{
 
730
                        this_info = Parser_Interface<W>::model_info->GEntityData[entityID-1];
 
731
                        if (this_info->is_external){
 
732
                                
 
733
                        if (Parser_Interface<W>::entity_Info->standalone != Standalone_no)
 
734
                                WF_Error(wfErr_NoExternalRefs);
 
735
                        else {
 
736
                                        entity_parser = ParserFactory(this_info->systemLiteral, Parser_Interface<W>::model_info);
 
737
                                        entity_parser->Parse_AnyContent();
 
738
                                        if(!entity_parser->at_EOF())
 
739
                                                Syntax_Error(NT_content);
 
740
                                        entity_parser->~Parser_Interface<W>();
 
741
                        }
 
742
                        }
 
743
                        else {
 
744
                                if (this_info->is_simple == true);
 
745
//                                      printf("Entity is %s\n",this_info->ReplacementText);
 
746
                                else{
 
747
//                                      printf("Not a simple text: %s\n",this_info->ReplacementText);
 
748
                                        entity_parser = ParserFactory(this_info->ReplacementText, strlen(this_info->ReplacementText),Parser_Interface<W>::entity_Info, Parser_Interface<W>::model_info);
 
749
                                        entity_parser->Parse_AnyContent();
 
750
                                        if(!entity_parser->at_EOF())
 
751
                                                Syntax_Error(NT_content);
 
752
                                        entity_parser->~Parser_Interface<W>();
 
753
                                }
 
754
                        }
 
755
                }
 
756
                
 
757
        }
 
758
}
 
759
 
 
760
template <class B, WorkingCharacterSet W>
 
761
void ParsingEngine<B, W>::Parse_ValidEntityRef(CM_RegExp * cre, int & cur_state) {
 
762
    Advance(1);  // skip "&"
 
763
        int nameID = Parse_Name();  /* Name delimiter */
 
764
    if (!AtChar<B::Base,';'>(cur())) {
 
765
                Syntax_Error(NT_Reference);
 
766
    }
 
767
        else {
 
768
                Advance(1);
 
769
                Reference_action(GetCodeUnitPtr(text_or_markup_start), LengthFrom(text_or_markup_start));
 
770
                
 
771
                //      The following code will replace Reference_Action.
 
772
                GEntity_info * this_info;
 
773
                Parser_Interface<W> * entity_parser;
 
774
                int entityID = Parser_Interface<W>::model_info->GlobalGEntityTable[nameID]; 
 
775
                if (entityID == 0)
 
776
                        WF_Error(wfErr_wf_entdeclared);
 
777
                else{
 
778
                        this_info = Parser_Interface<W>::model_info->GEntityData[entityID-1];
 
779
                        if (this_info->is_external){
 
780
                                
 
781
                        if (Parser_Interface<W>::entity_Info->standalone != Standalone_no)
 
782
                                WF_Error(wfErr_NoExternalRefs);
 
783
                        else {
 
784
                                        entity_parser = ParserFactory(this_info->systemLiteral, Parser_Interface<W>::model_info);
 
785
                                        entity_parser->Parse_ValidContent(cre, cur_state);
 
786
                                        if(!entity_parser->at_EOF())
 
787
                                                Syntax_Error(NT_content);
 
788
                                        entity_parser->~Parser_Interface<W>();
 
789
                        }
 
790
                        }
 
791
                        else {
 
792
                                if (this_info->is_simple == true);
 
793
//                                      printf("Entity is %s\n",this_info->ReplacementText);
 
794
                                else{
 
795
//                                      printf("Not a simple text: %s\n",this_info->ReplacementText);
 
796
                                        entity_parser = ParserFactory(this_info->ReplacementText, strlen(this_info->ReplacementText),Parser_Interface<W>::entity_Info, Parser_Interface<W>::model_info);
 
797
                                        entity_parser->Parse_ValidContent(cre, cur_state);
 
798
                                        if(!entity_parser->at_EOF())
 
799
                                                Syntax_Error(NT_content);
 
800
                                        entity_parser->~Parser_Interface<W>();
 
801
                                }
 
802
                        }
 
803
                }
 
804
                
 
805
        }
 
806
}
 
807
        
 
808
template <class B, WorkingCharacterSet W>
 
809
void ParsingEngine<B, W>::Parse_CharRef() {
 
810
        Advance(2);  // skip "&#"
 
811
        int ch_val = 0;
 
812
        if (AtChar<B::Base,'x'>(cur())) {
 
813
                Advance(1);
 
814
                while(at_HexDigit<B::Base>(cur())){
 
815
                        ch_val = HexVal<B::Base>(cur()[0]) + (ch_val<<4);
 
816
                        if (ch_val> 0x10FFFF )
 
817
                                WF_Error(wfErr_wf_Legalchar);
 
818
                        Advance(1);
 
819
                }
 
820
        }
 
821
        else {
 
822
                while(at_Digit<B::Base>(cur())){
 
823
                        ch_val = DigitVal<B::Base>(cur()[0]) + ch_val*10;
 
824
                        if (ch_val> 0x10FFFF )
 
825
                                WF_Error(wfErr_wf_Legalchar);
 
826
                        Advance(1);
 
827
                }
 
828
        }
 
829
        if ((ch_val == 0x0) || ((ch_val | 0x7FF) == 0xDFFF)|| ((ch_val | 0x1) == 0xFFFF))
 
830
                                WF_Error(wfErr_wf_Legalchar);    
 
831
                else  if (Parser_Interface<W>::entity_Info->version != XML_1_1)
 
832
                        if (((ch_val < 0x20) && (ch_val != 0x9) && (ch_val != 0xD) && (ch_val != 0xA)))
 
833
                                WF_Error(wfErr_wf_Legalchar); 
 
834
                                
 
835
        if (!AtChar<B::Base,';'>(cur())) {
 
836
                        Syntax_Error(NT_CharRef);
 
837
        }
 
838
                else {
 
839
                        Advance(1);
 
840
                        Reference_action(GetCodeUnitPtr(text_or_markup_start), LengthFrom(text_or_markup_start));
 
841
                }
 
842
}
 
843
 
 
844
template <class B, WorkingCharacterSet W>
 
845
void ParsingEngine<B, W>::Parse_PI (){
 
846
        int nameID;
 
847
        Advance(2); /* Skip "<?". */
 
848
        int target_start = AbsPos();
 
849
        if (at_XxMmLll<B::Base>(cur())) {
 
850
                nameID = Parse_Name();
 
851
                if (AbsPos() - target_start == 3) Syntax_Error(NT_PI);
 
852
        }
 
853
        else nameID = Parse_Name();
 
854
        PI_Target_action(GetCodeUnitPtr(target_start), LengthFrom(target_start));
 
855
        if (!at_PI_End<B::Base>(cur())) requireWS();
 
856
        ScanTo(QMark);
 
857
        while (!at_PI_End<B::Base>(cur())) {
 
858
                if(at_EOF())
 
859
                        Syntax_Error(NT_PI);
 
860
                Advance(1);
 
861
                ScanTo(QMark);
 
862
        }
 
863
        Advance(2); /* Skip "?>". */
 
864
        PI_action(GetCodeUnitPtr(text_or_markup_start), LengthFrom(text_or_markup_start));
 
865
}
 
866
 
 
867
/* Parse a start or empty element tag. */
 
868
template <class B, WorkingCharacterSet W>
 
869
inline void ParsingEngine<B, W>::Parse_StartTag (){
 
870
        int att_name_start;
 
871
        int att_val_start;
 
872
        int att_name_end, att_val_end;
 
873
        unsigned char quoteCh;
 
874
        Advance(1);
 
875
        int nameID = Parse_Name();  /* Name delimiter: WS, "/" or ">" */
 
876
        ElementName_action(GetCodeUnitPtr(text_or_markup_start+1), LengthFrom(text_or_markup_start+1));
 
877
        /* The following test optimizes the most common case of a
 
878
        start tag with no attributes.  */
 
879
        if (AtChar<B::Base,'>'>(cur())) {
 
880
                Advance(1);
 
881
                StartTag_action(GetCodeUnitPtr(text_or_markup_start), LengthFrom(text_or_markup_start));
 
882
        }
 
883
        else {
 
884
                ScanTo(NonWS);
 
885
                if (AtChar<B::Base,'>'>(cur())) {
 
886
                        Advance(1);
 
887
                        StartTag_action(GetCodeUnitPtr(text_or_markup_start), LengthFrom(text_or_markup_start));
 
888
                }
 
889
                else if (at_EmptyElementDelim<B::Base>(cur())) {
 
890
                        Advance(2);
 
891
                        EmptyElement_action(GetCodeUnitPtr(text_or_markup_start), LengthFrom(text_or_markup_start));
 
892
                }
 
893
                else do {
 
894
                        /* Must be an attribute-value pair or error. */
 
895
                        att_name_start = AbsPos();
 
896
                        int att_nameID = Parse_Name();
 
897
                        att_name_end = AbsPos();
 
898
                
 
899
                        int attID = Parser_Interface<W>::model_info->getOrInsertGlobalAttName(att_nameID);
 
900
                        if (attID >= LastAttOccurrence.size()) LastAttOccurrence.push_back(0);
 
901
                        else {
 
902
                                if (LastAttOccurrence[attID] > text_or_markup_start) {
 
903
                                        WF_Error(wfErr_uniqattspec); /* Duplicate attribute. */
 
904
                                        break;
 
905
                                }                       
 
906
                        }
 
907
                        LastAttOccurrence[attID] = att_name_start;
 
908
                        /* The following optimized tests handle the frequently occurring 
 
909
                        case that there are no blanks on either side of the equals sign.
 
910
                        In many cases, the very first test handles 100% of actual
 
911
                        attribute-value pairs encountered. */
 
912
                        if (at_EqualsQuote<B::Base>(cur())) Advance(1); 
 
913
                        else {
 
914
                                ScanTo(NonWS);
 
915
                                if (!AtChar<B::Base,'='>(cur())) {
 
916
                                        Syntax_Error(NT_STag); 
 
917
                                        break;
 
918
                                }
 
919
                                Advance(1);
 
920
                                ScanTo(NonWS);
 
921
                                if (!AtQuote<B::Base>(cur())) {
 
922
                                        Syntax_Error(NT_STag); 
 
923
                                        break;
 
924
                                }
 
925
                        }
 
926
                        att_val_start = AbsPos()+1;
 
927
                        Parse_AttValue();
 
928
                        att_val_end = AbsPos()-1;
 
929
                        if (at_xmlns<B::Base>(cur()+att_name_start-AbsPos())) {
 
930
                                Namespace_action(GetCodeUnitPtr(att_name_start), att_name_end - att_name_start,
 
931
                                                 GetCodeUnitPtr(att_val_start), att_val_end - att_val_start);
 
932
                        }
 
933
                        else {
 
934
                                AttributeValue_action(GetCodeUnitPtr(att_name_start), att_name_end - att_name_start,
 
935
                                                 GetCodeUnitPtr(att_val_start), att_val_end - att_val_start);
 
936
                        }
 
937
                        /* Now check for end or repeat. Avoid whitespace scan if possible.*/
 
938
                        if (AtChar<B::Base,'>'>(cur())) {
 
939
                                Advance(1);
 
940
                                StartTag_action(GetCodeUnitPtr(text_or_markup_start), LengthFrom(text_or_markup_start));
 
941
                                break;
 
942
                        }
 
943
                        else if (at_EmptyElementDelim<B::Base>(cur())) {
 
944
                                Advance(2);
 
945
                                EmptyElement_action(GetCodeUnitPtr(text_or_markup_start), LengthFrom(text_or_markup_start));
 
946
                                break;
 
947
                        }
 
948
                        ScanTo(NonWS);
 
949
                        if (AtChar<B::Base,'>'>(cur())) {
 
950
                                Advance(1);
 
951
                                StartTag_action(GetCodeUnitPtr(text_or_markup_start), LengthFrom(text_or_markup_start));
 
952
                                break;
 
953
                        }
 
954
                        else if (at_EmptyElementDelim<B::Base>(cur())) {
 
955
                                Advance(2);
 
956
                                EmptyElement_action(GetCodeUnitPtr(text_or_markup_start), LengthFrom(text_or_markup_start));
 
957
                                break;
 
958
                        }
 
959
                        else if (AbsPos() == att_val_end + 1) { 
 
960
                                /* No WS following att value */
 
961
                                Syntax_Error(NT_STag);
 
962
                                break;
 
963
                        }
 
964
                } while (1);
 
965
        }
 
966
}
 
967
 
 
968
template <class B, WorkingCharacterSet W>
 
969
inline void ParsingEngine<B, W>::text_if_nonnull_action(bool more){
 
970
        if (AbsPos() > text_or_markup_start) {
 
971
                Text_action(GetCodeUnitPtr(text_or_markup_start), LengthFrom(text_or_markup_start), more);
 
972
                text_or_markup_start = AbsPos();
 
973
        }
 
974
}
 
975
 
 
976
template <class B, WorkingCharacterSet W>
 
977
void ParsingEngine<B, W>::Parse_WF_EndTag(int nameID) {
 
978
        Advance(2);
 
979
        int end_nameID = Parse_Name();
 
980
        if(end_nameID != nameID)
 
981
                WF_Error(wfErr_GIMatch);
 
982
        if (AtChar<B::Base,'>'>(cur())) {
 
983
                Advance(1);
 
984
                Parser_Interface<W>::EndTag_action(GetCodeUnitPtr(text_or_markup_start), LengthFrom(text_or_markup_start));
 
985
        }
 
986
    else {
 
987
                ScanTo(NonWS);
 
988
                if (AtChar<B::Base,'>'>(cur())) {
 
989
                        Advance(1);
 
990
                        Parser_Interface<W>::EndTag_action(GetCodeUnitPtr(text_or_markup_start), LengthFrom(text_or_markup_start));
 
991
                }
 
992
                else Syntax_Error(NT_ETag);
 
993
    }
 
994
}
 
995
 
 
996
// template <>
 
997
// void ParsingEngine<UTF8_Buffer, UTF_8>::Parse_WF_EndTag(int nameID) {
 
998
//      Advance(2); /* Skip "</". */
 
999
//      
 
1000
//      int name_start = AbsPos();
 
1001
// //   ScanTo(NameFollow);
 
1002
// //   int lgth = AbsPos()-name_start;
 
1003
// 
 
1004
// #if (not defined(OMISSION)) or ((OMISSION != END_TAG_MATCHING)  and (OMISSION != NAME_LOOKUP))
 
1005
//      char * start_elem_name = Parser_Interface<UTF_8>::model_info->symbol_table->Get_UTF8_name(nameID);
 
1006
//      int lgth = Parser_Interface<UTF_8>::model_info->symbol_table->Get_UTF8_lgth(nameID);
 
1007
//      char * end_elem_name = &((char *) x8data)[buffer_rel_pos];
 
1008
//      
 
1009
// #ifdef TEMPLATED_SIMD_LIB    
 
1010
//      BytePack byte_compare =  simd<8>::eq(sisd_load_unaligned((BytePack *) end_elem_name),
 
1011
//                                                                 sisd_load_unaligned((BytePack *) start_elem_name));
 
1012
// #endif
 
1013
// #ifndef TEMPLATED_SIMD_LIB   
 
1014
//      BytePack byte_compare =  simd_eq_8(sisd_load_unaligned((BytePack *) end_elem_name),
 
1015
//                                                                 sisd_load_unaligned((BytePack *) start_elem_name));
 
1016
// #endif
 
1017
//      if (lgth < 16) {
 
1018
//              int expected_bits = ~(-1 << lgth);
 
1019
//          if ((_mm_movemask_epi8(byte_compare) & expected_bits) != expected_bits) {
 
1020
//                      WF_Error(wfErr_GIMatch);
 
1021
//          }
 
1022
//      }
 
1023
//      else {
 
1024
//          /* Must compare with bytes beyond the first 16.  Set up to
 
1025
//             compare 16 bytes at a time, with the first additional compare
 
1026
//             overlapping with the first byte_compare. */
 
1027
//          int pos = (lgth - 1) % PACKSIZE + 1;
 
1028
// #ifdef TEMPLATED_SIMD_LIB
 
1029
//          byte_compare =  simd_or(byte_compare, simd<8>::eq(sisd_load_unaligned((BytePack *) &end_elem_name[pos]),
 
1030
//                                                                                      sisd_load_unaligned((BytePack *) &start_elem_name[pos])));
 
1031
// #endif
 
1032
// #ifndef TEMPLATED_SIMD_LIB
 
1033
//          byte_compare =  simd_or(byte_compare, simd_eq_8(sisd_load_unaligned((BytePack *) &end_elem_name[pos]),
 
1034
//                                                                                      sisd_load_unaligned((BytePack *) &start_elem_name[pos])));
 
1035
// #endif
 
1036
//          pos += 16;
 
1037
//          while (pos < lgth) {
 
1038
//              if (_mm_movemask_epi8(byte_compare) != 0xFFFF) {
 
1039
//                      WF_Error(wfErr_GIMatch);
 
1040
//              }
 
1041
// #ifdef TEMPLATED_SIMD_LIB
 
1042
//              byte_compare =  simd<8>::eq(sisd_load_unaligned((BytePack *) &end_elem_name[pos]),
 
1043
//                                                sisd_load_unaligned((BytePack *) &start_elem_name[pos]));
 
1044
// #endif
 
1045
// #ifndef TEMPLATED_SIMD_LIB
 
1046
//              byte_compare =  simd_eq_8(sisd_load_unaligned((BytePack *) &end_elem_name[pos]),
 
1047
//                                                sisd_load_unaligned((BytePack *) &start_elem_name[pos]));
 
1048
// #endif
 
1049
//              pos += 16;
 
1050
//          }
 
1051
//          if (_mm_movemask_epi8(byte_compare) != 0xFFFF) {
 
1052
//                      WF_Error(wfErr_GIMatch);
 
1053
//          }
 
1054
//      }
 
1055
//      Advance(lgth);
 
1056
// 
 
1057
// #endif
 
1058
// #if defined(OMISSION) and ((OMISSION == END_TAG_MATCHING) or (OMISSION == NAME_LOOKUP))
 
1059
//      ScanTo(NameFollow);
 
1060
// #endif
 
1061
// //   for(int i=0; i<lgth; i++) {
 
1062
// //           if (start_elem_name[i] != end_elem_name[i])
 
1063
// //                   WF_Error(wfErr_GIMatch);
 
1064
// //   }
 
1065
// //   if (start_elem_name[lgth] != '\0') WF_Error(wfErr_GIMatch);
 
1066
// 
 
1067
//      if (AtChar<ASCII,'>'>(cur())) {
 
1068
//              Advance(1);
 
1069
//              Parser_Interface<UTF_8>::EndTag_action(GetCodeUnitPtr(text_or_markup_start), LengthFrom(text_or_markup_start));
 
1070
//      }
 
1071
//     else {
 
1072
//              ScanTo(NonWS);
 
1073
//              if (AtChar<ASCII,'>'>(cur())) {
 
1074
//                      Advance(1);
 
1075
//                      Parser_Interface<UTF_8>::EndTag_action(GetCodeUnitPtr(text_or_markup_start), LengthFrom(text_or_markup_start));
 
1076
//              }
 
1077
//              else Syntax_Error(NT_ETag);
 
1078
//     }
 
1079
// }
 
1080
 
 
1081
 
 
1082
/* Parse a valid start or empty element tag. */
 
1083
template <class B, WorkingCharacterSet W>
 
1084
int ParsingEngine<B, W>::Parse_WF_StartTag (bool& is_emptyStartTag){
 
1085
        int att_name_start;
 
1086
        int att_val_start;
 
1087
        int att_name_end, att_val_end;
 
1088
        unsigned char quoteCh;
 
1089
        Advance(1);
 
1090
        
 
1091
        #if (not defined(OMISSION)) or (OMISSION != NAME_LOOKUP)
 
1092
        int nameID = Parse_Name(); 
 
1093
        #endif
 
1094
        #if (defined(OMISSION)) and (OMISSION == NAME_LOOKUP)
 
1095
        ScanTo(NameFollow);
 
1096
        int nameID = 0;
 
1097
        #endif
 
1098
        ElementName_action(GetCodeUnitPtr(text_or_markup_start+1), LengthFrom(text_or_markup_start+1));
 
1099
        /* The following test optimizes the most common case of a
 
1100
        start tag with no attributes.  */
 
1101
        if (AtChar<B::Base,'>'>(cur())) {
 
1102
                Advance(1);
 
1103
                StartTag_action(GetCodeUnitPtr(text_or_markup_start), LengthFrom(text_or_markup_start));
 
1104
        }
 
1105
        else {
 
1106
                ScanTo(NonWS);
 
1107
                if (AtChar<B::Base,'>'>(cur())) {
 
1108
                        Advance(1);
 
1109
                        StartTag_action(GetCodeUnitPtr(text_or_markup_start), LengthFrom(text_or_markup_start));
 
1110
                }
 
1111
                else if (at_EmptyElementDelim<B::Base>(cur())) {
 
1112
                        Advance(2);
 
1113
                        is_emptyStartTag = true;
 
1114
                        EmptyElement_action(GetCodeUnitPtr(text_or_markup_start), LengthFrom(text_or_markup_start));
 
1115
                }
 
1116
                else do {
 
1117
                        /* Must be an attribute-value pair or error. */
 
1118
                        att_name_start = AbsPos();
 
1119
                        #if (not defined(OMISSION)) or (OMISSION != NAME_LOOKUP)
 
1120
                        int att_nameID = Parse_Name(); 
 
1121
                        #endif
 
1122
                        #if (defined(OMISSION)) and (OMISSION == NAME_LOOKUP)
 
1123
                        ScanTo(NameFollow);
 
1124
                        int att_nameID = 0;
 
1125
                        #endif
 
1126
            att_name_end = AbsPos();
 
1127
                #if (not defined(OMISSION)) or ((OMISSION != ATTRIBUTE_UNIQUENESS) and (OMISSION != NAME_LOOKUP))
 
1128
                        int attID = Parser_Interface<W>::model_info->getOrInsertGlobalAttName(att_nameID);
 
1129
                        if (attID >= LastAttOccurrence.size()) LastAttOccurrence.push_back(0);
 
1130
                        else {
 
1131
                                if (LastAttOccurrence[attID] > text_or_markup_start) {
 
1132
                                        WF_Error(wfErr_uniqattspec); /* Duplicate attribute. */
 
1133
                                        break;
 
1134
                                }                       
 
1135
                        }
 
1136
                        LastAttOccurrence[attID] = att_name_start;
 
1137
                 #endif
 
1138
                        /* The following optimized tests handle the frequently occurring 
 
1139
                        case that there are no blanks on either side of the equals sign.
 
1140
                        In many cases, the very first test handles 100% of actual
 
1141
                        attribute-value pairs encountered. */
 
1142
                        if (at_EqualsQuote<B::Base>(cur())) Advance(1); 
 
1143
                        else {
 
1144
                                ScanTo(NonWS);
 
1145
                                if (!AtChar<B::Base,'='>(cur())) {
 
1146
                                        Syntax_Error(NT_STag); 
 
1147
                                        break;
 
1148
                                }
 
1149
                                Advance(1);
 
1150
                                ScanTo(NonWS);
 
1151
                                if (!AtQuote<B::Base>(cur())) {
 
1152
                                        Syntax_Error(NT_STag); 
 
1153
                                        break;
 
1154
                                }
 
1155
                        }
 
1156
                        att_val_start = AbsPos()+1;
 
1157
                        Parse_AttValue();
 
1158
                        att_val_end = AbsPos()-1;
 
1159
                        if (at_xmlns<B::Base>(cur()+att_name_start-AbsPos())) {
 
1160
                                Namespace_action(GetCodeUnitPtr(att_name_start), att_name_end - att_name_start,
 
1161
                                                 GetCodeUnitPtr(att_val_start), att_val_end - att_val_start);
 
1162
                        }
 
1163
                        else {
 
1164
                                AttributeValue_action(GetCodeUnitPtr(att_name_start), att_name_end - att_name_start,
 
1165
                                                 GetCodeUnitPtr(att_val_start), att_val_end - att_val_start);
 
1166
                        }
 
1167
                        /* Now check for end or repeat. Avoid whitespace scan if possible.*/
 
1168
                        if (AtChar<B::Base,'>'>(cur())) {
 
1169
                                Advance(1);
 
1170
                                StartTag_action(GetCodeUnitPtr(text_or_markup_start), LengthFrom(text_or_markup_start));
 
1171
                                break;
 
1172
                        }
 
1173
                        else if (at_EmptyElementDelim<B::Base>(cur())) {
 
1174
                                Advance(2);
 
1175
                                is_emptyStartTag = true;        
 
1176
                                EmptyElement_action(GetCodeUnitPtr(text_or_markup_start), LengthFrom(text_or_markup_start));
 
1177
                                break;
 
1178
                        }
 
1179
                        ScanTo(NonWS);
 
1180
                        if (AtChar<B::Base,'>'>(cur())) {
 
1181
                                Advance(1);
 
1182
                                StartTag_action(GetCodeUnitPtr(text_or_markup_start), LengthFrom(text_or_markup_start));
 
1183
                                break;
 
1184
                        }
 
1185
                        else if (at_EmptyElementDelim<B::Base>(cur())) {
 
1186
                                Advance(2);
 
1187
                                is_emptyStartTag = true;
 
1188
                                EmptyElement_action(GetCodeUnitPtr(text_or_markup_start), LengthFrom(text_or_markup_start));
 
1189
                                break;
 
1190
                        }
 
1191
                        else if (AbsPos() == att_val_end + 1) { 
 
1192
                                /* No WS following att value */
 
1193
                                Syntax_Error(NT_STag);
 
1194
                                break;
 
1195
                        }
 
1196
                } while (1);
 
1197
        }
 
1198
        return nameID;
 
1199
}
 
1200
 
 
1201
 
 
1202
 
 
1203
template <class B, WorkingCharacterSet W>
 
1204
void ParsingEngine<B, W>::Parse_WF_Element() {
 
1205
        bool is_emptyStartTag = false;
 
1206
        int nameID = Parse_WF_StartTag(is_emptyStartTag);
 
1207
#ifdef DEBUG
 
1208
        printf("Parse_Element: nameID = %d, is_emptyStartTag=%i\n",nameID, is_emptyStartTag);
 
1209
#endif
 
1210
        if (!is_emptyStartTag) {
 
1211
                Parse_WF_Content();
 
1212
                Parse_WF_EndTag(nameID);
 
1213
        }
 
1214
}
 
1215
 
 
1216
 
 
1217
template <class B, WorkingCharacterSet W>
 
1218
void ParsingEngine<B, W>::Parse_WF_Content() {
 
1219
        do {
 
1220
                text_or_markup_start = AbsPos();
 
1221
                ScanTextTo(MarkupStart); /* '<', '&', or ']' for ']]>' test */
 
1222
                if (at_ElementTag_Start<B::Base>(cur())) {
 
1223
                        text_if_nonnull_action(false);
 
1224
                        Parse_WF_Element();
 
1225
                }
 
1226
                else if (at_EndTag_Start<B::Base>(cur())) {
 
1227
                        text_if_nonnull_action(false);
 
1228
                        return;
 
1229
                }
 
1230
                else if (at_Comment_Start<B::Base>(cur())) {
 
1231
                        text_if_nonnull_action(false);
 
1232
                        Parse_Comment();
 
1233
                }
 
1234
                else if (at_CharRef_Start<B::Base>(cur())) {
 
1235
                        text_if_nonnull_action(true);
 
1236
                        Parse_CharRef();
 
1237
                }
 
1238
                else if (AtChar<B::Base,'&'>(cur())) {
 
1239
                        text_if_nonnull_action(true);
 
1240
                        Parse_EntityRef();
 
1241
                }
 
1242
                else if (at_CDATA_Start<B::Base>(cur())) {
 
1243
                        text_if_nonnull_action(true);
 
1244
                        Parse_CDATA();
 
1245
                }
 
1246
                else if (at_PI_Start<B::Base>(cur())) {
 
1247
                        text_if_nonnull_action(false);
 
1248
                        Parse_PI();
 
1249
                }
 
1250
                else if (at_CDATA_End<B::Base>(cur())) {
 
1251
                        text_if_nonnull_action(true);
 
1252
                        Advance(3);
 
1253
                        Syntax_Error(NT_CharData);
 
1254
                }
 
1255
                else if (at_EOF()) {
 
1256
                        text_if_nonnull_action(false);
 
1257
                        return;
 
1258
                }
 
1259
                else if (AtChar<B::Base,'<'>(cur())) {
 
1260
                        Syntax_Error(NT_markupdecl);
 
1261
                }
 
1262
                else {
 
1263
                        Advance(1);
 
1264
                        continue;
 
1265
                }
 
1266
        } while (1);
 
1267
}
 
1268
 
 
1269
 
 
1270
#ifndef MARKUP_PASS_CONTROL
 
1271
#ifndef MARKUP_SORTING
 
1272
template <class B, WorkingCharacterSet W>
 
1273
void ParsingEngine<B, W>::ParseContent() {
 
1274
        Parser_Interface<W>::DocumentStart_action();    
 
1275
        bool is_emptyStartTag = false;
 
1276
        do {
 
1277
                text_or_markup_start = AbsPos();
 
1278
                ScanTextTo(MarkupStart); /* '<', '&', or ']' for ']]>' test */
 
1279
/*              if (AtChar<B::Base,'<'>(cur())) {
 
1280
                        text_if_nonnull_action();
 
1281
                        Parse_Markup<B, W>();
 
1282
                }*/
 
1283
                if (at_ElementTag_Start<B::Base>(cur())) {
 
1284
                        text_if_nonnull_action(false);
 
1285
                        Parse_StartTag();
 
1286
                }
 
1287
                else if (at_EndTag_Start<B::Base>(cur())) {
 
1288
                        text_if_nonnull_action(false);
 
1289
                        Parse_EndTag();
 
1290
                }
 
1291
                else if (at_Comment_Start<B::Base>(cur())) {
 
1292
                        text_if_nonnull_action(false);
 
1293
                        Parse_Comment();
 
1294
                }
 
1295
                else if (at_CharRef_Start<B::Base>(cur())) {
 
1296
                        text_if_nonnull_action(true);
 
1297
                        Parse_CharRef();
 
1298
                }
 
1299
                else if (AtChar<B::Base,'&'>(cur())) {
 
1300
                        text_if_nonnull_action(true);
 
1301
                        Parse_EntityRef();
 
1302
                }
 
1303
                else if (at_CDATA_Start<B::Base>(cur())) {
 
1304
                        text_if_nonnull_action(true);
 
1305
                        Parse_CDATA();
 
1306
                }
 
1307
                else if (at_PI_Start<B::Base>(cur())) {
 
1308
                        text_if_nonnull_action(false);
 
1309
                        Parse_PI();
 
1310
                }
 
1311
                else if (at_CDATA_End<B::Base>(cur())) {
 
1312
                        text_if_nonnull_action(true);
 
1313
                        Advance(3);
 
1314
                        Syntax_Error(NT_CharData);
 
1315
                }
 
1316
                else if (at_EOF()) {
 
1317
                        text_if_nonnull_action(false);
 
1318
                        break;
 
1319
                }
 
1320
                else if (AtChar<B::Base,'<'>(cur())) {
 
1321
                        Syntax_Error(NT_markupdecl);
 
1322
                }
 
1323
                else {
 
1324
                        Advance(1);
 
1325
                        continue;
 
1326
                }
 
1327
        } while (1);
 
1328
        Parser_Interface<W>::DocumentEnd_action();      
 
1329
}
 
1330
#endif
 
1331
#endif
 
1332
 
 
1333
template <class B, WorkingCharacterSet W>
 
1334
void ParsingEngine<B, W>::Parse_DocType (){
 
1335
 
 
1336
        int old_abspos, start_pos;
 
1337
        ScanTo(NonWS);
 
1338
        start_pos = AbsPos();
 
1339
        
 
1340
        if (at_DOCTYPE_start<B::Base>(cur()))
 
1341
        Advance(9);
 
1342
        else{
 
1343
//              printf("No Document definition!\n");
 
1344
                return;
 
1345
        }
 
1346
        requireWS();
 
1347
        int nameID = Parse_Name();
 
1348
 
 
1349
        old_abspos = AbsPos();  
 
1350
    ScanTo(NonWS);
 
1351
    if(at_SYSTEM<B::Base>(cur())||at_PUBLIC<B::Base>(cur())){
 
1352
        Parser_Interface<W>::model_info->has_external_DTD = true;
 
1353
        if(old_abspos == AbsPos())
 
1354
                Syntax_Error(NT_doctypedecl);
 
1355
        Parse_ExternalID(Parser_Interface<W>::model_info->external_DTD_systemLiteral, Parser_Interface<W>::model_info->external_DTD_pubidLiteral);
 
1356
        Parser_Interface<W> * entity_parser;
 
1357
        entity_parser = ParserFactory(Parser_Interface<W>::model_info->external_DTD_systemLiteral, Parser_Interface<W>::model_info);
 
1358
                entity_parser->Parse_ExtSubsetDecl();
 
1359
                entity_parser->~Parser_Interface<W>();
 
1360
    }
 
1361
    else Parser_Interface<W>::model_info->has_external_DTD = false;
 
1362
    ScanTo(NonWS);      
 
1363
 
 
1364
        if (AtChar<B::Base,'['>(cur())){
 
1365
                Advance(1);
 
1366
                Parse_IntSubset();
 
1367
                if (AtChar<B::Base,']'>(cur()))
 
1368
                        Advance(1);
 
1369
                else
 
1370
                Syntax_Error(NT_doctypedecl);
 
1371
                ScanTo(NonWS);
 
1372
        }
 
1373
        
 
1374
        if (AtChar<B::Base,'>'>(cur())){
 
1375
                Advance(1);  
 
1376
 
 
1377
                CRE_Seq * rslt = new CRE_Seq();
 
1378
                rslt->subCMs.push_back(new CRE_Name(nameID));
 
1379
                CM_RegExp * cre = new CM_RegExp();
 
1380
                cre->content_re = rslt;         
 
1381
                
 
1382
                int id_count = cre->content_re->Set_IDs(0);
 
1383
                cre->content_re->Set_First_Map();               
 
1384
                symbol_set_t * transition_map = new symbol_set_t[id_count+1];
 
1385
                cre->content_re->follow_map[0] = id_count+1;
 
1386
                
 
1387
                cre->content_re->Set_Follow_Map(transition_map);
 
1388
                transition_map[0] = cre->content_re->first_map;
 
1389
                if (cre->content_re->matches_empty)
 
1390
                        transition_map[0][0]=id_count+1;
 
1391
                        
 
1392
                cre -> transition_map = transition_map;
 
1393
                
 
1394
                Parser_Interface<W>::model_info->rootModel = cre;
 
1395
                
 
1396
                /* Check for notations that were used, but not defined by the end of the DTD. */
 
1397
                #if (VALIDATION_MODE == ON)
 
1398
                hash_map<int, int >::iterator j;
 
1399
                for (j=Parser_Interface<W>::model_info->GlobalNotationTable.begin(); j!=Parser_Interface<W>::model_info->GlobalNotationTable.end(); j++) {
 
1400
                        if (j->second == -1)
 
1401
                                Validity_Error(vErr_notatn);
 
1402
                }
 
1403
                #endif
 
1404
        }
 
1405
        else
 
1406
                Syntax_Error(NT_doctypedecl);   
 
1407
}
 
1408
 
 
1409
template <class B, WorkingCharacterSet W>
 
1410
void ParsingEngine<B, W>::Parse_ExternalID (char *& systemLiteral, char *& pubidLiteral){
 
1411
        int quot_start, lgth;
 
1412
        if(at_SYSTEM<B::Base>(cur())){
 
1413
                Advance(6);
 
1414
                pubidLiteral = NULL;
 
1415
                requireWS();
 
1416
                if (!AtQuote<B::Base>(cur())) Syntax_Error(NT_ExternalID);
 
1417
                quot_start = AbsPos()+1;
 
1418
                Parse_SystemLiteral (); /*  SystemLiteral */
 
1419
                lgth = AbsPos() - quot_start - 1;                       
 
1420
                systemLiteral = copy_string(GetCodeUnitPtr(quot_start),lgth);
 
1421
        }
 
1422
        else if (at_PUBLIC<B::Base>(cur())){
 
1423
                Advance(6);
 
1424
                requireWS();
 
1425
                if (!AtQuote<B::Base>(cur())) Syntax_Error(NT_ExternalID);
 
1426
                quot_start = AbsPos()+1;
 
1427
                Parse_PubidLiteral ();/*  PubidLiteral */
 
1428
                lgth = AbsPos() - quot_start - 1;                       
 
1429
                pubidLiteral = copy_string(GetCodeUnitPtr(quot_start),lgth);
 
1430
                systemLiteral = NULL;
 
1431
                if (AtChar<B::Base, '>'>(cur())) return;
 
1432
                requireWS();
 
1433
                if (AtQuote<B::Base>(cur())) {
 
1434
                        quot_start = AbsPos()+1;        
 
1435
                        Parse_SystemLiteral ();/*  SystemLiteral */
 
1436
                        lgth = AbsPos() - quot_start - 1;                       
 
1437
                        systemLiteral = copy_string(GetCodeUnitPtr(quot_start),lgth);
 
1438
                }
 
1439
        }
 
1440
        else
 
1441
                Syntax_Error(NT_ExternalID); 
 
1442
}
 
1443
 
 
1444
template <class B, WorkingCharacterSet W>
 
1445
void ParsingEngine<B, W>::Parse_SystemLiteral (){
 
1446
        unsigned char quoteCh;
 
1447
        if(AtQuote<B::Base>(cur())){
 
1448
                quoteCh = cur()[0];
 
1449
                Advance(1);
 
1450
        }       
 
1451
        ScanTo(Quote);                  
 
1452
        while (cur()[0] != quoteCh){
 
1453
                if(at_EOF())
 
1454
                        Syntax_Error(NT_SystemLiteral);
 
1455
                Advance(1);
 
1456
                ScanTo(Quote);
 
1457
        }
 
1458
        Advance(1);
 
1459
}
 
1460
 
 
1461
template <class B, WorkingCharacterSet W>
 
1462
void ParsingEngine<B, W>::Parse_PubidLiteral (){
 
1463
        unsigned char quoteCh;
 
1464
        quoteCh = cur()[0];
 
1465
        Advance(1);
 
1466
        while (at_PubidChar<B::Base>(cur()) && (cur()[0] != quoteCh)) {
 
1467
                Advance(1);
 
1468
        }
 
1469
        if (cur()[0] != quoteCh){
 
1470
                Syntax_Error(NT_PubidLiteral);
 
1471
        }
 
1472
        Advance(1);
 
1473
}
 
1474
 
 
1475
template <class B, WorkingCharacterSet W>
 
1476
void ParsingEngine<B, W>::Parse_IntSubset (){
 
1477
        
 
1478
        while(1){
 
1479
                ScanTo(NonWS);  
 
1480
                text_or_markup_start = AbsPos();
 
1481
                if (AtChar<B::Base,'%'>(cur()))
 
1482
                        Parse_PEReference();    
 
1483
                else if (at_PI_Start<B::Base>(cur())) {
 
1484
                        Parse_PI();
 
1485
                }
 
1486
                else if (at_Comment_Start<B::Base>(cur())) {
 
1487
                        Parse_Comment();
 
1488
                }
 
1489
                else if (AtChar<B::Base,'<'>(cur())){
 
1490
                        Advance(1);
 
1491
                        if(AtChar<B::Base,'!'>(cur())){
 
1492
                                Advance(1);
 
1493
                                if (at_ELEMENT<B::Base>(cur()))
 
1494
                                        Parse_Elementdecl();
 
1495
                                else if (at_ATTLIST<B::Base>(cur()))
 
1496
                                        Parse_AttlistDecl();
 
1497
                                else if (at_ENTITY<B::Base>(cur()))
 
1498
                                        Parse_Entitydecl();
 
1499
                                else if (at_NOTATION<B::Base>(cur()))
 
1500
                                        Parse_Notationdecl();
 
1501
                                else {
 
1502
                                        Syntax_Error(NT_markupdecl);            
 
1503
                                }                                                               
 
1504
                        }
 
1505
                        else
 
1506
                                Syntax_Error(NT_markupdecl); 
 
1507
                }
 
1508
                else if (AtChar<B::Base,']'>(cur())){
 
1509
                        break;
 
1510
                }
 
1511
                else
 
1512
                        Syntax_Error(NT_intSubset); 
 
1513
        }
 
1514
}
 
1515
 
 
1516
 
 
1517
template <class B, WorkingCharacterSet W>
 
1518
void ParsingEngine<B, W>::Parse_PEReference (){
 
1519
 
 
1520
        Advance(1); /* Skip "%". */
 
1521
        fprintf(stderr,"Parameter Reference has not been completed yet.\n");
 
1522
        exit(-1);
 
1523
        int nameID = Parse_Name(); 
 
1524
        if (AtChar<B::Base,';'>(cur())) {
 
1525
                Advance(1);
 
1526
                PEReference_action(GetCodeUnitPtr(text_or_markup_start), LengthFrom(text_or_markup_start));
 
1527
                PEntity_info * this_info;
 
1528
                Parser_Interface<W> * entity_parser;
 
1529
                int entityID = Parser_Interface<W>::model_info->GlobalPEntityTable[nameID]; 
 
1530
                if (entityID == 0)
 
1531
                        WF_Error(wfErr_wf_entdeclared);
 
1532
                else{
 
1533
                        this_info = Parser_Interface<W>::model_info->PEntityData[entityID-1];
 
1534
                        if (this_info->is_external){
 
1535
                                
 
1536
//                      if (Parser_Interface<W>::entity_Info->standalone != Standalone_no)
 
1537
//                              WF_Error(wfErr_NoExternalRefs);
 
1538
//                      else {
 
1539
                                        entity_parser = ParserFactory(this_info->systemLiteral, Parser_Interface<W>::model_info);
 
1540
                                        entity_parser->Parse_WF_Content();
 
1541
                                        if(!entity_parser->at_EOF())
 
1542
                                                Syntax_Error(NT_content);
 
1543
                                        entity_parser->~Parser_Interface<W>();
 
1544
//                      }
 
1545
                        }
 
1546
                        else {
 
1547
                        }
 
1548
                }
 
1549
        }
 
1550
        else
 
1551
                Syntax_Error(NT_PEReference);
 
1552
}
 
1553
 
 
1554
 
 
1555
template <class B, WorkingCharacterSet W>
 
1556
void ParsingEngine<B, W>::Parse_Elementdecl (){
 
1557
 
 
1558
        Advance(7); /* Skip "<!ELEMENT". */
 
1559
 
 
1560
    requireWS();
 
1561
        int nameID = Parse_Name();
 
1562
        int elemID = Parser_Interface<W>::model_info->getOrInsertGlobalElement(nameID);
 
1563
 
 
1564
        requireWS();
 
1565
        ContentModel * cm;
 
1566
        /* Start parsing "contentspec"*/
 
1567
        if (at_EMPTY<B::Base>(cur())) {
 
1568
        Advance(5);
 
1569
        cm = new CM_Empty();
 
1570
        Parser_Interface<W>::model_info->ContentModelData[nameID] = cm;
 
1571
        }
 
1572
    else if (at_ANY<B::Base>(cur())) {
 
1573
        Advance(3);
 
1574
        cm = new CM_Any();
 
1575
        Parser_Interface<W>::model_info->ContentModelData[nameID] = cm;
 
1576
    }
 
1577
    else {
 
1578
        if (AtChar<B::Base,'('>(cur()))
 
1579
                        Advance(1);
 
1580
                ScanTo(NonWS);
 
1581
                if (at_PCDATA<B::Base>(cur())){
 
1582
                        cm = Parse_RemainingMixed();
 
1583
                        Parser_Interface<W>::model_info->ContentModelData[nameID] = cm;
 
1584
                }
 
1585
                else{
 
1586
 
 
1587
                        CM_RegExp * cre = new CM_RegExp;
 
1588
                        cre->content_re = Parse_RemainingChildren();
 
1589
 
 
1590
                        int id_count = cre->content_re->Set_IDs(0);
 
1591
                        cre->content_re->Set_First_Map();       
 
1592
                        symbol_set_t * transition_map = new symbol_set_t[id_count+1];
 
1593
                        cre->content_re->follow_map[0] = id_count+1;
 
1594
                        
 
1595
                        cre->content_re->Set_Follow_Map(transition_map);
 
1596
                        transition_map[0] = cre->content_re->first_map;
 
1597
                        
 
1598
                        if (cre->content_re->matches_empty)
 
1599
                                transition_map[0][0]=id_count+1;
 
1600
                                
 
1601
                        cre -> transition_map = transition_map;
 
1602
                        
 
1603
                        Parser_Interface<W>::model_info->ContentModelData[nameID] = cre;
 
1604
                        cm = cre;
 
1605
                }                       
 
1606
    }
 
1607
    ScanTo(NonWS);    
 
1608
 
 
1609
        if (AtChar<B::Base,'>'>(cur())) {
 
1610
                Advance(1);
 
1611
        }
 
1612
        else
 
1613
                Syntax_Error(NT_elementdecl);
 
1614
}
 
1615
template <class B, WorkingCharacterSet W>
 
1616
ContentModel * ParsingEngine<B, W>::Parse_RemainingMixed (){
 
1617
        CM_Mixed * r = new CM_Mixed();
 
1618
        Advance(7);  /* Skip "#PCDATA". */
 
1619
    
 
1620
    if (AtChar<B::Base,')'>(cur())){
 
1621
        if (AtChar<B::Base,'*'>(cur())) {
 
1622
                Advance(2);
 
1623
                }
 
1624
                else {
 
1625
                        Advance(1);
 
1626
                }
 
1627
    }
 
1628
    else{
 
1629
        ScanTo(NonWS);
 
1630
        int k = 0;
 
1631
        while (AtChar<B::Base,'|'>(cur())){
 
1632
                        Advance(1);
 
1633
                        ScanTo(NonWS);
 
1634
                        int nameID = Parse_Name();
 
1635
                        r->elements[nameID] = ++k;
 
1636
                        ScanTo(NonWS);
 
1637
                }
 
1638
                if (at_Para_star<B::Base>(cur())) Advance(2);
 
1639
                else {
 
1640
                        Syntax_Error(NT_Mixed);
 
1641
                        exit(-1);
 
1642
        }
 
1643
    }
 
1644
    return r;
 
1645
}
 
1646
 
 
1647
 
 
1648
template <class B, WorkingCharacterSet W>
 
1649
Content_RE * ParsingEngine<B, W>::Parse_RemainingChildren (){
 
1650
        Content_RE * c1 = Parse_Cp();
 
1651
        Content_RE * r = c1;
 
1652
        ScanTo(NonWS);
 
1653
        if(AtChar<B::Base,'|'>(cur())){
 
1654
                CRE_Choice * rslt = new CRE_Choice;
 
1655
                rslt->subCMs.push_back(c1);
 
1656
                Advance(1);
 
1657
                ScanTo(NonWS);
 
1658
                rslt->subCMs.push_back(Parse_Cp());
 
1659
                ScanTo(NonWS);
 
1660
                while(!AtChar<B::Base,')'>(cur())){
 
1661
                        if(AtChar<B::Base,'|'>(cur()))
 
1662
                                Advance(1);
 
1663
                        else
 
1664
                                Syntax_Error(NT_children);
 
1665
                        ScanTo(NonWS);
 
1666
                        rslt->subCMs.push_back(Parse_Cp());
 
1667
                        ScanTo(NonWS);
 
1668
                }
 
1669
                Advance(1);
 
1670
                rslt->Compile();
 
1671
                r = rslt;
 
1672
        }
 
1673
        else if(AtChar<B::Base,','>(cur())){
 
1674
                CRE_Seq * rslt = new CRE_Seq;
 
1675
                rslt->subCMs.push_back(c1);
 
1676
                Advance(1);
 
1677
                ScanTo(NonWS);
 
1678
                rslt->subCMs.push_back(Parse_Cp());
 
1679
                ScanTo(NonWS);
 
1680
                while(!AtChar<B::Base,')'>(cur())){
 
1681
                        if(AtChar<B::Base,','>(cur()))
 
1682
                                Advance(1);
 
1683
                        else
 
1684
                                Syntax_Error(NT_children);
 
1685
                        ScanTo(NonWS);
 
1686
                        rslt->subCMs.push_back(Parse_Cp());
 
1687
                        ScanTo(NonWS);
 
1688
                }
 
1689
                Advance(1);
 
1690
                rslt->Compile();
 
1691
                r = rslt;
 
1692
        }       
 
1693
        else if(AtChar<B::Base,')'>(cur())){
 
1694
                Advance(1);
 
1695
        }
 
1696
        else
 
1697
                Syntax_Error(NT_children);
 
1698
                
 
1699
        if (AtChar<B::Base,'?'>(cur())) {
 
1700
                Advance(1);
 
1701
                r = new CRE_Opt(r);
 
1702
        }
 
1703
        else if (AtChar<B::Base,'*'>(cur())) {
 
1704
                Advance(1);
 
1705
                r = new CRE_Star(r);
 
1706
        }
 
1707
        else if (AtChar<B::Base,'+'>(cur())) {
 
1708
                Advance(1);
 
1709
                r = new CRE_Plus(r);
 
1710
        }
 
1711
 
 
1712
        return r;
 
1713
}
 
1714
 
 
1715
template <class B, WorkingCharacterSet W>
 
1716
Content_RE * ParsingEngine<B, W>::Parse_Cp (){
 
1717
        if (AtChar<B::Base,'('>(cur())){
 
1718
                Advance(1);
 
1719
                ScanTo(NonWS);
 
1720
                Parse_RemainingChildren();
 
1721
        }
 
1722
        else{
 
1723
                int nameID = Parse_Name();
 
1724
                CRE_Name * r = new CRE_Name(nameID);
 
1725
 
 
1726
                if (AtChar<B::Base,'?'>(cur())) {
 
1727
                        Advance(1);
 
1728
                        return new CRE_Opt(r);
 
1729
                }
 
1730
                else if (AtChar<B::Base,'*'>(cur())) {
 
1731
                        Advance(1);
 
1732
                        return new CRE_Star(r);
 
1733
                }
 
1734
                else if (AtChar<B::Base,'+'>(cur())) {
 
1735
                        Advance(1);
 
1736
                        return new CRE_Plus(r);
 
1737
                }
 
1738
                else return r;
 
1739
        }
 
1740
}
 
1741
 
 
1742
template <class B, WorkingCharacterSet W>
 
1743
void ParsingEngine<B, W>::Parse_AttlistDecl (){
 
1744
        
 
1745
        int old_abspos;
 
1746
        
 
1747
        int name_start;
 
1748
        int lgth;
 
1749
        
 
1750
        int elemID;
 
1751
        int attID;
 
1752
        
 
1753
        Advance(7); /* Skip "ATTLIST. */
 
1754
        requireWS();
 
1755
        
 
1756
        int nameID = Parse_Name();
 
1757
        elemID = Parser_Interface<W>::model_info->getOrInsertGlobalElement(nameID);
 
1758
        
 
1759
        old_abspos = AbsPos();
 
1760
        ScanTo(NonWS);
 
1761
        while(!AtChar<B::Base,'>'>(cur())) {
 
1762
                if(old_abspos == AbsPos())
 
1763
                Syntax_Error(NT_AttlistDecl);
 
1764
                
 
1765
                int att_nameID = Parse_Name();
 
1766
                
 
1767
                attID = Parser_Interface<W>::model_info->getOrInsertGlobalAttName(att_nameID);
 
1768
                if (attID >= LastAttOccurrence.size()) LastAttOccurrence.push_back(0);
 
1769
        ATT_info * this_info = new ATT_info;
 
1770
        this_info->globalATT_id = attID;
 
1771
        requireWS();
 
1772
        if (at_CDATA<B::Base>(cur())){
 
1773
                Advance(5);
 
1774
                this_info->attType = CDATA_att;
 
1775
        }
 
1776
        else if(at_ID<B::Base>(cur())){
 
1777
                Advance(2);
 
1778
                this_info->attType = ID_att;
 
1779
        }
 
1780
        /* Make sure to check IDREFS before IDREF*/
 
1781
        else if(at_IDREFS<B::Base>(cur())){
 
1782
                Advance(6);
 
1783
                this_info->attType = IDREFS_att;
 
1784
        }
 
1785
        else if(at_IDREF<B::Base>(cur())){
 
1786
                Advance(5);
 
1787
                this_info->attType = IDREF_att;
 
1788
        }
 
1789
        else if(at_ENTITY<B::Base>(cur())){
 
1790
                Advance(6);
 
1791
                this_info->attType = ENTITY_att;
 
1792
        }
 
1793
        else if(at_ENTITIES<B::Base>(cur())){
 
1794
                Advance(8);
 
1795
                this_info->attType = ENTITIES_att;
 
1796
        }
 
1797
        /* Make sure to check NMTOKENS before NMTOKEN*/
 
1798
        else if(at_NMTOKENS<B::Base>(cur())){
 
1799
                Advance(8);
 
1800
                this_info->attType = NMTOKENS_att;
 
1801
        }
 
1802
        else if(at_NMTOKEN<B::Base>(cur())){
 
1803
                Advance(7);
 
1804
                this_info->attType = NMTOKEN_att;
 
1805
        }
 
1806
        else if(at_NOTATION<B::Base>(cur())){ /* NotationType = 'NOTATION' S Enumeration
 
1807
                                                                         when Nmtoken = Name */
 
1808
                Advance(8);
 
1809
                        requireWS();
 
1810
                Parse_Notation(this_info);
 
1811
                this_info->attType = NOTATION_att;
 
1812
        }
 
1813
        else if(AtChar<B::Base,'('>(cur())){
 
1814
                Parse_Enumeration(this_info);
 
1815
                this_info->attType = enumeration_att;
 
1816
        }
 
1817
        else
 
1818
                Syntax_Error(NT_AttlistDecl);
 
1819
        requireWS();
 
1820
        Parse_DefaultDecl(this_info);
 
1821
 
 
1822
                ScanTo(NonWS);
 
1823
                Parser_Interface<W>::model_info->ElementAttributeData[elemID].push_back(this_info);
 
1824
        }
 
1825
 
 
1826
        Advance(1);
 
1827
}
 
1828
 
 
1829
template <class B, WorkingCharacterSet W>
 
1830
void ParsingEngine<B, W>::Parse_Notation (ATT_info * this_info){
 
1831
 
 
1832
        if(AtChar<B::Base,'('>(cur()))
 
1833
                Advance(1);
 
1834
        else
 
1835
                Syntax_Error(NT_NotationType);
 
1836
        ScanTo(NonWS);
 
1837
        
 
1838
    int notn_nameID = Parse_Name();
 
1839
 
 
1840
        /*Notation name is not in the global table!*/
 
1841
        if(Parser_Interface<W>::model_info->GlobalNotationTable[notn_nameID]==0)
 
1842
                Parser_Interface<W>::model_info->GlobalNotationTable[notn_nameID] = -1;
 
1843
        
 
1844
        ScanTo(NonWS);
 
1845
        while(AtChar<B::Base,'|'>(cur())){
 
1846
                Advance(1);
 
1847
                ScanTo(NonWS);  
 
1848
                notn_nameID = Parse_Name();
 
1849
                        
 
1850
                if(Parser_Interface<W>::model_info->GlobalNotationTable[notn_nameID]==0)
 
1851
//                      Validity_Error(vErr_notatn);
 
1852
                        Parser_Interface<W>::model_info->GlobalNotationTable[notn_nameID] = -1;
 
1853
                        
 
1854
                ScanTo(NonWS);
 
1855
        }
 
1856
        if (AtChar<B::Base,')'>(cur())) 
 
1857
                Advance(1);
 
1858
        else
 
1859
                Syntax_Error(NT_NotationType);
 
1860
}
 
1861
 
 
1862
template <class B, WorkingCharacterSet W>
 
1863
void ParsingEngine<B, W>::Parse_Enumeration (ATT_info * this_info){
 
1864
 
 
1865
        int enumCount=0;
 
1866
        if(AtChar<B::Base,'('>(cur()))
 
1867
                Advance(1);
 
1868
        else
 
1869
                Syntax_Error(NT_Enumeration);
 
1870
        ScanTo(NonWS);
 
1871
        
 
1872
        int nmtokenID = Parse_Nmtoken();
 
1873
        
 
1874
        this_info->enumValues[nmtokenID]=++(enumCount);
 
1875
        
 
1876
        ScanTo(NonWS);
 
1877
        while(AtChar<B::Base,'|'>(cur())){
 
1878
                Advance(1);
 
1879
                ScanTo(NonWS);  
 
1880
                int nmtokenID = Parse_Nmtoken();
 
1881
        
 
1882
                int enumID = this_info->enumValues[nmtokenID];
 
1883
                if(enumID==0){  
 
1884
                        this_info->enumValues[nmtokenID]=++(enumCount);
 
1885
                        enumID = enumCount;
 
1886
                }
 
1887
                else if(!StrictWellFormedness){
 
1888
                        Validity_Error(vErr_NoDuplicateTokens);
 
1889
                }
 
1890
                ScanTo(NonWS);
 
1891
        }
 
1892
        if (AtChar<B::Base,')'>(cur())) 
 
1893
                Advance(1);
 
1894
        else
 
1895
                Syntax_Error(NT_Enumeration);
 
1896
}
 
1897
 
 
1898
template <class B, WorkingCharacterSet W>
 
1899
void ParsingEngine<B, W>::Parse_DefaultDecl (ATT_info * this_info){
 
1900
        if(at_REQUIRED<B::Base>(cur())){
 
1901
                Advance(9);
 
1902
                this_info->defaultKind = REQUIRED_att;
 
1903
        }
 
1904
        else if(at_IMPLIED<B::Base>(cur())){
 
1905
                Advance(8);
 
1906
                this_info->defaultKind = IMPLIED_att;
 
1907
        }
 
1908
        else {
 
1909
                if(at_FIXED<B::Base>(cur())){
 
1910
                        Advance(6);
 
1911
                        requireWS();
 
1912
                        this_info->defaultKind = FIXED_att;
 
1913
                }
 
1914
                else this_info->defaultKind = DEFAULT_att;
 
1915
                if(AtQuote<B::Base>(cur())){
 
1916
                        int quot_start = AbsPos()+1;
 
1917
                        Parse_AttValue();
 
1918
                        /* need to normalize */
 
1919
                        this_info->defaultValueLgth = AbsPos() - quot_start - 1;
 
1920
                        
 
1921
                        this_info->defaultValue = new unsigned char[this_info->defaultValueLgth+1];
 
1922
                        memcpy(this_info->defaultValue, GetCodeUnitPtr(quot_start),this_info->defaultValueLgth); 
 
1923
                        this_info->defaultValue[this_info->defaultValueLgth] = '\0';
 
1924
                        }
 
1925
                else
 
1926
                        Syntax_Error(NT_DefaultDecl);
 
1927
        }
 
1928
}
 
1929
 
 
1930
template <class B, WorkingCharacterSet W>
 
1931
void ParsingEngine<B, W>::Parse_Entitydecl (){
 
1932
        
 
1933
        int name_start;
 
1934
        int quot_start;
 
1935
        int lgth;
 
1936
        int old_abspos;
 
1937
        char * s;
 
1938
        
 
1939
        Advance(6); /* Skip "ENTITY. */
 
1940
        requireWS();
 
1941
        
 
1942
        if (AtChar<B::Base,'%'>(cur())){
 
1943
                Advance(1);
 
1944
                requireWS();
 
1945
                
 
1946
                int nameID = Parse_Name();
 
1947
                PEntity_info * this_info = new PEntity_info;
 
1948
                int entityID = Parser_Interface<W>::model_info->GlobalPEntityTable[nameID];
 
1949
                if(entityID==0){        
 
1950
                        Parser_Interface<W>::model_info->GlobalPEntityTable[nameID]=++(Parser_Interface<W>::model_info->globalPEntityCount);
 
1951
                        entityID = Parser_Interface<W>::model_info->globalPEntityCount;
 
1952
                        this_info->globalPEntity_id = entityID;
 
1953
                }
 
1954
                else
 
1955
                        printf("Warning: Entity definition already exist!\n");
 
1956
        
 
1957
                requireWS();
 
1958
                if(AtQuote<B::Base>(cur())){
 
1959
                Parse_PEntityValue(this_info);
 
1960
                this_info->is_external = false;
 
1961
        }
 
1962
        else {
 
1963
                Parse_ExternalID(this_info->systemLiteral, this_info->pubidLiteral);
 
1964
                this_info->is_external = true;
 
1965
                if (this_info->systemLiteral == NULL) Syntax_Error(NT_EntityDecl);
 
1966
        }
 
1967
        Parser_Interface<W>::model_info->PEntityData.push_back(this_info);
 
1968
        }
 
1969
        else{
 
1970
                int nameID = Parse_Name();
 
1971
        
 
1972
                GEntity_info * this_info = new GEntity_info();
 
1973
                int entityID = Parser_Interface<W>::model_info->GlobalGEntityTable[nameID];
 
1974
                if(entityID==0){        
 
1975
                        Parser_Interface<W>::model_info->GlobalGEntityTable[nameID]=++(Parser_Interface<W>::model_info->globalGEntityCount);
 
1976
                        entityID = Parser_Interface<W>::model_info->globalGEntityCount;
 
1977
                        this_info->globalGEntity_id = entityID;
 
1978
                }
 
1979
                else
 
1980
                        printf("Warning: Entity definition already exists!\n");
 
1981
                        
 
1982
                requireWS();
 
1983
                
 
1984
                if(AtQuote<B::Base>(cur())){
 
1985
                Parse_GEntityValue(this_info);                  
 
1986
                this_info->is_external = false;
 
1987
        }
 
1988
        else {
 
1989
                Parse_ExternalID(this_info->systemLiteral, this_info->pubidLiteral);
 
1990
                this_info->is_external = true;
 
1991
                if (this_info->systemLiteral == NULL) Syntax_Error(NT_EntityDecl);
 
1992
                        old_abspos = AbsPos();
 
1993
                        ScanTo(NonWS);
 
1994
                if(at_NDATA<B::Base>(cur())){
 
1995
                        if(old_abspos == AbsPos())
 
1996
                                Syntax_Error(NT_EntityDecl);
 
1997
                        else
 
1998
                                Advance(5);
 
1999
                        requireWS();
 
2000
                        name_start = AbsPos();
 
2001
                        int nameID = Parse_Name();
 
2002
                        lgth = AbsPos() - name_start;
 
2003
                                this_info->NDataName = copy_string(GetCodeUnitPtr(name_start),lgth);
 
2004
                }
 
2005
                }
 
2006
        Parser_Interface<W>::model_info->GEntityData.push_back(this_info);
 
2007
        }
 
2008
        ScanTo(NonWS);
 
2009
        if (AtChar<B::Base,'>'>(cur())){
 
2010
                Advance(1);
 
2011
        }
 
2012
        else
 
2013
                Syntax_Error(NT_EntityDecl);
 
2014
}
 
2015
 
 
2016
template <class B, WorkingCharacterSet W>
 
2017
void ParsingEngine<B, W>::Parse_Notationdecl (){
 
2018
 
 
2019
        int old_abspos;
 
2020
        Advance(8); /* Skip "NOTATION. */
 
2021
        requireWS();
 
2022
        
 
2023
        int nameID = Parse_Name();
 
2024
 
 
2025
        int notationID = Parser_Interface<W>::model_info->GlobalNotationTable[nameID];
 
2026
        /* notationID == -1: used but not yet defined; == 0: new, > 0 prev. defined */
 
2027
        if(notationID <= 0){    
 
2028
                Parser_Interface<W>::model_info->GlobalNotationTable[nameID]=++(Parser_Interface<W>::model_info->globalNotationCount);
 
2029
                notationID = Parser_Interface<W>::model_info->globalNotationCount;
 
2030
        }
 
2031
        else /*Duplicate notation name!*/
 
2032
                Validity_Error(vErr_NoDuplicateTokens);
 
2033
        Notation_info * this_info = new Notation_info;
 
2034
        ScanTo(NonWS);          
 
2035
    Parse_ExternalID(this_info->systemLiteral, this_info->pubidLiteral);
 
2036
        ScanTo(NonWS);
 
2037
        if (AtChar<B::Base,'>'>(cur())) {
 
2038
                Advance(1);
 
2039
        }
 
2040
        else
 
2041
                Syntax_Error(NT_NotationDecl);
 
2042
}
 
2043
 
 
2044
template <class B, WorkingCharacterSet W>
 
2045
void ParsingEngine<B, W>::requireWS(){
 
2046
        
 
2047
    int old_abspos = AbsPos();  
 
2048
    ScanTo(NonWS);
 
2049
    if(old_abspos == AbsPos())
 
2050
        Syntax_Error(NT_S);
 
2051
}
 
2052
 
 
2053
template <class B, WorkingCharacterSet W>
 
2054
void ParsingEngine<B, W>::Parse_AttValue(){
 
2055
        
 
2056
        int     quoteCh = cur()[0];
 
2057
        Advance(1); /* Skip " or ' */
 
2058
 
 
2059
        ScanTo(Quote);                  
 
2060
        while (cur()[0] != quoteCh){
 
2061
                if (at_CharRef_Start<B::Base>(cur())){
 
2062
                        Parse_CharRef();
 
2063
                        ScanTo(Quote);
 
2064
                }
 
2065
                else if (AtChar<B::Base,'&'>(cur())){
 
2066
                        Parse_EntityRef();
 
2067
                        ScanTo(Quote);
 
2068
                }
 
2069
                else if (AtQuote<B::Base>(cur())) {
 
2070
                        Advance(1);
 
2071
                        ScanTo(Quote);
 
2072
                }
 
2073
                else /* if (AtChar<B::Base,'<'>(cur())) */
 
2074
                        WF_Error(wfErr_CleanAttrVals);
 
2075
        }
 
2076
        Advance(1);
 
2077
}
 
2078
 
 
2079
template <class B, WorkingCharacterSet W>
 
2080
void ParsingEngine<B, W>::Parse_GEntityValue(GEntity_info * this_info){
 
2081
        
 
2082
        int     quoteCh = cur()[0];
 
2083
        Advance(1); /* Skip " or ' */
 
2084
        this_info->is_simple = true;
 
2085
        int quot_start = AbsPos();
 
2086
        char * replText;
 
2087
        ScanTo(Quote);          
 
2088
        replText = copy_string(GetCodeUnitPtr(quot_start),AbsPos()-quot_start);
 
2089
        while (cur()[0] != quoteCh){
 
2090
                if (at_CharRef_Start<B::Base>(cur())){
 
2091
                        strcat (replText,Replace_CharRef());
 
2092
                        quot_start = AbsPos();
 
2093
                        ScanTo(Quote);
 
2094
                }
 
2095
                else if (AtQuote<B::Base>(cur())) {
 
2096
                        quot_start = AbsPos();
 
2097
                        Advance(1);
 
2098
                        ScanTo(Quote);
 
2099
                }
 
2100
                else if (at_EOF()) {
 
2101
                        Syntax_Error(NT_EntityValue);
 
2102
                }
 
2103
                else { /* '<' or '&' found */
 
2104
                        quot_start = AbsPos();
 
2105
                        Advance(1);
 
2106
                        ScanTo(Quote);
 
2107
                        this_info->is_simple = false;                   
 
2108
                }
 
2109
                replText = cat_string (replText,(char *)GetCodeUnitPtr(quot_start), strlen(replText), AbsPos()-quot_start);
 
2110
        }
 
2111
        this_info->ReplacementText = replText;
 
2112
        Advance(1);
 
2113
}
 
2114
 
 
2115
template <class B, WorkingCharacterSet W>
 
2116
char * ParsingEngine<B, W>::Replace_EntityRef(bool& is_simple){
 
2117
        Advance(1);
 
2118
        int nameID = Parse_Name(); 
 
2119
        if (AtChar<B::Base,';'>(cur()))
 
2120
                Advance(1);
 
2121
        else
 
2122
                Syntax_Error(NT_EntityValue);
 
2123
        int entityID = Parser_Interface<W>::model_info->GlobalGEntityTable[nameID]; 
 
2124
        if (entityID == 0)
 
2125
                WF_Error(wfErr_wf_entdeclared);
 
2126
        else{
 
2127
                if (Parser_Interface<W>::model_info->GEntityData[entityID-1]->is_simple == false)
 
2128
                        is_simple = false;
 
2129
                return Parser_Interface<W>::model_info->GEntityData[entityID-1]->ReplacementText;
 
2130
        }
 
2131
        
 
2132
}
 
2133
 
 
2134
template <class B, WorkingCharacterSet W>
 
2135
void ParsingEngine<B, W>::Parse_PEntityValue(PEntity_info * this_info){
 
2136
        fprintf(stderr,"parsing of parameter entity value has not been completed yet.\n");
 
2137
        exit(-1);
 
2138
}
 
2139
 
 
2140
template <class B, WorkingCharacterSet W>
 
2141
char * ParsingEngine<B, W>::Replace_CharRef(){
 
2142
        Advance(2);
 
2143
        fprintf(stderr,"Replacement of Character Reference has not been completed yet.\n");
 
2144
        exit(-1);
 
2145
}
 
2146
 
 
2147
template <class B, WorkingCharacterSet W>
 
2148
void ParsingEngine<B, W>::Parse_Prolog(){
 
2149
        ScanTo(NonWS);
 
2150
        int old_pos = AbsPos();
 
2151
        while (!at_DOCTYPE_start<B::Base>(cur())) {             
 
2152
                text_or_markup_start = AbsPos();
 
2153
                if (at_Comment_Start<B::Base>(cur())) 
 
2154
                        Parse_Comment();
 
2155
                else if (at_PI_Start<B::Base>(cur()))
 
2156
                                Parse_PI();
 
2157
                else{
 
2158
                        Prolog_action(GetCodeUnitPtr(old_pos), LengthFrom(old_pos));
 
2159
                        return;
 
2160
                }
 
2161
                ScanTo(NonWS);
 
2162
        }
 
2163
        Parse_DocType();
 
2164
        ScanTo(NonWS);
 
2165
        while(at_Comment_Start<B::Base>(cur()) || at_PI_Start<B::Base>(cur()) ){                
 
2166
                text_or_markup_start = AbsPos();
 
2167
                if (at_Comment_Start<B::Base>(cur()))
 
2168
                        Parse_Comment();
 
2169
                else 
 
2170
                        Parse_PI();
 
2171
                ScanTo(NonWS);
 
2172
        }
 
2173
        Prolog_action(GetCodeUnitPtr(old_pos), LengthFrom(old_pos));
 
2174
}
 
2175
 
 
2176
template <class B, WorkingCharacterSet W>
 
2177
void ParsingEngine<B, W>::Parse_ExtSubsetDecl() {
 
2178
        ScanTo(NonWS);
 
2179
        int start_pos=AbsPos();
 
2180
        while(!at_EOF()){
 
2181
                if(at_condSect_start<B::Base>(cur())){          
 
2182
                        Advance(3);
 
2183
                        ScanTo(NonWS);
 
2184
                        if (at_INCLUDE<B::Base>(cur())){
 
2185
                                Advance(7);
 
2186
                                ScanTo(NonWS);
 
2187
                                if(AtChar<B::Base,'['>(cur())){
 
2188
                                        Advance(1);
 
2189
                                        Parse_ExtSubsetDecl();
 
2190
                                        if(at_CDATA_End<B::Base>(cur()))
 
2191
                                                Advance(3);
 
2192
                                        else Syntax_Error(NT_includeSect);
 
2193
                                }
 
2194
                                else Syntax_Error(NT_includeSect);
 
2195
                        }
 
2196
                        else if (at_IGNORE<B::Base>(cur())){
 
2197
                                Advance(6);
 
2198
                                ScanTo(NonWS);          
 
2199
                                if(AtChar<B::Base,'['>(cur())){
 
2200
                                        int section_depth=1;
 
2201
                                        Advance(1);
 
2202
                                        while(!at_EOF()){
 
2203
                                                ScanTextTo(MarkupStart);
 
2204
                                                if(at_condSect_start<B::Base>(cur())){
 
2205
                                                        Advance(3);
 
2206
                                                        section_depth++;
 
2207
                                                }
 
2208
                                                else if(at_CDATA_End<B::Base>(cur())){
 
2209
                                                        Advance(3);
 
2210
                                                        section_depth--;
 
2211
                                                }
 
2212
                                                else
 
2213
                                                        Advance(1);
 
2214
                                                if(section_depth==0) return;                                    
 
2215
                                        }
 
2216
                                        Syntax_Error(NT_ignoreSectContents);    
 
2217
                                }
 
2218
                                else Syntax_Error(NT_ignoreSect);
 
2219
                        }
 
2220
                        else Syntax_Error(NT_conditionalSect);
 
2221
                }
 
2222
                else if (AtChar<B::Base,'%'>(cur()))
 
2223
                        Parse_PEReference();    
 
2224
                else if (at_PI_Start<B::Base>(cur())) {
 
2225
                        Parse_PI();
 
2226
                }
 
2227
                else if (at_Comment_Start<B::Base>(cur())) {
 
2228
                        Parse_Comment();
 
2229
                }
 
2230
                else if (AtChar<B::Base,'<'>(cur())){
 
2231
                        Advance(1);
 
2232
 
 
2233
                        if(AtChar<B::Base,'!'>(cur())){
 
2234
                                Advance(1);
 
2235
                                if(at_ELEMENT<B::Base>(cur()))
 
2236
                                        Parse_Elementdecl();
 
2237
                                else if(at_ATTLIST<B::Base>(cur()))
 
2238
                                        Parse_AttlistDecl();
 
2239
                                else if(at_ENTITY<B::Base>(cur()))
 
2240
                                        Parse_Entitydecl();
 
2241
                                else if(at_NOTATION<B::Base>(cur()))
 
2242
                                        Parse_Notationdecl();                                   
 
2243
                                else{
 
2244
                                        Syntax_Error(NT_markupdecl);    
 
2245
                                }                                                               
 
2246
                        }
 
2247
                        else
 
2248
                                Syntax_Error(NT_markupdecl); 
 
2249
                }
 
2250
                else
 
2251
                        Syntax_Error(NT_extSubsetDecl); 
 
2252
                ScanTo(NonWS);
 
2253
        }
 
2254
        ExtSubsetDecl_action(GetCodeUnitPtr(start_pos), LengthFrom(start_pos));
 
2255
}
 
2256
 
 
2257
/* Parse a valid start or empty element tag. */
 
2258
template <class B, WorkingCharacterSet W>
 
2259
inline int ParsingEngine<B, W>::Parse_ValidStartTag (bool& is_emptyStartTag){
 
2260
        int att_name_start;
 
2261
        int att_val_start;
 
2262
        int att_name_end, att_val_end;
 
2263
        unsigned char quoteCh;
 
2264
        Advance(1);
 
2265
 
 
2266
        int nameID = Parse_Name();  
 
2267
        int elemID = Parser_Interface<W>::model_info->GlobalElementTable[nameID];
 
2268
        if(elemID==0)
 
2269
                        Validity_Error(vErr_elementvalid);
 
2270
        
 
2271
        ElementName_action(GetCodeUnitPtr(text_or_markup_start+1), LengthFrom(text_or_markup_start+1));
 
2272
        /* The following test optimizes the most common case of a
 
2273
        start tag with no attributes.  */
 
2274
        if (AtChar<B::Base,'>'>(cur())) {
 
2275
                Advance(1);
 
2276
                StartTag_action(GetCodeUnitPtr(text_or_markup_start), LengthFrom(text_or_markup_start));
 
2277
        }
 
2278
        else {
 
2279
                ScanTo(NonWS);
 
2280
                if (AtChar<B::Base,'>'>(cur())) {
 
2281
                        Advance(1);
 
2282
                        StartTag_action(GetCodeUnitPtr(text_or_markup_start), LengthFrom(text_or_markup_start));
 
2283
                }
 
2284
                else if (at_EmptyElementDelim<B::Base>(cur())) {
 
2285
                        Advance(2);
 
2286
                        is_emptyStartTag = true;
 
2287
                        EmptyElement_action(GetCodeUnitPtr(text_or_markup_start), LengthFrom(text_or_markup_start));
 
2288
                }
 
2289
                else do {
 
2290
                        /* Must be an attribute-value pair or error. */
 
2291
                        att_name_start = AbsPos();
 
2292
                        int att_nameID = Parse_Name();
 
2293
                        #if (not defined(OMISSION)) or (OMISSION != ATTRIBUTE_UNIQUENESS) 
 
2294
                        int attID = Parser_Interface<W>::model_info->getOrInsertGlobalAttName(att_nameID);
 
2295
                        if (attID >= LastAttOccurrence.size()) LastAttOccurrence.push_back(0);
 
2296
                        else {
 
2297
                                if (LastAttOccurrence[attID] > text_or_markup_start) {
 
2298
                                        WF_Error(wfErr_uniqattspec); /* Duplicate attribute. */
 
2299
                                        break;
 
2300
                                }                       
 
2301
                        }
 
2302
                        LastAttOccurrence[attID] = att_name_start;
 
2303
                        #endif
 
2304
                        /* The following optimized tests handle the frequently occurring 
 
2305
                        case that there are no blanks on either side of the equals sign.
 
2306
                        In many cases, the very first test handles 100% of actual
 
2307
                        attribute-value pairs encountered. */
 
2308
                        if (at_EqualsQuote<B::Base>(cur())) Advance(1); 
 
2309
                        else {
 
2310
                                ScanTo(NonWS);
 
2311
                                if (!AtChar<B::Base,'='>(cur())) {
 
2312
                                        Syntax_Error(NT_STag); 
 
2313
                                        break;
 
2314
                                }
 
2315
                                Advance(1); 
 
2316
                                ScanTo(NonWS);
 
2317
                                if (!AtQuote<B::Base>(cur())) {
 
2318
                                        Syntax_Error(NT_STag); 
 
2319
                                        break;
 
2320
                                }
 
2321
                        }
 
2322
                        att_val_start = AbsPos()+1;
 
2323
                        Parse_AttValue();
 
2324
                        att_val_end = AbsPos()-1;
 
2325
                        if (at_xmlns<B::Base>(cur()+att_name_start-AbsPos())) {
 
2326
                                Namespace_action(GetCodeUnitPtr(att_name_start), att_name_end - att_name_start,
 
2327
                                                 GetCodeUnitPtr(att_val_start), att_val_end - att_val_start);
 
2328
                        }
 
2329
                        else {
 
2330
                                AttributeValue_action(GetCodeUnitPtr(att_name_start), att_name_end - att_name_start,
 
2331
                                                 GetCodeUnitPtr(att_val_start), att_val_end - att_val_start);
 
2332
                        }
 
2333
                        /* Now check for end or repeat. Avoid whitespace scan if possible.*/
 
2334
                        if (AtChar<B::Base,'>'>(cur())) {
 
2335
                                Advance(1);
 
2336
                                StartTag_action(GetCodeUnitPtr(text_or_markup_start), LengthFrom(text_or_markup_start));
 
2337
                                break;
 
2338
                        }
 
2339
                        else if (at_EmptyElementDelim<B::Base>(cur())) {
 
2340
                                Advance(2);
 
2341
                                is_emptyStartTag = true;        
 
2342
                                EmptyElement_action(GetCodeUnitPtr(text_or_markup_start), LengthFrom(text_or_markup_start));
 
2343
                                break;
 
2344
                        }
 
2345
                        ScanTo(NonWS);
 
2346
                        if (AtChar<B::Base,'>'>(cur())) {
 
2347
                                Advance(1);
 
2348
                                StartTag_action(GetCodeUnitPtr(text_or_markup_start), LengthFrom(text_or_markup_start));
 
2349
                                break;
 
2350
                        }
 
2351
                        else if (at_EmptyElementDelim<B::Base>(cur())) {
 
2352
                                Advance(2);
 
2353
                                is_emptyStartTag = true;
 
2354
                                EmptyElement_action(GetCodeUnitPtr(text_or_markup_start), LengthFrom(text_or_markup_start));
 
2355
                                break;
 
2356
                        }
 
2357
                        else if (AbsPos() == att_val_end + 1) { 
 
2358
                                /* No WS following att value */
 
2359
                                Syntax_Error(NT_STag);
 
2360
                                break;
 
2361
                        }
 
2362
                } while (1);
 
2363
        }
 
2364
        return nameID;
 
2365
}
 
2366
 
 
2367
template <class B, WorkingCharacterSet W>
 
2368
int ParsingEngine<B, W>::Parse_ValidElement() {
 
2369
        bool is_emptyStartTag = false;
 
2370
        int nameID = Parse_ValidStartTag(is_emptyStartTag);
 
2371
#ifdef DEBUG
 
2372
        printf("Parse_ValidElement: nameID = %d, name = %s, is_emptyStartTag=%i\n",nameID, Parser_Interface<W>::model_info->symbol_table->Get_UTF8_name(nameID), is_emptyStartTag);
 
2373
#endif
 
2374
        ContentModel * cm = Parser_Interface<W>::model_info->ContentModelData[nameID];
 
2375
        switch (cm->cm_type) {
 
2376
                case cm_Empty:
 
2377
                        if (!is_emptyStartTag) {
 
2378
                                if (at_EndTag_Start<B::Base>(cur())) {
 
2379
                                        Parse_WF_EndTag(nameID);
 
2380
                                }
 
2381
                                else {
 
2382
                                        Validity_Error(vErr_elementvalid);
 
2383
                                }
 
2384
                        }
 
2385
                        break;
 
2386
                case cm_Any:            
 
2387
                        if (!is_emptyStartTag) {
 
2388
                                Parse_AnyContent();
 
2389
                                Parse_WF_EndTag(nameID);
 
2390
                        }
 
2391
                        break;
 
2392
                case cm_Mixed:          
 
2393
                        if (!is_emptyStartTag) {
 
2394
                                Parse_MixedContent(((CM_Mixed *) cm)->elements);
 
2395
                                Parse_WF_EndTag(nameID);
 
2396
                        }
 
2397
                        break;
 
2398
                case cm_RegExp:
 
2399
                        CM_RegExp * cre = (CM_RegExp *) cm;
 
2400
                        int content_state = 0;
 
2401
                        if (!is_emptyStartTag) {
 
2402
                                Parse_ValidContent(cre, content_state);
 
2403
                                #ifdef DEBUG
 
2404
                                printf("Final content_state = %i, nameID = %i\n", content_state, nameID);
 
2405
                                #endif
 
2406
                                Parse_WF_EndTag(nameID);                
 
2407
                        }
 
2408
                        if (cre->transition_map[content_state][0]==0) {
 
2409
                                Validity_Error(vErr_elementvalid);
 
2410
                        }
 
2411
        }
 
2412
        return nameID;
 
2413
}
 
2414
 
 
2415
template <class B, WorkingCharacterSet W>
 
2416
void ParsingEngine<B, W>::Parse_ValidContent(CM_RegExp * cre, int & cur_state) {
 
2417
        do {
 
2418
                ScanTo(NonWS);
 
2419
                /* If non-null report WS  WS_action()? */
 
2420
                text_or_markup_start = AbsPos();
 
2421
                if (at_EndTag_Start<B::Base>(cur())) {
 
2422
                        break;
 
2423
                }
 
2424
                else if (at_ElementTag_Start<B::Base>(cur())) {
 
2425
                        int nameID = Parse_ValidElement();
 
2426
#ifdef DEBUG
 
2427
                        printf("Content model state transition %i", cur_state);
 
2428
#endif
 
2429
                        cur_state = cre->transition_map[cur_state][nameID];
 
2430
#ifdef DEBUG
 
2431
                        printf("-> %i\n", cur_state);
 
2432
#endif
 
2433
                }
 
2434
                else if (at_Comment_Start<B::Base>(cur())) {
 
2435
                        Parse_Comment();
 
2436
                }
 
2437
                else if (at_PI_Start<B::Base>(cur())) {
 
2438
                        Parse_PI();
 
2439
                }
 
2440
                else if (AtChar<B::Base,'&'>(cur())) {
 
2441
                        Parse_ValidEntityRef(cre, cur_state);
 
2442
#ifdef DEBUG
 
2443
                        printf("EntityRef complete, cur_state = %i\n", cur_state);
 
2444
#endif
 
2445
                        
 
2446
                }
 
2447
                else if (at_EOF()) {
 
2448
                        break;
 
2449
                }
 
2450
                else if (AtChar<B::Base,'<'>(cur())) {
 
2451
                        Syntax_Error(NT_markupdecl);
 
2452
                }
 
2453
                else {
 
2454
                        Validity_Error(vErr_elementvalid);
 
2455
                }
 
2456
        } while(1);
 
2457
}
 
2458
 
 
2459
 
 
2460
template <class B, WorkingCharacterSet W>
 
2461
void ParsingEngine<B, W>::Parse_AnyContent() {
 
2462
        do {
 
2463
                text_or_markup_start = AbsPos();
 
2464
                ScanTextTo(MarkupStart); /* '<', '&', or ']' for ']]>' test */
 
2465
                if (at_ElementTag_Start<B::Base>(cur())) {
 
2466
                        text_if_nonnull_action(false);
 
2467
                        int nameID = Parse_ValidElement();
 
2468
                }
 
2469
                else if (at_EndTag_Start<B::Base>(cur())) {
 
2470
                        text_if_nonnull_action(false);
 
2471
                        return;
 
2472
                }
 
2473
                else if (at_Comment_Start<B::Base>(cur())) {
 
2474
                        text_if_nonnull_action(false);
 
2475
                        Parse_Comment();
 
2476
                }
 
2477
                else if (at_CharRef_Start<B::Base>(cur())) {
 
2478
                        text_if_nonnull_action(true);
 
2479
                        Parse_CharRef();
 
2480
                }
 
2481
                else if (AtChar<B::Base,'&'>(cur())) {
 
2482
                        text_if_nonnull_action(true);
 
2483
                        Parse_EntityRef_inAnyContent();
 
2484
                }
 
2485
                else if (at_CDATA_Start<B::Base>(cur())) {
 
2486
                        text_if_nonnull_action(true);
 
2487
                        Parse_CDATA();
 
2488
                }
 
2489
                else if (at_PI_Start<B::Base>(cur())) {
 
2490
                        text_if_nonnull_action(false);
 
2491
                        Parse_PI();
 
2492
                }
 
2493
                else if (at_CDATA_End<B::Base>(cur())) {
 
2494
                        text_if_nonnull_action(true);
 
2495
                        Advance(3);
 
2496
                        Syntax_Error(NT_CharData);
 
2497
                }
 
2498
                else if (at_EOF()) {
 
2499
                        text_if_nonnull_action(false);
 
2500
                        return;
 
2501
                }
 
2502
                else if (AtChar<B::Base,'<'>(cur())) {
 
2503
                        Syntax_Error(NT_markupdecl);
 
2504
                }
 
2505
                else {
 
2506
                        Advance(1);
 
2507
                        continue;
 
2508
                }
 
2509
        } while (1);
 
2510
}
 
2511
template <class B, WorkingCharacterSet W>
 
2512
void ParsingEngine<B, W>::Parse_MixedContent(symbol_set_t elems) {
 
2513
        do {
 
2514
                text_or_markup_start = AbsPos();
 
2515
                ScanTextTo(MarkupStart); /* '<', '&', or ']' for ']]>' test */
 
2516
/*              if (AtChar<B::Base,'<'>(cur())) {
 
2517
                        text_if_nonnull_action();
 
2518
                        Parse_Markup<B, W>();
 
2519
                }*/
 
2520
                if (at_ElementTag_Start<B::Base>(cur())) {
 
2521
                        text_if_nonnull_action(false);
 
2522
                        int nameID = Parse_ValidElement();
 
2523
                        if (elems[nameID] == 0) {
 
2524
                                Validity_Error(vErr_elementvalid);
 
2525
                        }
 
2526
                }
 
2527
                else if (at_EndTag_Start<B::Base>(cur())) {
 
2528
                        text_if_nonnull_action(false);
 
2529
                        return;
 
2530
                }
 
2531
                else if (at_Comment_Start<B::Base>(cur())) {
 
2532
                        text_if_nonnull_action(false);
 
2533
                        Parse_Comment();
 
2534
                }
 
2535
                else if (at_CharRef_Start<B::Base>(cur())) {
 
2536
                        text_if_nonnull_action(true);
 
2537
                        Parse_CharRef();
 
2538
                }
 
2539
                else if (AtChar<B::Base,'&'>(cur())) {
 
2540
                        text_if_nonnull_action(true);
 
2541
                        Parse_EntityRef_inMixed(elems);
 
2542
                }
 
2543
                else if (at_CDATA_Start<B::Base>(cur())) {
 
2544
                        text_if_nonnull_action(true);
 
2545
                        Parse_CDATA();
 
2546
                }
 
2547
                else if (at_PI_Start<B::Base>(cur())) {
 
2548
                        text_if_nonnull_action(false);
 
2549
                        Parse_PI();
 
2550
                }
 
2551
                else if (at_CDATA_End<B::Base>(cur())) {
 
2552
                        text_if_nonnull_action(true);
 
2553
                        Advance(3);
 
2554
                        Syntax_Error(NT_CharData);
 
2555
                }
 
2556
                else if (at_EOF()) {
 
2557
                        text_if_nonnull_action(false);
 
2558
                        return;
 
2559
                }
 
2560
                else if (AtChar<B::Base,'<'>(cur())) {
 
2561
                        Syntax_Error(NT_markupdecl);
 
2562
                }
 
2563
                else {
 
2564
                        Advance(1);
 
2565
                        continue;
 
2566
                }
 
2567
        } while (1);
 
2568
}
 
2569
 
 
2570
 
 
2571
template <class B, WorkingCharacterSet W>
 
2572
int ParsingEngine<B, W>::Parse_Name() {
 
2573
        int name_pos = AbsPos();
 
2574
        ScanTo(NameFollow);
 
2575
        int lgth = AbsPos()-name_pos;
 
2576
        int nameID = Parser_Interface<W>::model_info->symbol_table->ASCII_Lookup_or_Insert_Name(&((char *) x8data)[buffer_rel_pos-lgth], lgth);
 
2577
        if (nameID != 0) return nameID;
 
2578
        else {
 
2579
                int u8_lgth = byteplex->UTF8_Length(name_pos, lgth);
 
2580
                char * u8_ptr = Parser_Interface<W>::model_info->symbol_table->ReserveSymbolSpace(u8_lgth);
 
2581
                byteplex->to_UTF8(name_pos, lgth, u8_ptr);
 
2582
                return Parser_Interface<W>::model_info->symbol_table->LookupOrInsertReserved();
 
2583
        }
 
2584
}
 
2585
 
 
2586
// template <>
 
2587
// int ParsingEngine< X8_Buffer<EBCDIC>, UTF_8 >::Parse_Name() {
 
2588
//      int name_pos = AbsPos();
 
2589
//      ScanTo(NameFollow);
 
2590
//      int lgth = AbsPos()-name_pos;
 
2591
// //   int nameID = local_EBCDIC_table->Lookup_or_Insert(GetCodeUnitPtr(name_pos), lgth);
 
2592
// //   if (nameID != 0) return nameID;
 
2593
// //   else {
 
2594
//              int u8_lgth = byteplex->UTF8_Length(name_pos, lgth);
 
2595
//              char * u8_ptr = Parser_Interface<UTF_8>::model_info->symbol_table->ReserveSymbolSpace(u8_lgth);
 
2596
//              byteplex->to_UTF8(name_pos, lgth, u8_ptr);
 
2597
//              return Parser_Interface<UTF_8>::model_info->symbol_table->LookupOrInsertReserved();
 
2598
// //   }
 
2599
// }
 
2600
 
 
2601
// template <WorkingCharacterSet W>
 
2602
// inline int ParsingEngine<UTF8_Buffer, W>::Parse_Name() {
 
2603
//      int name_pos = AbsPos();
 
2604
//      ScanTo(NameFollow);
 
2605
//      int lgth = AbsPos()-name_pos;
 
2606
//      return Parser_Interface<UTF_8>::model_info->symbol_table->UTF8_Lookup_or_Insert_Name(&((char *)x8data)[buffer_rel_pos-lgth], lgth);
 
2607
// }
 
2608
 
 
2609
template <>
 
2610
inline int ParsingEngine<UTF8_Buffer, UTF_8>::Parse_Name() {
 
2611
        int name_pos = AbsPos();
 
2612
        ScanTo(NameFollow);
 
2613
        int lgth = AbsPos()-name_pos;
 
2614
        return Parser_Interface<UTF_8>::model_info->symbol_table->UTF8_Lookup_or_Insert_Name(&((char *)x8data)[buffer_rel_pos-lgth], lgth);
 
2615
}
 
2616
 
 
2617
template <class B, WorkingCharacterSet W>
 
2618
int ParsingEngine<B, W>::Parse_Nmtoken() {
 
2619
        int name_pos = AbsPos();
 
2620
        ScanTo(NameFollow);
 
2621
        int lgth = AbsPos()-name_pos;
 
2622
        int nameID = Parser_Interface<W>::model_info->symbol_table->ASCII_Lookup_or_Insert_Nmtoken(&((char *) x8data)[buffer_rel_pos-lgth], lgth);
 
2623
        if (nameID != 0) return nameID;
 
2624
        else {
 
2625
                int u8_lgth = byteplex->UTF8_Length(name_pos, lgth);
 
2626
                char * u8_ptr = Parser_Interface<W>::model_info->symbol_table->ReserveSymbolSpace(u8_lgth);
 
2627
                byteplex->to_UTF8(name_pos, lgth, u8_ptr);
 
2628
                return Parser_Interface<W>::model_info->symbol_table->LookupOrInsertReserved_nmtoken();
 
2629
        }
 
2630
}
 
2631
 
 
2632
/*template <>
 
2633
int ParsingEngine< X8_Buffer<EBCDIC>, UTF_8 >::Parse_Nmtoken() {
 
2634
        int name_pos = AbsPos();
 
2635
        ScanTo(NameFollow);
 
2636
        int lgth = AbsPos()-name_pos;
 
2637
//      int nameID = local_EBCDIC_table->Lookup_or_Insert(GetCodeUnitPtr(name_pos), lgth);
 
2638
//      if (nameID != 0) return nameID;
 
2639
//      else {
 
2640
                int u8_lgth = byteplex->UTF8_Length(name_pos, lgth);
 
2641
                char * u8_ptr = Parser_Interface<UTF_8>::model_info->symbol_table->ReserveSymbolSpace(u8_lgth);
 
2642
                byteplex->to_UTF8(name_pos, lgth, u8_ptr);
 
2643
                return Parser_Interface<UTF_8>::model_info->symbol_table->LookupOrInsertReserved_nmtoken();
 
2644
//      }
 
2645
}*/
 
2646
// template <>
 
2647
// int ParsingEngine<UTF8_Buffer, UTF_8>::Parse_Nmtoken() {
 
2648
//      int name_pos = AbsPos();
 
2649
//      ScanTo(NameFollow);
 
2650
//      int lgth = AbsPos()-name_pos;
 
2651
//      return Parser_Interface<UTF_8>::model_info->symbol_table->UTF8_Lookup_or_Insert_Nmtoken(&((char *)x8data)[buffer_rel_pos-lgth], lgth);
 
2652
// }
 
2653
 
 
2654
template <class B, WorkingCharacterSet W>
 
2655
void ParsingEngine<B, W>::Parse_DocumentContent() {
 
2656
#if (VALIDATION_MODE == ON)
 
2657
        int cur_state = 0;
 
2658
        Parse_ValidContent(Parser_Interface<W>::model_info->rootModel, cur_state);
 
2659
        if (Parser_Interface<W>::model_info->rootModel->transition_map[cur_state][0]==0) {
 
2660
                Validity_Error(vErr_elementvalid);
 
2661
        }
 
2662
#endif
 
2663
#if (VALIDATION_MODE == OFF)
 
2664
        Parse_WF_Element();
 
2665
        ScanTo(NonWS);
 
2666
        while(at_Comment_Start<B::Base>(cur()) || at_PI_Start<B::Base>(cur()) ){
 
2667
                if (at_Comment_Start<B::Base>(cur()))
 
2668
                        Parse_Comment();
 
2669
                else 
 
2670
                        Parse_PI();
 
2671
                ScanTo(NonWS);
 
2672
        }
 
2673
        if (!at_EOF()) {
 
2674
                Syntax_Error(NT_document);
 
2675
        }       
 
2676
#endif
 
2677
        Parser_Interface<W>::DocumentEnd_action();      
 
2678
}
 
2679
 
 
2680
#ifdef MARKUP_PASS_CONTROL
 
2681
// Test routine as an alternative to MarkupPass.
 
2682
template <class B, WorkingCharacterSet W>
 
2683
void ParsingEngine<B, W>::ParseContent() {
 
2684
        int start_code = 0;
 
2685
        int end_code = 0;
 
2686
        int charref_code = 0;
 
2687
        int general_ref_code = 0;
 
2688
        DocumentStart_action(); 
 
2689
        bool is_emptyStartTag = false;
 
2690
        do {
 
2691
                text_or_markup_start = AbsPos();
 
2692
                ScanTo(MarkupStart); /* '<', '&', or ']' for 0b11']]>' test */
 
2693
/*              if (AtChar<B::Base,'<'>(cur())) {
 
2694
                        text_if_nonnull_action();
 
2695
                        Parse_Markup<B, W>();
 
2696
                }*/
 
2697
                if (at_EndTag_Start<B::Base>(cur())) {
 
2698
                        end_code |= AbsPos();
 
2699
                }
 
2700
                else if (AtChar<B::Base,'<'>(cur())) {
 
2701
                        start_code += AbsPos();
 
2702
                }
 
2703
                else if (at_CharRef_Start<B::Base>(cur())) {
 
2704
                        charref_code += 1;
 
2705
                }
 
2706
                else  if (AtChar<B::Base,'&'>(cur())) {
 
2707
                        general_ref_code += 1;
 
2708
                }
 
2709
                else if (at_EOF()) break;
 
2710
                Advance(1);
 
2711
        } while (1);
 
2712
        printf("Start_code: %i\n", start_code);
 
2713
        printf("End_code: %i\n", end_code);
 
2714
        printf("general_ref_code: %i\n", general_ref_code);
 
2715
        printf("charref_code: %i\n", charref_code);
 
2716
        DocumentEnd_action();   
 
2717
}
 
2718
#endif
 
2719
 
 
2720
#ifdef MARKUP_SORTING
 
2721
/* Two-bit codes used to sort markup start positions, read as a
   little-endian bit pair from the [&#/] stream.  Enumerators are
   listed in ascending numeric order. */
enum MarkupSortCodes {
  StartTagTwoBitCode = 0,
  GeneralRefCode = 1,
  EndTagTwoBitCode = 2,
  CharRefCode = 3
};
 
2728
 
 
2729
 
 
2730
static inline int GetBitPair(SIMD_type * stream, int bit_posn) {
 
2731
        return bitstream_segment_from(stream, bit_posn) & 3;
 
2732
}
 
2733
 
 
2734
template <class B, WorkingCharacterSet W>
 
2735
void ParsingEngine<B, W>::ParseContent() {
 
2736
/*vector<int> MarkupPositions[4];*/
 
2737
int MarkupPositions[4][BUFFER_SIZE];
 
2738
int MarkupCounts[4];
 
2739
        int start_code = 0;
 
2740
        int end_code = 0;
 
2741
        int charref_code = 0;
 
2742
        int general_ref_code = 0;
 
2743
 
 
2744
        DocumentStart_action(); 
 
2745
        bool is_emptyStartTag = false;
 
2746
                for (int i = 0; i < 4; i++) MarkupCounts[i] = 0;
 
2747
                text_or_markup_start = AbsPos();
 
2748
        do {
 
2749
                unsigned long segment = bitstream_segment_from(buf->item_stream[MarkupStart], buffer_rel_pos);
 
2750
//printf("buffer_rel_pos = %i, segment = %x\n", buffer_rel_pos, segment);
 
2751
                if (segment != 0) {
 
2752
                        buffer_rel_pos += cfzl(segment);
 
2753
                text_or_markup_start = AbsPos();
 
2754
                        int markup_code = GetBitPair(buf->item_stream[AmpHashSlash], buffer_rel_pos);
 
2755
                        MarkupPositions[markup_code][MarkupCounts[markup_code]] = AbsPos();
 
2756
                        MarkupCounts[markup_code]++;
 
2757
                        Advance(1);
 
2758
                }
 
2759
                else {
 
2760
                        buffer_rel_pos = (buffer_rel_pos & -8) + 8*sizeof(unsigned long);
 
2761
// printf("buffer_rel_pos = %i, segment = %x\n", buffer_rel_pos, segment);
 
2762
 
 
2763
                        if (buffer_rel_pos >= buffer_limit_pos) {
 
2764
/*                              for (int i = 0; i < MarkupCounts[StartTagTwoBitCode]; i++) {
 
2765
                                        start_code += MarkupPositions[StartTagTwoBitCode][i];
 
2766
                                }
 
2767
                                for (int i = 0; i < MarkupCounts[EndTagTwoBitCode]; i++) {
 
2768
                                        end_code |= MarkupPositions[EndTagTwoBitCode][i];
 
2769
                                }
 
2770
                                for (int i = 0; i < MarkupCounts[GeneralRefCode]; i++) {
 
2771
                                        general_ref_code += 1;
 
2772
                                }
 
2773
                                for (int i = 0; i < MarkupCounts[CharRefCode]; i++) {
 
2774
                                        charref_code += 1;
 
2775
                                }*/
 
2776
/*      printf("Start_code: %i\n", start_code);
 
2777
        printf("End_code: %i\n", end_code);
 
2778
        printf("general_ref_code: %i\n", general_ref_code);
 
2779
        printf("charref_code: %i\n", charref_code);*/
 
2780
                                for (int i = 0; i < 4; i++) MarkupCounts[i] = 0;
 
2781
                                if (buffer_rel_pos >= BUFFER_SIZE) {
 
2782
                                        AdjustBufferEndForIncompleteSequences();
 
2783
                                        Parser_Interface<W>::FinalizeBuffer_action();
 
2784
                                        AdvanceBuffers();
 
2785
                                }
 
2786
                                else break;
 
2787
                        }
 
2788
                        
 
2789
                }
 
2790
 
 
2791
        } while (1);
 
2792
/*      vector<int>::iterator i;
 
2793
        for (i = MarkupPositions[StartTagTwoBitCode].begin(); i != MarkupPositions[StartTagTwoBitCode].end(); i++) {
 
2794
                start_code += *i;
 
2795
        }
 
2796
        for (i = MarkupPositions[EndTagTwoBitCode].begin(); i != MarkupPositions[EndTagTwoBitCode].end(); i++) {
 
2797
                end_code |= *i;
 
2798
        }
 
2799
        for (i = MarkupPositions[GeneralRefCode].begin(); i != MarkupPositions[GeneralRefCode].end(); i++) {
 
2800
                general_ref_code += 1;
 
2801
        }
 
2802
        for (i = MarkupPositions[CharRefCode].begin(); i != MarkupPositions[CharRefCode].end(); i++) {
 
2803
                charref_code += 1;
 
2804
        }*/
 
2805
        printf("Start_code: %i\n", start_code);
 
2806
        printf("End_code: %i\n", end_code);
 
2807
        printf("general_ref_code: %i\n", general_ref_code);
 
2808
        printf("charref_code: %i\n", charref_code);
 
2809
        DocumentEnd_action();   
 
2810
}
 
2811
 
 
2812
#endif
 
2813
 
 
2814
 
 
2815
 
 
2816