/xmlbench/trunk

To get this branch, use:
bzr branch http://darksoft.org/webbzr/xmlbench/trunk

« back to all changes in this revision

Viewing changes to parse/parabix.20090211/src/engine.c

  • Committer: Suren A. Chilingaryan
  • Date: 2009-09-23 17:13:04 UTC
  • Revision ID: csa@dside.dyndns.org-20090923171304-osvtr4zqb29h11kd
Intel, Tango, Phobos, and RapidXML parsers; Memory benchmark scripts

Show diffs side-by-side

added added

removed removed

Lines of Context:
1
 
/*  engine.c - Parabix XML parsing engine.
2
 
    Copyright (c) 2007, 2008, Robert D. Cameron and Dan Lin.
3
 
    Licensed to the public under the Open Software License 3.0.
4
 
    Licensed to International Characters, Inc., under the Academic
5
 
    Free License 3.0.
6
 
*/
7
 
 
8
 
#include "engine.h"
9
 
#include "byteplex.h"
10
 
#include "xmldecl.h"
11
 
#include "bytelex.h"
12
 
#include "bitlex.h"
13
 
#include "contentmodel.h"
14
 
#include "contentmodel.c"
15
 
#include "xml_error.h"
16
 
 
17
 
#include <assert.h>
18
 
#include <stdlib.h>
19
 
#include <errno.h>
20
 
#include <string.h>
21
 
#include <string>
22
 
#include <iostream>
23
 
using namespace std;
24
 
        
25
 
/* Return a freshly allocated, NUL-terminated copy of the first lgth bytes
   of s.  The result comes from new[], so the caller releases it with
   delete[].  A null source or a non-positive length yields an empty string
   rather than writing before the allocation or handing memcpy a huge size. */
inline char * copy_string (unsigned char * s, int lgth){
        const int n = (s != NULL && lgth > 0) ? lgth : 0;
        char * d = new char[n + 1];
        if (n > 0) memcpy(d, s, n);
        d[n] = '\0';
        return d;
}
31
 
 
32
 
/* Return a freshly allocated, NUL-terminated string holding the first lgth1
   bytes of s1 followed by the first lgth2 bytes of s2.  The result comes
   from new[], so the caller releases it with delete[].  A null pointer or a
   non-positive length contributes nothing for that operand instead of
   corrupting memory. */
inline char * cat_string (char * s1, char * s2, int lgth1, int lgth2){
        const int n1 = (s1 != NULL && lgth1 > 0) ? lgth1 : 0;
        const int n2 = (s2 != NULL && lgth2 > 0) ? lgth2 : 0;
        char * s = new char[n1 + n2 + 1];
        if (n1 > 0) memcpy(s, s1, n1);
        if (n2 > 0) memcpy(s + n1, s2, n2);
        s[n1 + n2] = '\0';
        return s;
}
39
 
        
40
 
        
41
 
template <WorkingCharacterSet W>        
42
 
Parser_Interface<W> * Parser_Interface<W>::ParserFactory(char * filename) {
43
 
        
44
 
        int chars_read;
45
 
        unsigned char signature[4];
46
 
        FILE * infile;
47
 
        infile = fopen(filename, "rb");
48
 
        if (!infile) {
49
 
                fprintf(stderr, "Error: cannot open %s for input.\n", filename);
50
 
                exit(-1);
51
 
        }
52
 
        fread(signature,1,4,infile);
53
 
        Entity_Info * e = new Entity_Info;
54
 
        Model_Info * m = new Model_Info;
55
 
        e->AnalyzeSignature(signature);
56
 
        Byteplex * b = Byteplex::ByteplexFactory(e, infile);
57
 
        b->InitializeBuffer(signature,4);
58
 
        b->DoByteplex();
59
 
        b->PreparePseudoASCII_Stream();
60
 
        
61
 
        if (e->code_unit_base == ASCII) {
62
 
                XML_Decl_Parser<ASCII> decl_parser(b);
63
 
                decl_parser.ReadXMLInfo(*e);
64
 
                if (e->code_unit_size == SingleByte) {
65
 
                        if (!(e->has_encoding_decl) || at_UTF_8(e->encoding))
66
 
                                return new ParsingEngine< UTF8_Buffer, W>(e, m, b, false);              
67
 
                        else return new ParsingEngine< X8_Buffer<ASCII>, W>(e, m, b, false);
68
 
                }
69
 
                else if (e->code_unit_size == DoubleByte) {
70
 
                        return new ParsingEngine<U16_Buffer, W>(e, m, b, false);
71
 
                }
72
 
                else if (e->code_unit_size == QuadByte) {
73
 
                        return new ParsingEngine<U32_Buffer, W>(e, m, b, false);
74
 
                }
75
 
        }
76
 
        else /* if (e->code_unit_base == EBCDIC) */ {
77
 
                XML_Decl_Parser<EBCDIC> decl_parser(b);
78
 
                decl_parser.ReadXMLInfo(*e);
79
 
                return new ParsingEngine< X8_Buffer<EBCDIC>, W>(e, m, b, false);
80
 
        }       
81
 
}
82
 
 
83
 
template <WorkingCharacterSet W>
84
 
Parser_Interface<W> * Parser_Interface<W>::ParserFactory(char * filename, Model_Info * m) {
85
 
        
86
 
        int chars_read;
87
 
        unsigned char signature[4];
88
 
        FILE * infile;
89
 
        infile = fopen(filename, "rb");
90
 
        if (!infile) {
91
 
                fprintf(stderr, "Error: cannot open %s for input.\n", filename);
92
 
                exit(-1);
93
 
        }
94
 
        fread(signature,1,4,infile);
95
 
        Entity_Info * e = new Entity_Info;
96
 
        e->AnalyzeSignature(signature);
97
 
        Byteplex * b = Byteplex::ByteplexFactory(e, infile);
98
 
        b->InitializeBuffer(signature,4);
99
 
        b->DoByteplex();
100
 
        b->PreparePseudoASCII_Stream();
101
 
        if (e->code_unit_base == ASCII) {
102
 
                XML_Decl_Parser<ASCII> decl_parser(b);
103
 
                decl_parser.ReadXMLInfo(*e);
104
 
                if (e->code_unit_size == SingleByte) {
105
 
                        return new ParsingEngine< X8_Buffer<ASCII>, W>(e, m, b, true);
106
 
                }
107
 
                else if (e->code_unit_size == DoubleByte) {
108
 
                        return new ParsingEngine<U16_Buffer, W>(e, m, b, true);
109
 
                }
110
 
                else if (e->code_unit_size == QuadByte) {
111
 
                        return new ParsingEngine<U32_Buffer, W>(e, m, b, true);
112
 
                }
113
 
        }
114
 
        else /* if (e->code_unit_base == EBCDIC) */ {
115
 
                XML_Decl_Parser<EBCDIC> decl_parser(b);
116
 
                decl_parser.ReadXMLInfo(*e);
117
 
                return new ParsingEngine< X8_Buffer<EBCDIC>, W>(e, m, b, true);
118
 
        }       
119
 
}
120
 
 
121
 
template <WorkingCharacterSet W>
122
 
Parser_Interface<W> * Parser_Interface<W>::ParserFactory(char * byte_buffer, int byte_count, Entity_Info * e1, Model_Info * m){
123
 
        Entity_Info * e = new Entity_Info;
124
 
        e->BOM_units = 0;
125
 
        e->code_unit_base=e1->code_unit_base;
126
 
        e->code_unit_size=e1->code_unit_size;
127
 
        e->version=e1->version;
128
 
        e->encoding=e1->encoding;
129
 
        e->content_start = 0;
130
 
        Byteplex * b = Byteplex::ByteplexFactory(e, (unsigned char *) byte_buffer, byte_count);
131
 
        b->DoByteplex();
132
 
        b->PreparePseudoASCII_Stream();
133
 
        if (e->code_unit_base == ASCII) {
134
 
                XML_Decl_Parser<ASCII> decl_parser(b);
135
 
                decl_parser.ReadXMLInfo(*e);
136
 
                if (e->code_unit_size == SingleByte) {
137
 
                    puts("ASCII");
138
 
                        return new ParsingEngine< X8_Buffer<ASCII>, W>(e, m, b, false);
139
 
                }
140
 
                else if (e->code_unit_size == DoubleByte) {
141
 
                        return new ParsingEngine<U16_Buffer, W>(e, m, b, false);
142
 
                }
143
 
                else if (e->code_unit_size == QuadByte) {
144
 
                        return new ParsingEngine<U32_Buffer, W>(e, m, b, false);
145
 
                }
146
 
        }
147
 
        else /* if (e->code_unit_base == EBCDIC) */ {
148
 
                return new ParsingEngine< X8_Buffer<EBCDIC>, W>(e, m, b, false);
149
 
        }       
150
 
}
151
 
 
152
 
template <WorkingCharacterSet W>
153
 
Parser_Interface<W>::~Parser_Interface() {
154
 
}
155
 
 
156
 
 
157
 
template <WorkingCharacterSet W>
158
 
bool Parser_Interface<W>::has_ByteOrderMark() {
159
 
        return entity_Info->BOM_units > 0;
160
 
}
161
 
 
162
 
template <WorkingCharacterSet W>
163
 
XML_version Parser_Interface<W>::get_version() {
164
 
        return entity_Info->version;
165
 
}
166
 
 
167
 
template <WorkingCharacterSet W>
168
 
XML_standalone Parser_Interface<W>::standalone_status() {
169
 
        return entity_Info->standalone;
170
 
}
171
 
 
172
 
template <WorkingCharacterSet W>
173
 
bool Parser_Interface<W>::has_EncodingDecl() {
174
 
        return entity_Info->has_encoding_decl;
175
 
}
176
 
 
177
 
template <WorkingCharacterSet W>
178
 
unsigned char * Parser_Interface<W>::get_Encoding() {
179
 
        return entity_Info->encoding;
180
 
}
181
 
 
182
 
template <class B, WorkingCharacterSet W>
183
 
inline unsigned char * ParsingEngine<B, W>::GetCodeUnitPtr(int pos) {
184
 
        int rel_pos = pos - buffer_base_pos;
185
 
        return &((unsigned char *) (byteplex->src_buffer))[rel_pos * (int) B::Size];
186
 
}
187
 
 
188
 
template <>
189
 
inline unsigned char * ParsingEngine<UTF8_Buffer, UTF_8>::GetCodeUnitPtr(int pos) {
190
 
        int rel_pos = pos - buffer_base_pos;
191
 
        return &((unsigned char *) (x8data))[rel_pos];
192
 
}
193
 
 
194
 
 
195
 
 
196
 
 
197
 
template <class B, WorkingCharacterSet W>
198
 
ParsingEngine<B, W>::ParsingEngine(Entity_Info * e, Model_Info * m, Byteplex * b, bool is_external) : Parser_Interface<W> () {
199
 
        Parser_Interface<W>::entity_Info = e;
200
 
        Parser_Interface<W>::model_info = m;
201
 
        byteplex = b;
202
 
 
203
 
//      m->symbol_table = new Symbol_Table();
204
 
//      m->SimpleEntity("lt", "<"); 
205
 
//      m->SimpleEntity("gt", ">"); 
206
 
//      m->SimpleEntity("amp", "&"); 
207
 
//      m->SimpleEntity("quot", "\""); 
208
 
//      m->SimpleEntity("apos", "'");   
209
 
        m->symbol_table->version = e->version;
210
 
 
211
 
        StrictWellFormedness=false;
212
 
        LastAttOccurrence.assign(m->globalAttributeCount+1, 0);
213
 
        
214
 
        
215
 
        bitplex = new Bitplex;
216
 
        buf = (LexicalStreamSet *) simd_new(sizeof(LexicalStreamSet)/PACKSIZE);
217
 
 
218
 
  /* Install sentinels for every lexical item stream*/
219
 
#ifdef TEMPLATED_SIMD_LIB
220
 
        BitBlock sentinel_value = simd<1>::constant<1>();
221
 
#endif
222
 
#ifndef TEMPLATED_SIMD_LIB
223
 
        BitBlock sentinel_value = simd_const_1(1);
224
 
#endif
225
 
 
226
 
#ifdef OPTIMIZE_SHORT_SCAN
227
 
        sentinel_value = sisd_sfli(sentinel_value, 8*sizeof(unsigned long));
228
 
#endif
229
 
 
230
 
        for (int j = minLexicalItem; j < LexicalItemCount; j++) {
231
 
                buf->item_stream[j][BUFFER_BLOCKS] = sentinel_value;
232
 
        }
233
 
 
234
 
        buffer_base_pos = 0;
235
 
        buffer_rel_pos = e->content_start;
236
 
        buffer_limit_pos = min(BUFFER_SIZE, byteplex->units_in_buffer);
237
 
        int blocks_in_buffer = (buffer_limit_pos + BLOCKSIZE - 1)/BLOCKSIZE;
238
 
        x8data = byteplex->x8data;
239
 
        lexer = Lexer<B::Base>::LexerFactory(e, buf);
240
 
        bitplex->TransposeToBitStreams(byteplex->x8data, blocks_in_buffer);
241
 
        lexer->AnalyzeBuffer(bitplex->x8basis, buffer_base_pos, buffer_rel_pos, buffer_limit_pos);
242
 
}
243
 
 
244
 
template <class B, WorkingCharacterSet W>
245
 
ParsingEngine<B, W>::~ParsingEngine() {
246
 
  // How do we do this?  Parser_Interface<W>::model_info->~Model_Info();
247
 
  Parser_Interface<W>::entity_Info->~Entity_Info();
248
 
  byteplex->~Byteplex();
249
 
  bitplex->~Bitplex();
250
 
  simd_delete((SIMD_type *) buf);
251
 
  lexer->~Lexer_Interface();
252
 
}
253
 
 
254
 
template <class B, WorkingCharacterSet W>
255
 
void ParsingEngine<B, W>::AdvanceBuffers(){
256
 
#if defined(PAPI) and defined(CODE_CLOCKING) and (CODE_CLOCKING == ADVANCE_BUFFERS)
257
 
        code_clocker->cc_start_interval();
258
 
#endif
259
 
 
260
 
        int advance_amt = text_or_markup_start - buffer_base_pos;
261
 
        advance_amt &= -PACKSIZE; // maintain alignment
262
 
        byteplex->AdvanceInputBuffer(advance_amt);
263
 
        buffer_base_pos += advance_amt;
264
 
        buffer_rel_pos -= advance_amt;
265
 
        buffer_limit_pos = min(BUFFER_SIZE, byteplex->units_in_buffer);
266
 
        int blocks_in_buffer = (buffer_limit_pos + BLOCKSIZE - 1)/BLOCKSIZE;
267
 
#if defined(PAPI) and defined(CODE_CLOCKING) and (CODE_CLOCKING == BYTEPLEX)
268
 
        code_clocker->cc_start_interval();
269
 
#endif
270
 
        byteplex->DoByteplex();
271
 
        byteplex->PreparePseudoASCII_Stream();
272
 
#if defined(PAPI) and defined(CODE_CLOCKING) and (CODE_CLOCKING == BYTEPLEX)
273
 
        code_clocker->cc_end_interval(buffer_limit_pos);
274
 
#endif
275
 
#if defined(PAPI) and defined(CODE_CLOCKING) and (CODE_CLOCKING == BITPLEX)
276
 
        code_clocker->cc_start_interval();
277
 
#endif
278
 
        bitplex->TransposeToBitStreams(byteplex->x8data, blocks_in_buffer);
279
 
#if defined(PAPI) and defined(CODE_CLOCKING) and (CODE_CLOCKING == BITPLEX)
280
 
        code_clocker->cc_end_interval(buffer_limit_pos);
281
 
#endif
282
 
        lexer->AnalyzeBuffer(bitplex->x8basis, buffer_base_pos, buffer_rel_pos, buffer_limit_pos);
283
 
#if defined(PAPI) and defined(CODE_CLOCKING) and (CODE_CLOCKING == ADVANCE_BUFFERS)
284
 
        code_clocker->cc_end_interval(buffer_limit_pos);
285
 
#endif
286
 
 
287
 
}
288
 
 
289
 
template <>
290
 
void ParsingEngine<U16_Buffer, UTF_16>::AdvanceBuffers(){
291
 
#if defined(PAPI) and defined(CODE_CLOCKING) and (CODE_CLOCKING == ADVANCE_BUFFERS)
292
 
        code_clocker->cc_start_interval();
293
 
#endif
294
 
 
295
 
        int advance_amt = text_or_markup_start - buffer_base_pos;
296
 
        advance_amt &= -PACKSIZE; // maintain alignment
297
 
        byteplex->AdvanceInputBuffer(advance_amt);
298
 
        buffer_base_pos += advance_amt;
299
 
        buffer_rel_pos -= advance_amt;
300
 
        buffer_limit_pos = min(BUFFER_SIZE, byteplex->units_in_buffer);
301
 
        int blocks_in_buffer = (buffer_limit_pos + BLOCKSIZE - 1)/BLOCKSIZE;
302
 
#if defined(PAPI) and defined(CODE_CLOCKING) and (CODE_CLOCKING == BYTEPLEX)
303
 
        code_clocker->cc_start_interval();
304
 
#endif
305
 
        byteplex->DoByteplex();
306
 
        if (at_UTF_16(Parser_Interface<UTF_16>::entity_Info->encoding)) ((U16_Buffer *) byteplex)->Validate_UTF16();
307
 
        byteplex->PreparePseudoASCII_Stream();
308
 
#if defined(PAPI) and defined(CODE_CLOCKING) and (CODE_CLOCKING == BYTEPLEX)
309
 
        code_clocker->cc_end_interval(buffer_limit_pos);
310
 
#endif
311
 
#if defined(PAPI) and defined(CODE_CLOCKING) and (CODE_CLOCKING == BITPLEX)
312
 
        code_clocker->cc_start_interval();
313
 
#endif
314
 
        bitplex->TransposeToBitStreams(byteplex->x8data, blocks_in_buffer);
315
 
#if defined(PAPI) and defined(CODE_CLOCKING) and (CODE_CLOCKING == BITPLEX)
316
 
        code_clocker->cc_end_interval(buffer_limit_pos);
317
 
#endif
318
 
        lexer->AnalyzeBuffer(bitplex->x8basis, buffer_base_pos, buffer_rel_pos, buffer_limit_pos);
319
 
#if defined(PAPI) and defined(CODE_CLOCKING) and (CODE_CLOCKING == ADVANCE_BUFFERS)
320
 
        code_clocker->cc_end_interval(buffer_limit_pos);
321
 
#endif
322
 
 
323
 
}
324
 
 
325
 
template <class B, WorkingCharacterSet W>
326
 
inline unsigned char * ParsingEngine<B, W>::cur() const {
327
 
  return &((unsigned char *) x8data)[buffer_rel_pos];
328
 
}
329
 
 
330
 
template <class B, WorkingCharacterSet W>
331
 
inline int ParsingEngine<B, W>::AbsPos() const {
332
 
  return buffer_base_pos + buffer_rel_pos;
333
 
}
334
 
 
335
 
template <class B, WorkingCharacterSet W>
336
 
inline int ParsingEngine<B, W>::LengthFrom(int start_pos) const {
337
 
  return buffer_base_pos + buffer_rel_pos - start_pos;
338
 
}
339
 
 
340
 
 
341
 
 
342
 
template <class B, WorkingCharacterSet W>
343
 
inline int ParsingEngine<B, W>::BufferRelPos() const {
344
 
  return buffer_rel_pos;
345
 
}
346
 
 
347
 
 
348
 
template <class B, WorkingCharacterSet W>
349
 
inline bool ParsingEngine<B, W>::at_EOF() const {
350
 
  return (buffer_rel_pos >= buffer_limit_pos) && 
351
 
         (buffer_limit_pos < BUFFER_SIZE);
352
 
}
353
 
 
354
 
//template <class B, WorkingCharacterSet W>
355
 
//inline void ParsingEngine<B, W>::Advance(int n) {
356
 
//      buffer_rel_pos += n;
357
 
//  if (buffer_rel_pos >= BUFFER_SIZE) {        
358
 
//      Parser_Interface<W>::FinalizeBuffer_action();
359
 
//      AdvanceBuffers();
360
 
//  }
361
 
//}
362
 
 
363
 
/* Advance(n): move the cursor forward by n positions.  When the cursor
   reaches or passes BUFFER_SIZE, FinalizeBuffer_action() is invoked and the
   buffers are advanced.  Must be used inside a ParsingEngine<B, W> member
   so that W, buffer_rel_pos and AdvanceBuffers() are in scope.  The argument
   is parenthesized so that any expression can be passed safely. */
#define Advance(n) \
do {\
        buffer_rel_pos += (n);\
        if (buffer_rel_pos >= BUFFER_SIZE) {\
                Parser_Interface<W>::FinalizeBuffer_action();\
                AdvanceBuffers();\
        }\
} while(0)
371
 
 
372
 
 
373
 
template <class B, WorkingCharacterSet W> 
374
 
void ParsingEngine<B, W>::AdjustBufferEndForIncompleteSequences() {
375
 
}
376
 
 
377
 
template <> 
378
 
void ParsingEngine<UTF8_Buffer, UTF_8>::AdjustBufferEndForIncompleteSequences() {
379
 
        if (*(cur()-1) >= 0xC0) buffer_rel_pos--;
380
 
        else if (*(cur()-2) >= 0xE0) buffer_rel_pos -= 2;
381
 
        else if (*(cur()-3) >= 0xF0) buffer_rel_pos -= 3;
382
 
}
383
 
 
384
 
template <> 
385
 
void ParsingEngine<U16_Buffer, UTF_8>::AdjustBufferEndForIncompleteSequences() {
386
 
        unsigned short last_u16_unit = *(GetCodeUnitPtr(AbsPos()-1));
387
 
        if ((last_u16_unit >= 0xD800) & (last_u16_unit <= 0xDC00)) buffer_rel_pos--;
388
 
}
389
 
 
390
 
template <> 
391
 
void ParsingEngine<UTF8_Buffer, UTF_16>::AdjustBufferEndForIncompleteSequences() {
392
 
        if (*(cur()-1) >= 0xC0) buffer_rel_pos--;
393
 
        else if (*(cur()-2) >= 0xE0) buffer_rel_pos -= 2;
394
 
        else if (*(cur()-3) >= 0xF0) buffer_rel_pos -= 3;
395
 
}
396
 
 
397
 
template <> 
398
 
void ParsingEngine<U16_Buffer, UTF_16>::AdjustBufferEndForIncompleteSequences() {
399
 
        unsigned short last_u16_unit = *(GetCodeUnitPtr(AbsPos()-1));
400
 
        if ((last_u16_unit >= 0xD800) & (last_u16_unit <= 0xDC00)) buffer_rel_pos--;
401
 
}
402
 
 
403
 
 
404
 
 
405
 
#ifdef OPTIMIZE_SHORT_SCAN
406
 
//
407
 
//  Inline ScanTo with unrolled first test that should almost always 
408
 
//  succeed for short scans.
409
 
/* ScanTo(item): move the cursor to the next marked position in the lexical
   item stream `item`.  The first segment is tested inline; only when it is
   empty does the scan fall back to bitstream_scan, advancing the buffers
   each time the scan runs to BUFFER_SIZE or beyond.  The branch order is
   kept as measured by the original authors. */
#define ScanTo(item) \
do {\
        unsigned long first_word = bitstream_segment_from(buf->item_stream[item], buffer_rel_pos);\
        if (first_word != 0) buffer_rel_pos += cfzl(first_word);\
        else {\
                buffer_rel_pos = (buffer_rel_pos & -8) + 8*sizeof(unsigned long);\
                buffer_rel_pos = bitstream_scan(buf->item_stream[item], buffer_rel_pos);\
                while (buffer_rel_pos >= BUFFER_SIZE) {\
                        buffer_rel_pos = BUFFER_SIZE;\
                        AdjustBufferEndForIncompleteSequences();\
                        Parser_Interface<W>::FinalizeBuffer_action();\
                        AdvanceBuffers();\
                        buffer_rel_pos = bitstream_scan(buf->item_stream[item], buffer_rel_pos);\
                }\
        }\
} while(0)
425
 
 
426
 
// The following version seems cleaner, but measured mispredictions are higher
427
 
// #define ScanTo(item) \
428
 
// do {\
429
 
//      unsigned long segment = bitstream_segment_from(buf->item_stream[item], buffer_rel_pos);\
430
 
//      while (unlikely (segment == 0)) {\
431
 
//              buffer_rel_pos = (buffer_rel_pos & -8) + 8*sizeof(unsigned long);\
432
 
//              if (buffer_rel_pos >= BUFFER_SIZE) {\
433
 
//                      buffer_rel_pos = BUFFER_SIZE;\
434
 
//                      AdjustBufferEndForIncompleteSequences();\
435
 
//                      Parser_Interface<W>::FinalizeBuffer_action();\
436
 
//                      AdvanceBuffers();\
437
 
//              }\
438
 
//              segment = bitstream_segment_from(buf->item_stream[item], buffer_rel_pos);\
439
 
//      }\
440
 
//      buffer_rel_pos += cfzl(segment);\
441
 
// } while(0)
442
 
// 
443
 
// #define ScanTextTo(item) \
444
 
// do {\
445
 
//      unsigned long segment = bitstream_segment_from(buf->item_stream[item], buffer_rel_pos);\
446
 
//      text_or_markup_start = AbsPos();\
447
 
//      if (segment != 0) buffer_rel_pos += cfzl(segment);\
448
 
//      else {\
449
 
//              buffer_rel_pos = (buffer_rel_pos & -8) + 8*sizeof(unsigned long);\
450
 
//              buffer_rel_pos = bitstream_scan(buf->item_stream[item], buffer_rel_pos);\
451
 
//              while (buffer_rel_pos >= BUFFER_SIZE) {\
452
 
//                      buffer_rel_pos = BUFFER_SIZE;\
453
 
//                      AdjustBufferEndForIncompleteSequences();\
454
 
//                      Text_action(GetCodeUnitPtr(text_or_markup_start), LengthFrom(text_or_markup_start), true);\
455
 
//                      text_or_markup_start = AbsPos();\
456
 
//                      Parser_Interface<W>::FinalizeBuffer_action();\
457
 
//                      AdvanceBuffers();\
458
 
//                      buffer_rel_pos = bitstream_scan(buf->item_stream[item], buffer_rel_pos);\
459
 
//              }\
460
 
//      }\
461
 
// } while(0)
462
 
 
463
 
template <class B, WorkingCharacterSet W>
464
 
inline void ParsingEngine<B, W>::ScanTextTo(int item) {
465
 
        text_or_markup_start = AbsPos();
466
 
        unsigned long segment = bitstream_segment_from(buf->item_stream[item], buffer_rel_pos);
467
 
        if (segment != 0) buffer_rel_pos += cfzl(segment);
468
 
        else {
469
 
                buffer_rel_pos = (buffer_rel_pos & -8) + 8*sizeof(unsigned long);
470
 
                buffer_rel_pos = bitstream_scan(buf->item_stream[item], buffer_rel_pos);
471
 
                while (buffer_rel_pos >= BUFFER_SIZE) {
472
 
                        buffer_rel_pos = BUFFER_SIZE;
473
 
                        AdjustBufferEndForIncompleteSequences();
474
 
                        Text_action(GetCodeUnitPtr(text_or_markup_start), LengthFrom(text_or_markup_start), true);
475
 
                        text_or_markup_start = AbsPos();
476
 
                        Parser_Interface<W>::FinalizeBuffer_action();
477
 
                        AdvanceBuffers();
478
 
                        buffer_rel_pos = bitstream_scan(buf->item_stream[item], buffer_rel_pos);
479
 
                }
480
 
        }
481
 
}
482
 
 
483
 
#endif
484
 
 
485
 
#ifndef OPTIMIZE_SHORT_SCAN
486
 
 
487
 
// #define ScanTo(item) \
488
 
// do {\
489
 
//   buffer_rel_pos = bitstream_scan(buf->item_stream[item], buffer_rel_pos);\
490
 
//   while (buffer_rel_pos >= BUFFER_SIZE) {\
491
 
//      AdjustBufferEndForIncompleteSequences();\
492
 
//      Parser_Interface<W>::FinalizeBuffer_action();\
493
 
//      AdvanceBuffers();\
494
 
//      buffer_rel_pos = bitstream_scan(buf->item_stream[item], buffer_rel_pos);\
495
 
//   }\
496
 
// } while(0)
497
 
 
498
 
 
499
 
template <class B, WorkingCharacterSet W>
500
 
inline void ParsingEngine<B, W>::ScanTo(int item) {
501
 
        buffer_rel_pos = bitstream_scan(buf->item_stream[item], buffer_rel_pos);
502
 
        while (buffer_rel_pos >= BUFFER_SIZE) {
503
 
                AdjustBufferEndForIncompleteSequences();
504
 
                Parser_Interface<W>::FinalizeBuffer_action();
505
 
                AdvanceBuffers();
506
 
                buffer_rel_pos = bitstream_scan(buf->item_stream[item], buffer_rel_pos);
507
 
        }
508
 
}
509
 
 
510
 
template <class B, WorkingCharacterSet W>
511
 
inline void ParsingEngine<B, W>::ScanTextTo(int item) {
512
 
        text_or_markup_start = AbsPos();
513
 
        buffer_rel_pos = bitstream_scan(buf->item_stream[item], buffer_rel_pos);
514
 
        while (buffer_rel_pos >= BUFFER_SIZE) {
515
 
                AdjustBufferEndForIncompleteSequences();
516
 
                Text_action(GetCodeUnitPtr(text_or_markup_start), LengthFrom(text_or_markup_start), true);
517
 
                text_or_markup_start = AbsPos();
518
 
                Parser_Interface<W>::FinalizeBuffer_action();
519
 
                AdvanceBuffers();
520
 
                buffer_rel_pos = bitstream_scan(buf->item_stream[item], buffer_rel_pos);
521
 
        }
522
 
}
523
 
#endif
524
 
 
525
 
template <class B, WorkingCharacterSet W>
526
 
void ParsingEngine<B, W>::WF_Error (XML_Constraint errCode) {
527
 
        printf("Error at position %i in input.\n", AbsPos());
528
 
        ShowConstraintError(errCode);
529
 
        exit(-1);
530
 
//      Error_action(GetCodeUnitPtr(text_or_markup_start), LengthFrom(text_or_markup_start));
531
 
}
532
 
        
533
 
 
534
 
template <class B, WorkingCharacterSet W>
535
 
void ParsingEngine<B, W>::Validity_Error (XML_Constraint errCode) {
536
 
        printf("Error at position %i in input.\n", AbsPos());
537
 
        ShowConstraintError(errCode);
538
 
        exit(-1);
539
 
//      Error_action(GetCodeUnitPtr(text_or_markup_start), LengthFrom(text_or_markup_start));
540
 
}
541
 
        
542
 
template <class B, WorkingCharacterSet W>
543
 
void ParsingEngine<B, W>::Syntax_Error (XML_NonTerminal errNT) {
544
 
        printf("Error at position %i in input.\n", AbsPos());
545
 
        ShowSyntaxError(errNT);
546
 
        exit(-1);
547
 
//      Error_action(GetCodeUnitPtr(text_or_markup_start), LengthFrom(text_or_markup_start));
548
 
}
549
 
        
550
 
 
551
 
/* Parse a comment beginning "<!--" */
552
 
template <class B, WorkingCharacterSet W>
553
 
void ParsingEngine<B, W>::Parse_Comment() {
554
 
 
555
 
        Advance(4); /* Skip "<!--". */
556
 
        ScanTo(Hyphen);
557
 
        while (!at_DoubleHyphen<B::Base>(cur())) {
558
 
                if(at_EOF())
559
 
                        Syntax_Error(NT_CDSect);
560
 
                Advance(2); /* Skip hyphen-nonhyphen pair */
561
 
                ScanTo(Hyphen); 
562
 
        }
563
 
        if (at_Comment_End<B::Base>(cur())) {
564
 
                Advance(3); /* Skip "-->". */
565
 
                Comment_action(GetCodeUnitPtr(text_or_markup_start), LengthFrom(text_or_markup_start));
566
 
        }
567
 
        else {
568
 
                Advance(2);  /* "--" */
569
 
                Syntax_Error(NT_Comment);
570
 
        }
571
 
}
572
 
 
573
 
/* Parse an end tag beginning "</" */
574
 
template <class B, WorkingCharacterSet W>
575
 
inline void ParsingEngine<B, W>::Parse_EndTag() {
576
 
        Advance(2); /* Skip "</". */
577
 
        int nameID = Parse_Name();
578
 
        if (AtChar<B::Base,'>'>(cur())) {
579
 
                Advance(1);
580
 
                EndTag_action(GetCodeUnitPtr(text_or_markup_start), LengthFrom(text_or_markup_start));
581
 
        }
582
 
        else {
583
 
                ScanTo(NonWS);
584
 
                if (AtChar<B::Base,'>'>(cur())) {
585
 
                        Advance(1);
586
 
                        EndTag_action(GetCodeUnitPtr(text_or_markup_start), LengthFrom(text_or_markup_start));
587
 
                }
588
 
                else Syntax_Error(NT_ETag);
589
 
        }
590
 
}
591
 
 
592
 
/* Parse a CDATA section beginning "<![CDATA". */
593
 
template <class B, WorkingCharacterSet W>
594
 
void ParsingEngine<B, W>::Parse_CDATA() {
595
 
                Advance(8); /* Skip "<![CDATA". */
596
 
        if (!AtChar<B::Base,'['>(cur())) {
597
 
                Syntax_Error(NT_CDStart);
598
 
        }
599
 
        else {  
600
 
                Advance(1);
601
 
                CDATA_start_action(GetCodeUnitPtr(text_or_markup_start));
602
 
                text_or_markup_start = AbsPos();
603
 
                ScanTextTo(CD_End_check);
604
 
                while (!at_CDATA_End<B::Base>(cur())) {
605
 
                        if (at_EOF())
606
 
                                Syntax_Error(NT_CDSect);
607
 
                        Advance(1);
608
 
                        ScanTextTo(CD_End_check);
609
 
                }
610
 
                Text_action(GetCodeUnitPtr(text_or_markup_start), LengthFrom(text_or_markup_start), true);
611
 
                Advance(3); /* Skip "]]>". */
612
 
                CDATA_end_action(GetCodeUnitPtr(AbsPos()));
613
 
        }
614
 
}
615
 
 
616
 
template <class B, WorkingCharacterSet W>
617
 
void ParsingEngine<B, W>::Parse_EntityRef() {
618
 
    Advance(1);  // skip "&"
619
 
        int nameID = Parse_Name();  /* Name delimiter */
620
 
    if (!AtChar<B::Base,';'>(cur())) {
621
 
                Syntax_Error(NT_Reference);
622
 
    }
623
 
        else {
624
 
                Advance(1);
625
 
                Reference_action(GetCodeUnitPtr(text_or_markup_start), LengthFrom(text_or_markup_start));
626
 
                
627
 
                //      The following code will replace Reference_Action.
628
 
                GEntity_info * this_info;
629
 
                Parser_Interface<W> * entity_parser;
630
 
                int entityID = Parser_Interface<W>::model_info->GlobalGEntityTable[nameID]; 
631
 
                if (entityID == 0)
632
 
                        WF_Error(wfErr_wf_entdeclared);
633
 
                else{
634
 
                        this_info = Parser_Interface<W>::model_info->GEntityData[entityID-1];
635
 
                        if (this_info->is_external){
636
 
                                
637
 
                        if (Parser_Interface<W>::entity_Info->standalone != Standalone_no)
638
 
                                WF_Error(wfErr_NoExternalRefs);
639
 
                        else {
640
 
                                        entity_parser = ParserFactory(this_info->systemLiteral, Parser_Interface<W>::model_info);
641
 
                                        entity_parser->Parse_WF_Content();
642
 
                                        if(!entity_parser->at_EOF())
643
 
                                                Syntax_Error(NT_content);
644
 
                                        entity_parser->~Parser_Interface<W>();
645
 
                        }
646
 
                        }
647
 
                        else {
648
 
                                if (this_info->is_simple == true);
649
 
//                                      printf("Entity is %s\n",this_info->ReplacementText);
650
 
                                else{
651
 
//                                      printf("Not a simple text: %s\n",this_info->ReplacementText);
652
 
                                        entity_parser = ParserFactory(this_info->ReplacementText, strlen(this_info->ReplacementText),Parser_Interface<W>::entity_Info, Parser_Interface<W>::model_info);
653
 
                                        entity_parser->Parse_WF_Content();
654
 
                                        if(!entity_parser->at_EOF())
655
 
                                                Syntax_Error(NT_content);
656
 
                                        entity_parser->~Parser_Interface<W>();
657
 
                                }
658
 
                        }
659
 
                }
660
 
                
661
 
        }
662
 
}
663
 
 
664
 
template <class B, WorkingCharacterSet W>
665
 
void ParsingEngine<B, W>::Parse_EntityRef_inMixed(symbol_set_t elems) {
666
 
    Advance(1);  // skip "&"
667
 
        int nameID = Parse_Name();  /* Name delimiter */
668
 
    if (!AtChar<B::Base,';'>(cur())) {
669
 
                Syntax_Error(NT_Reference);
670
 
    }
671
 
        else {
672
 
                Advance(1);
673
 
                Reference_action(GetCodeUnitPtr(text_or_markup_start), LengthFrom(text_or_markup_start));
674
 
                
675
 
                //      The following code will replace Reference_Action.
676
 
                GEntity_info * this_info;
677
 
                Parser_Interface<W> * entity_parser;
678
 
                int entityID = Parser_Interface<W>::model_info->GlobalGEntityTable[nameID]; 
679
 
                if (entityID == 0)
680
 
                        WF_Error(wfErr_wf_entdeclared);
681
 
                else{
682
 
                        this_info = Parser_Interface<W>::model_info->GEntityData[entityID-1];
683
 
                        if (this_info->is_external){
684
 
                                
685
 
                        if (Parser_Interface<W>::entity_Info->standalone != Standalone_no)
686
 
                                WF_Error(wfErr_NoExternalRefs);
687
 
                        else {
688
 
                                        entity_parser = ParserFactory(this_info->systemLiteral, Parser_Interface<W>::model_info);
689
 
                                        entity_parser->Parse_MixedContent(elems);
690
 
                                        if(!entity_parser->at_EOF())
691
 
                                                Syntax_Error(NT_content);
692
 
                                        entity_parser->~Parser_Interface<W>();
693
 
                        }
694
 
                        }
695
 
                        else {
696
 
                                if (this_info->is_simple == true);
697
 
//                                      printf("Entity is %s\n",this_info->ReplacementText);
698
 
                                else{
699
 
//                                      printf("Not a simple text: %s\n",this_info->ReplacementText);
700
 
                                        entity_parser = ParserFactory(this_info->ReplacementText, strlen(this_info->ReplacementText),Parser_Interface<W>::entity_Info, Parser_Interface<W>::model_info);
701
 
                                        entity_parser->Parse_MixedContent(elems);
702
 
                                        if(!entity_parser->at_EOF())
703
 
                                                Syntax_Error(NT_content);
704
 
                                        entity_parser->~Parser_Interface<W>();
705
 
                                }
706
 
                        }
707
 
                }
708
 
                
709
 
        }
710
 
}
711
 
 
712
 
template <class B, WorkingCharacterSet W>
713
 
void ParsingEngine<B, W>::Parse_EntityRef_inAnyContent() {
714
 
    Advance(1);  // skip "&"
715
 
        int nameID = Parse_Name();  /* Name delimiter */
716
 
    if (!AtChar<B::Base,';'>(cur())) {
717
 
                Syntax_Error(NT_Reference);
718
 
    }
719
 
        else {
720
 
                Advance(1);
721
 
                Reference_action(GetCodeUnitPtr(text_or_markup_start), LengthFrom(text_or_markup_start));
722
 
                
723
 
                //      The following code will replace Reference_Action.
724
 
                GEntity_info * this_info;
725
 
                Parser_Interface<W> * entity_parser;
726
 
                int entityID = Parser_Interface<W>::model_info->GlobalGEntityTable[nameID]; 
727
 
                if (entityID == 0)
728
 
                        WF_Error(wfErr_wf_entdeclared);
729
 
                else{
730
 
                        this_info = Parser_Interface<W>::model_info->GEntityData[entityID-1];
731
 
                        if (this_info->is_external){
732
 
                                
733
 
                        if (Parser_Interface<W>::entity_Info->standalone != Standalone_no)
734
 
                                WF_Error(wfErr_NoExternalRefs);
735
 
                        else {
736
 
                                        entity_parser = ParserFactory(this_info->systemLiteral, Parser_Interface<W>::model_info);
737
 
                                        entity_parser->Parse_AnyContent();
738
 
                                        if(!entity_parser->at_EOF())
739
 
                                                Syntax_Error(NT_content);
740
 
                                        entity_parser->~Parser_Interface<W>();
741
 
                        }
742
 
                        }
743
 
                        else {
744
 
                                if (this_info->is_simple == true);
745
 
//                                      printf("Entity is %s\n",this_info->ReplacementText);
746
 
                                else{
747
 
//                                      printf("Not a simple text: %s\n",this_info->ReplacementText);
748
 
                                        entity_parser = ParserFactory(this_info->ReplacementText, strlen(this_info->ReplacementText),Parser_Interface<W>::entity_Info, Parser_Interface<W>::model_info);
749
 
                                        entity_parser->Parse_AnyContent();
750
 
                                        if(!entity_parser->at_EOF())
751
 
                                                Syntax_Error(NT_content);
752
 
                                        entity_parser->~Parser_Interface<W>();
753
 
                                }
754
 
                        }
755
 
                }
756
 
                
757
 
        }
758
 
}
759
 
 
760
 
template <class B, WorkingCharacterSet W>
761
 
void ParsingEngine<B, W>::Parse_ValidEntityRef(CM_RegExp * cre, int & cur_state) {
762
 
    Advance(1);  // skip "&"
763
 
        int nameID = Parse_Name();  /* Name delimiter */
764
 
    if (!AtChar<B::Base,';'>(cur())) {
765
 
                Syntax_Error(NT_Reference);
766
 
    }
767
 
        else {
768
 
                Advance(1);
769
 
                Reference_action(GetCodeUnitPtr(text_or_markup_start), LengthFrom(text_or_markup_start));
770
 
                
771
 
                //      The following code will replace Reference_Action.
772
 
                GEntity_info * this_info;
773
 
                Parser_Interface<W> * entity_parser;
774
 
                int entityID = Parser_Interface<W>::model_info->GlobalGEntityTable[nameID]; 
775
 
                if (entityID == 0)
776
 
                        WF_Error(wfErr_wf_entdeclared);
777
 
                else{
778
 
                        this_info = Parser_Interface<W>::model_info->GEntityData[entityID-1];
779
 
                        if (this_info->is_external){
780
 
                                
781
 
                        if (Parser_Interface<W>::entity_Info->standalone != Standalone_no)
782
 
                                WF_Error(wfErr_NoExternalRefs);
783
 
                        else {
784
 
                                        entity_parser = ParserFactory(this_info->systemLiteral, Parser_Interface<W>::model_info);
785
 
                                        entity_parser->Parse_ValidContent(cre, cur_state);
786
 
                                        if(!entity_parser->at_EOF())
787
 
                                                Syntax_Error(NT_content);
788
 
                                        entity_parser->~Parser_Interface<W>();
789
 
                        }
790
 
                        }
791
 
                        else {
792
 
                                if (this_info->is_simple == true);
793
 
//                                      printf("Entity is %s\n",this_info->ReplacementText);
794
 
                                else{
795
 
//                                      printf("Not a simple text: %s\n",this_info->ReplacementText);
796
 
                                        entity_parser = ParserFactory(this_info->ReplacementText, strlen(this_info->ReplacementText),Parser_Interface<W>::entity_Info, Parser_Interface<W>::model_info);
797
 
                                        entity_parser->Parse_ValidContent(cre, cur_state);
798
 
                                        if(!entity_parser->at_EOF())
799
 
                                                Syntax_Error(NT_content);
800
 
                                        entity_parser->~Parser_Interface<W>();
801
 
                                }
802
 
                        }
803
 
                }
804
 
                
805
 
        }
806
 
}
807
 
        
808
 
template <class B, WorkingCharacterSet W>
809
 
void ParsingEngine<B, W>::Parse_CharRef() {
810
 
        Advance(2);  // skip "&#"
811
 
        int ch_val = 0;
812
 
        if (AtChar<B::Base,'x'>(cur())) {
813
 
                Advance(1);
814
 
                while(at_HexDigit<B::Base>(cur())){
815
 
                        ch_val = HexVal<B::Base>(cur()[0]) + (ch_val<<4);
816
 
                        if (ch_val> 0x10FFFF )
817
 
                                WF_Error(wfErr_wf_Legalchar);
818
 
                        Advance(1);
819
 
                }
820
 
        }
821
 
        else {
822
 
                while(at_Digit<B::Base>(cur())){
823
 
                        ch_val = DigitVal<B::Base>(cur()[0]) + ch_val*10;
824
 
                        if (ch_val> 0x10FFFF )
825
 
                                WF_Error(wfErr_wf_Legalchar);
826
 
                        Advance(1);
827
 
                }
828
 
        }
829
 
        if ((ch_val == 0x0) || ((ch_val | 0x7FF) == 0xDFFF)|| ((ch_val | 0x1) == 0xFFFF))
830
 
                                WF_Error(wfErr_wf_Legalchar);    
831
 
                else  if (Parser_Interface<W>::entity_Info->version != XML_1_1)
832
 
                        if (((ch_val < 0x20) && (ch_val != 0x9) && (ch_val != 0xD) && (ch_val != 0xA)))
833
 
                                WF_Error(wfErr_wf_Legalchar); 
834
 
                                
835
 
        if (!AtChar<B::Base,';'>(cur())) {
836
 
                        Syntax_Error(NT_CharRef);
837
 
        }
838
 
                else {
839
 
                        Advance(1);
840
 
                        Reference_action(GetCodeUnitPtr(text_or_markup_start), LengthFrom(text_or_markup_start));
841
 
                }
842
 
}
843
 
 
844
 
template <class B, WorkingCharacterSet W>
845
 
void ParsingEngine<B, W>::Parse_PI (){
846
 
        int nameID;
847
 
        Advance(2); /* Skip "<?". */
848
 
        int target_start = AbsPos();
849
 
        if (at_XxMmLll<B::Base>(cur())) {
850
 
                nameID = Parse_Name();
851
 
                if (AbsPos() - target_start == 3) Syntax_Error(NT_PI);
852
 
        }
853
 
        else nameID = Parse_Name();
854
 
        PI_Target_action(GetCodeUnitPtr(target_start), LengthFrom(target_start));
855
 
        if (!at_PI_End<B::Base>(cur())) requireWS();
856
 
        ScanTo(QMark);
857
 
        while (!at_PI_End<B::Base>(cur())) {
858
 
                if(at_EOF())
859
 
                        Syntax_Error(NT_PI);
860
 
                Advance(1);
861
 
                ScanTo(QMark);
862
 
        }
863
 
        Advance(2); /* Skip "?>". */
864
 
        PI_action(GetCodeUnitPtr(text_or_markup_start), LengthFrom(text_or_markup_start));
865
 
}
866
 
 
867
 
/* Parse a start or empty element tag. */
868
 
template <class B, WorkingCharacterSet W>
869
 
inline void ParsingEngine<B, W>::Parse_StartTag (){
870
 
        int att_name_start;
871
 
        int att_val_start;
872
 
        int att_name_end, att_val_end;
873
 
        unsigned char quoteCh;
874
 
        Advance(1);
875
 
        int nameID = Parse_Name();  /* Name delimiter: WS, "/" or ">" */
876
 
        ElementName_action(GetCodeUnitPtr(text_or_markup_start+1), LengthFrom(text_or_markup_start+1));
877
 
        /* The following test optimizes the most common case of a
878
 
        start tag with no attributes.  */
879
 
        if (AtChar<B::Base,'>'>(cur())) {
880
 
                Advance(1);
881
 
                StartTag_action(GetCodeUnitPtr(text_or_markup_start), LengthFrom(text_or_markup_start));
882
 
        }
883
 
        else {
884
 
                ScanTo(NonWS);
885
 
                if (AtChar<B::Base,'>'>(cur())) {
886
 
                        Advance(1);
887
 
                        StartTag_action(GetCodeUnitPtr(text_or_markup_start), LengthFrom(text_or_markup_start));
888
 
                }
889
 
                else if (at_EmptyElementDelim<B::Base>(cur())) {
890
 
                        Advance(2);
891
 
                        EmptyElement_action(GetCodeUnitPtr(text_or_markup_start), LengthFrom(text_or_markup_start));
892
 
                }
893
 
                else do {
894
 
                        /* Must be an attribute-value pair or error. */
895
 
                        att_name_start = AbsPos();
896
 
                        int att_nameID = Parse_Name();
897
 
                        att_name_end = AbsPos();
898
 
                
899
 
                        int attID = Parser_Interface<W>::model_info->getOrInsertGlobalAttName(att_nameID);
900
 
                        if (attID >= LastAttOccurrence.size()) LastAttOccurrence.push_back(0);
901
 
                        else {
902
 
                                if (LastAttOccurrence[attID] > text_or_markup_start) {
903
 
                                        WF_Error(wfErr_uniqattspec); /* Duplicate attribute. */
904
 
                                        break;
905
 
                                }                       
906
 
                        }
907
 
                        LastAttOccurrence[attID] = att_name_start;
908
 
                        /* The following optimized tests handle the frequently occurring 
909
 
                        case that there are no blanks on either side of the equals sign.
910
 
                        In many cases, the very first test handles 100% of actual
911
 
                        attribute-value pairs encountered. */
912
 
                        if (at_EqualsQuote<B::Base>(cur())) Advance(1); 
913
 
                        else {
914
 
                                ScanTo(NonWS);
915
 
                                if (!AtChar<B::Base,'='>(cur())) {
916
 
                                        Syntax_Error(NT_STag); 
917
 
                                        break;
918
 
                                }
919
 
                                Advance(1);
920
 
                                ScanTo(NonWS);
921
 
                                if (!AtQuote<B::Base>(cur())) {
922
 
                                        Syntax_Error(NT_STag); 
923
 
                                        break;
924
 
                                }
925
 
                        }
926
 
                        att_val_start = AbsPos()+1;
927
 
                        Parse_AttValue();
928
 
                        att_val_end = AbsPos()-1;
929
 
                        if (at_xmlns<B::Base>(cur()+att_name_start-AbsPos())) {
930
 
                                Namespace_action(GetCodeUnitPtr(att_name_start), att_name_end - att_name_start,
931
 
                                                 GetCodeUnitPtr(att_val_start), att_val_end - att_val_start);
932
 
                        }
933
 
                        else {
934
 
                                AttributeValue_action(GetCodeUnitPtr(att_name_start), att_name_end - att_name_start,
935
 
                                                 GetCodeUnitPtr(att_val_start), att_val_end - att_val_start);
936
 
                        }
937
 
                        /* Now check for end or repeat. Avoid whitespace scan if possible.*/
938
 
                        if (AtChar<B::Base,'>'>(cur())) {
939
 
                                Advance(1);
940
 
                                StartTag_action(GetCodeUnitPtr(text_or_markup_start), LengthFrom(text_or_markup_start));
941
 
                                break;
942
 
                        }
943
 
                        else if (at_EmptyElementDelim<B::Base>(cur())) {
944
 
                                Advance(2);
945
 
                                EmptyElement_action(GetCodeUnitPtr(text_or_markup_start), LengthFrom(text_or_markup_start));
946
 
                                break;
947
 
                        }
948
 
                        ScanTo(NonWS);
949
 
                        if (AtChar<B::Base,'>'>(cur())) {
950
 
                                Advance(1);
951
 
                                StartTag_action(GetCodeUnitPtr(text_or_markup_start), LengthFrom(text_or_markup_start));
952
 
                                break;
953
 
                        }
954
 
                        else if (at_EmptyElementDelim<B::Base>(cur())) {
955
 
                                Advance(2);
956
 
                                EmptyElement_action(GetCodeUnitPtr(text_or_markup_start), LengthFrom(text_or_markup_start));
957
 
                                break;
958
 
                        }
959
 
                        else if (AbsPos() == att_val_end + 1) { 
960
 
                                /* No WS following att value */
961
 
                                Syntax_Error(NT_STag);
962
 
                                break;
963
 
                        }
964
 
                } while (1);
965
 
        }
966
 
}
967
 
 
968
 
template <class B, WorkingCharacterSet W>
969
 
inline void ParsingEngine<B, W>::text_if_nonnull_action(bool more){
970
 
        if (AbsPos() > text_or_markup_start) {
971
 
                Text_action(GetCodeUnitPtr(text_or_markup_start), LengthFrom(text_or_markup_start), more);
972
 
                text_or_markup_start = AbsPos();
973
 
        }
974
 
}
975
 
 
976
 
template <class B, WorkingCharacterSet W>
977
 
void ParsingEngine<B, W>::Parse_WF_EndTag(int nameID) {
978
 
        Advance(2);
979
 
        int end_nameID = Parse_Name();
980
 
        if(end_nameID != nameID)
981
 
                WF_Error(wfErr_GIMatch);
982
 
        if (AtChar<B::Base,'>'>(cur())) {
983
 
                Advance(1);
984
 
                Parser_Interface<W>::EndTag_action(GetCodeUnitPtr(text_or_markup_start), LengthFrom(text_or_markup_start));
985
 
        }
986
 
    else {
987
 
                ScanTo(NonWS);
988
 
                if (AtChar<B::Base,'>'>(cur())) {
989
 
                        Advance(1);
990
 
                        Parser_Interface<W>::EndTag_action(GetCodeUnitPtr(text_or_markup_start), LengthFrom(text_or_markup_start));
991
 
                }
992
 
                else Syntax_Error(NT_ETag);
993
 
    }
994
 
}
995
 
 
996
 
// template <>
997
 
// void ParsingEngine<UTF8_Buffer, UTF_8>::Parse_WF_EndTag(int nameID) {
998
 
//      Advance(2); /* Skip "</". */
999
 
//      
1000
 
//      int name_start = AbsPos();
1001
 
// //   ScanTo(NameFollow);
1002
 
// //   int lgth = AbsPos()-name_start;
1003
 
// 
1004
 
// #if (not defined(OMISSION)) or ((OMISSION != END_TAG_MATCHING)  and (OMISSION != NAME_LOOKUP))
1005
 
//      char * start_elem_name = Parser_Interface<UTF_8>::model_info->symbol_table->Get_UTF8_name(nameID);
1006
 
//      int lgth = Parser_Interface<UTF_8>::model_info->symbol_table->Get_UTF8_lgth(nameID);
1007
 
//      char * end_elem_name = &((char *) x8data)[buffer_rel_pos];
1008
 
//      
1009
 
// #ifdef TEMPLATED_SIMD_LIB    
1010
 
//      BytePack byte_compare =  simd<8>::eq(sisd_load_unaligned((BytePack *) end_elem_name),
1011
 
//                                                                 sisd_load_unaligned((BytePack *) start_elem_name));
1012
 
// #endif
1013
 
// #ifndef TEMPLATED_SIMD_LIB   
1014
 
//      BytePack byte_compare =  simd_eq_8(sisd_load_unaligned((BytePack *) end_elem_name),
1015
 
//                                                                 sisd_load_unaligned((BytePack *) start_elem_name));
1016
 
// #endif
1017
 
//      if (lgth < 16) {
1018
 
//              int expected_bits = ~(-1 << lgth);
1019
 
//          if ((_mm_movemask_epi8(byte_compare) & expected_bits) != expected_bits) {
1020
 
//                      WF_Error(wfErr_GIMatch);
1021
 
//          }
1022
 
//      }
1023
 
//      else {
1024
 
//          /* Must compare with bytes beyond the first 16.  Set up to
1025
 
//             compare 16 bytes at a time, with the first additional compare
1026
 
//             overlapping with the first byte_compare. */
1027
 
//          int pos = (lgth - 1) % PACKSIZE + 1;
1028
 
// #ifdef TEMPLATED_SIMD_LIB
1029
 
//          byte_compare =  simd_or(byte_compare, simd<8>::eq(sisd_load_unaligned((BytePack *) &end_elem_name[pos]),
1030
 
//                                                                                      sisd_load_unaligned((BytePack *) &start_elem_name[pos])));
1031
 
// #endif
1032
 
// #ifndef TEMPLATED_SIMD_LIB
1033
 
//          byte_compare =  simd_or(byte_compare, simd_eq_8(sisd_load_unaligned((BytePack *) &end_elem_name[pos]),
1034
 
//                                                                                      sisd_load_unaligned((BytePack *) &start_elem_name[pos])));
1035
 
// #endif
1036
 
//          pos += 16;
1037
 
//          while (pos < lgth) {
1038
 
//              if (_mm_movemask_epi8(byte_compare) != 0xFFFF) {
1039
 
//                      WF_Error(wfErr_GIMatch);
1040
 
//              }
1041
 
// #ifdef TEMPLATED_SIMD_LIB
1042
 
//              byte_compare =  simd<8>::eq(sisd_load_unaligned((BytePack *) &end_elem_name[pos]),
1043
 
//                                                sisd_load_unaligned((BytePack *) &start_elem_name[pos]));
1044
 
// #endif
1045
 
// #ifndef TEMPLATED_SIMD_LIB
1046
 
//              byte_compare =  simd_eq_8(sisd_load_unaligned((BytePack *) &end_elem_name[pos]),
1047
 
//                                                sisd_load_unaligned((BytePack *) &start_elem_name[pos]));
1048
 
// #endif
1049
 
//              pos += 16;
1050
 
//          }
1051
 
//          if (_mm_movemask_epi8(byte_compare) != 0xFFFF) {
1052
 
//                      WF_Error(wfErr_GIMatch);
1053
 
//          }
1054
 
//      }
1055
 
//      Advance(lgth);
1056
 
// 
1057
 
// #endif
1058
 
// #if defined(OMISSION) and ((OMISSION == END_TAG_MATCHING) or (OMISSION == NAME_LOOKUP))
1059
 
//      ScanTo(NameFollow);
1060
 
// #endif
1061
 
// //   for(int i=0; i<lgth; i++) {
1062
 
// //           if (start_elem_name[i] != end_elem_name[i])
1063
 
// //                   WF_Error(wfErr_GIMatch);
1064
 
// //   }
1065
 
// //   if (start_elem_name[lgth] != '\0') WF_Error(wfErr_GIMatch);
1066
 
// 
1067
 
//      if (AtChar<ASCII,'>'>(cur())) {
1068
 
//              Advance(1);
1069
 
//              Parser_Interface<UTF_8>::EndTag_action(GetCodeUnitPtr(text_or_markup_start), LengthFrom(text_or_markup_start));
1070
 
//      }
1071
 
//     else {
1072
 
//              ScanTo(NonWS);
1073
 
//              if (AtChar<ASCII,'>'>(cur())) {
1074
 
//                      Advance(1);
1075
 
//                      Parser_Interface<UTF_8>::EndTag_action(GetCodeUnitPtr(text_or_markup_start), LengthFrom(text_or_markup_start));
1076
 
//              }
1077
 
//              else Syntax_Error(NT_ETag);
1078
 
//     }
1079
 
// }
1080
 
 
1081
 
 
1082
 
/* Parse a valid start or empty element tag. */
1083
 
template <class B, WorkingCharacterSet W>
1084
 
int ParsingEngine<B, W>::Parse_WF_StartTag (bool& is_emptyStartTag){
1085
 
        int att_name_start;
1086
 
        int att_val_start;
1087
 
        int att_name_end, att_val_end;
1088
 
        unsigned char quoteCh;
1089
 
        Advance(1);
1090
 
        
1091
 
        #if (not defined(OMISSION)) or (OMISSION != NAME_LOOKUP)
1092
 
        int nameID = Parse_Name(); 
1093
 
        #endif
1094
 
        #if (defined(OMISSION)) and (OMISSION == NAME_LOOKUP)
1095
 
        ScanTo(NameFollow);
1096
 
        int nameID = 0;
1097
 
        #endif
1098
 
        ElementName_action(GetCodeUnitPtr(text_or_markup_start+1), LengthFrom(text_or_markup_start+1));
1099
 
        /* The following test optimizes the most common case of a
1100
 
        start tag with no attributes.  */
1101
 
        if (AtChar<B::Base,'>'>(cur())) {
1102
 
                Advance(1);
1103
 
                StartTag_action(GetCodeUnitPtr(text_or_markup_start), LengthFrom(text_or_markup_start));
1104
 
        }
1105
 
        else {
1106
 
                ScanTo(NonWS);
1107
 
                if (AtChar<B::Base,'>'>(cur())) {
1108
 
                        Advance(1);
1109
 
                        StartTag_action(GetCodeUnitPtr(text_or_markup_start), LengthFrom(text_or_markup_start));
1110
 
                }
1111
 
                else if (at_EmptyElementDelim<B::Base>(cur())) {
1112
 
                        Advance(2);
1113
 
                        is_emptyStartTag = true;
1114
 
                        EmptyElement_action(GetCodeUnitPtr(text_or_markup_start), LengthFrom(text_or_markup_start));
1115
 
                }
1116
 
                else do {
1117
 
                        /* Must be an attribute-value pair or error. */
1118
 
                        att_name_start = AbsPos();
1119
 
                        #if (not defined(OMISSION)) or (OMISSION != NAME_LOOKUP)
1120
 
                        int att_nameID = Parse_Name(); 
1121
 
                        #endif
1122
 
                        #if (defined(OMISSION)) and (OMISSION == NAME_LOOKUP)
1123
 
                        ScanTo(NameFollow);
1124
 
                        int att_nameID = 0;
1125
 
                        #endif
1126
 
            att_name_end = AbsPos();
1127
 
                #if (not defined(OMISSION)) or ((OMISSION != ATTRIBUTE_UNIQUENESS) and (OMISSION != NAME_LOOKUP))
1128
 
                        int attID = Parser_Interface<W>::model_info->getOrInsertGlobalAttName(att_nameID);
1129
 
                        if (attID >= LastAttOccurrence.size()) LastAttOccurrence.push_back(0);
1130
 
                        else {
1131
 
                                if (LastAttOccurrence[attID] > text_or_markup_start) {
1132
 
                                        WF_Error(wfErr_uniqattspec); /* Duplicate attribute. */
1133
 
                                        break;
1134
 
                                }                       
1135
 
                        }
1136
 
                        LastAttOccurrence[attID] = att_name_start;
1137
 
                 #endif
1138
 
                        /* The following optimized tests handle the frequently occurring 
1139
 
                        case that there are no blanks on either side of the equals sign.
1140
 
                        In many cases, the very first test handles 100% of actual
1141
 
                        attribute-value pairs encountered. */
1142
 
                        if (at_EqualsQuote<B::Base>(cur())) Advance(1); 
1143
 
                        else {
1144
 
                                ScanTo(NonWS);
1145
 
                                if (!AtChar<B::Base,'='>(cur())) {
1146
 
                                        Syntax_Error(NT_STag); 
1147
 
                                        break;
1148
 
                                }
1149
 
                                Advance(1);
1150
 
                                ScanTo(NonWS);
1151
 
                                if (!AtQuote<B::Base>(cur())) {
1152
 
                                        Syntax_Error(NT_STag); 
1153
 
                                        break;
1154
 
                                }
1155
 
                        }
1156
 
                        att_val_start = AbsPos()+1;
1157
 
                        Parse_AttValue();
1158
 
                        att_val_end = AbsPos()-1;
1159
 
                        if (at_xmlns<B::Base>(cur()+att_name_start-AbsPos())) {
1160
 
                                Namespace_action(GetCodeUnitPtr(att_name_start), att_name_end - att_name_start,
1161
 
                                                 GetCodeUnitPtr(att_val_start), att_val_end - att_val_start);
1162
 
                        }
1163
 
                        else {
1164
 
                                AttributeValue_action(GetCodeUnitPtr(att_name_start), att_name_end - att_name_start,
1165
 
                                                 GetCodeUnitPtr(att_val_start), att_val_end - att_val_start);
1166
 
                        }
1167
 
                        /* Now check for end or repeat. Avoid whitespace scan if possible.*/
1168
 
                        if (AtChar<B::Base,'>'>(cur())) {
1169
 
                                Advance(1);
1170
 
                                StartTag_action(GetCodeUnitPtr(text_or_markup_start), LengthFrom(text_or_markup_start));
1171
 
                                break;
1172
 
                        }
1173
 
                        else if (at_EmptyElementDelim<B::Base>(cur())) {
1174
 
                                Advance(2);
1175
 
                                is_emptyStartTag = true;        
1176
 
                                EmptyElement_action(GetCodeUnitPtr(text_or_markup_start), LengthFrom(text_or_markup_start));
1177
 
                                break;
1178
 
                        }
1179
 
                        ScanTo(NonWS);
1180
 
                        if (AtChar<B::Base,'>'>(cur())) {
1181
 
                                Advance(1);
1182
 
                                StartTag_action(GetCodeUnitPtr(text_or_markup_start), LengthFrom(text_or_markup_start));
1183
 
                                break;
1184
 
                        }
1185
 
                        else if (at_EmptyElementDelim<B::Base>(cur())) {
1186
 
                                Advance(2);
1187
 
                                is_emptyStartTag = true;
1188
 
                                EmptyElement_action(GetCodeUnitPtr(text_or_markup_start), LengthFrom(text_or_markup_start));
1189
 
                                break;
1190
 
                        }
1191
 
                        else if (AbsPos() == att_val_end + 1) { 
1192
 
                                /* No WS following att value */
1193
 
                                Syntax_Error(NT_STag);
1194
 
                                break;
1195
 
                        }
1196
 
                } while (1);
1197
 
        }
1198
 
        return nameID;
1199
 
}
1200
 
 
1201
 
 
1202
 
 
1203
 
template <class B, WorkingCharacterSet W>
1204
 
void ParsingEngine<B, W>::Parse_WF_Element() {
1205
 
        bool is_emptyStartTag = false;
1206
 
        int nameID = Parse_WF_StartTag(is_emptyStartTag);
1207
 
#ifdef DEBUG
1208
 
        printf("Parse_Element: nameID = %d, is_emptyStartTag=%i\n",nameID, is_emptyStartTag);
1209
 
#endif
1210
 
        if (!is_emptyStartTag) {
1211
 
                Parse_WF_Content();
1212
 
                Parse_WF_EndTag(nameID);
1213
 
        }
1214
 
}
1215
 
 
1216
 
 
1217
 
template <class B, WorkingCharacterSet W>
1218
 
void ParsingEngine<B, W>::Parse_WF_Content() {
1219
 
        do {
1220
 
                text_or_markup_start = AbsPos();
1221
 
                ScanTextTo(MarkupStart); /* '<', '&', or ']' for ']]>' test */
1222
 
                if (at_ElementTag_Start<B::Base>(cur())) {
1223
 
                        text_if_nonnull_action(false);
1224
 
                        Parse_WF_Element();
1225
 
                }
1226
 
                else if (at_EndTag_Start<B::Base>(cur())) {
1227
 
                        text_if_nonnull_action(false);
1228
 
                        return;
1229
 
                }
1230
 
                else if (at_Comment_Start<B::Base>(cur())) {
1231
 
                        text_if_nonnull_action(false);
1232
 
                        Parse_Comment();
1233
 
                }
1234
 
                else if (at_CharRef_Start<B::Base>(cur())) {
1235
 
                        text_if_nonnull_action(true);
1236
 
                        Parse_CharRef();
1237
 
                }
1238
 
                else if (AtChar<B::Base,'&'>(cur())) {
1239
 
                        text_if_nonnull_action(true);
1240
 
                        Parse_EntityRef();
1241
 
                }
1242
 
                else if (at_CDATA_Start<B::Base>(cur())) {
1243
 
                        text_if_nonnull_action(true);
1244
 
                        Parse_CDATA();
1245
 
                }
1246
 
                else if (at_PI_Start<B::Base>(cur())) {
1247
 
                        text_if_nonnull_action(false);
1248
 
                        Parse_PI();
1249
 
                }
1250
 
                else if (at_CDATA_End<B::Base>(cur())) {
1251
 
                        text_if_nonnull_action(true);
1252
 
                        Advance(3);
1253
 
                        Syntax_Error(NT_CharData);
1254
 
                }
1255
 
                else if (at_EOF()) {
1256
 
                        text_if_nonnull_action(false);
1257
 
                        return;
1258
 
                }
1259
 
                else if (AtChar<B::Base,'<'>(cur())) {
1260
 
                        Syntax_Error(NT_markupdecl);
1261
 
                }
1262
 
                else {
1263
 
                        Advance(1);
1264
 
                        continue;
1265
 
                }
1266
 
        } while (1);
1267
 
}
1268
 
 
1269
 
 
1270
 
#ifndef MARKUP_PASS_CONTROL
1271
 
#ifndef MARKUP_SORTING
1272
 
template <class B, WorkingCharacterSet W>
1273
 
void ParsingEngine<B, W>::ParseContent() {
1274
 
        Parser_Interface<W>::DocumentStart_action();    
1275
 
        bool is_emptyStartTag = false;
1276
 
        do {
1277
 
                text_or_markup_start = AbsPos();
1278
 
                ScanTextTo(MarkupStart); /* '<', '&', or ']' for ']]>' test */
1279
 
/*              if (AtChar<B::Base,'<'>(cur())) {
1280
 
                        text_if_nonnull_action();
1281
 
                        Parse_Markup<B, W>();
1282
 
                }*/
1283
 
                if (at_ElementTag_Start<B::Base>(cur())) {
1284
 
                        text_if_nonnull_action(false);
1285
 
                        Parse_StartTag();
1286
 
                }
1287
 
                else if (at_EndTag_Start<B::Base>(cur())) {
1288
 
                        text_if_nonnull_action(false);
1289
 
                        Parse_EndTag();
1290
 
                }
1291
 
                else if (at_Comment_Start<B::Base>(cur())) {
1292
 
                        text_if_nonnull_action(false);
1293
 
                        Parse_Comment();
1294
 
                }
1295
 
                else if (at_CharRef_Start<B::Base>(cur())) {
1296
 
                        text_if_nonnull_action(true);
1297
 
                        Parse_CharRef();
1298
 
                }
1299
 
                else if (AtChar<B::Base,'&'>(cur())) {
1300
 
                        text_if_nonnull_action(true);
1301
 
                        Parse_EntityRef();
1302
 
                }
1303
 
                else if (at_CDATA_Start<B::Base>(cur())) {
1304
 
                        text_if_nonnull_action(true);
1305
 
                        Parse_CDATA();
1306
 
                }
1307
 
                else if (at_PI_Start<B::Base>(cur())) {
1308
 
                        text_if_nonnull_action(false);
1309
 
                        Parse_PI();
1310
 
                }
1311
 
                else if (at_CDATA_End<B::Base>(cur())) {
1312
 
                        text_if_nonnull_action(true);
1313
 
                        Advance(3);
1314
 
                        Syntax_Error(NT_CharData);
1315
 
                }
1316
 
                else if (at_EOF()) {
1317
 
                        text_if_nonnull_action(false);
1318
 
                        break;
1319
 
                }
1320
 
                else if (AtChar<B::Base,'<'>(cur())) {
1321
 
                        Syntax_Error(NT_markupdecl);
1322
 
                }
1323
 
                else {
1324
 
                        Advance(1);
1325
 
                        continue;
1326
 
                }
1327
 
        } while (1);
1328
 
        Parser_Interface<W>::DocumentEnd_action();      
1329
 
}
1330
 
#endif
1331
 
#endif
1332
 
 
1333
 
template <class B, WorkingCharacterSet W>
1334
 
void ParsingEngine<B, W>::Parse_DocType (){
1335
 
 
1336
 
        int old_abspos, start_pos;
1337
 
        ScanTo(NonWS);
1338
 
        start_pos = AbsPos();
1339
 
        
1340
 
        if (at_DOCTYPE_start<B::Base>(cur()))
1341
 
        Advance(9);
1342
 
        else{
1343
 
//              printf("No Document definition!\n");
1344
 
                return;
1345
 
        }
1346
 
        requireWS();
1347
 
        int nameID = Parse_Name();
1348
 
 
1349
 
        old_abspos = AbsPos();  
1350
 
    ScanTo(NonWS);
1351
 
    if(at_SYSTEM<B::Base>(cur())||at_PUBLIC<B::Base>(cur())){
1352
 
        Parser_Interface<W>::model_info->has_external_DTD = true;
1353
 
        if(old_abspos == AbsPos())
1354
 
                Syntax_Error(NT_doctypedecl);
1355
 
        Parse_ExternalID(Parser_Interface<W>::model_info->external_DTD_systemLiteral, Parser_Interface<W>::model_info->external_DTD_pubidLiteral);
1356
 
        Parser_Interface<W> * entity_parser;
1357
 
        entity_parser = ParserFactory(Parser_Interface<W>::model_info->external_DTD_systemLiteral, Parser_Interface<W>::model_info);
1358
 
                entity_parser->Parse_ExtSubsetDecl();
1359
 
                entity_parser->~Parser_Interface<W>();
1360
 
    }
1361
 
    else Parser_Interface<W>::model_info->has_external_DTD = false;
1362
 
    ScanTo(NonWS);      
1363
 
 
1364
 
        if (AtChar<B::Base,'['>(cur())){
1365
 
                Advance(1);
1366
 
                Parse_IntSubset();
1367
 
                if (AtChar<B::Base,']'>(cur()))
1368
 
                        Advance(1);
1369
 
                else
1370
 
                Syntax_Error(NT_doctypedecl);
1371
 
                ScanTo(NonWS);
1372
 
        }
1373
 
        
1374
 
        if (AtChar<B::Base,'>'>(cur())){
1375
 
                Advance(1);  
1376
 
 
1377
 
                CRE_Seq * rslt = new CRE_Seq();
1378
 
                rslt->subCMs.push_back(new CRE_Name(nameID));
1379
 
                CM_RegExp * cre = new CM_RegExp();
1380
 
                cre->content_re = rslt;         
1381
 
                
1382
 
                int id_count = cre->content_re->Set_IDs(0);
1383
 
                cre->content_re->Set_First_Map();               
1384
 
                symbol_set_t * transition_map = new symbol_set_t[id_count+1];
1385
 
                cre->content_re->follow_map[0] = id_count+1;
1386
 
                
1387
 
                cre->content_re->Set_Follow_Map(transition_map);
1388
 
                transition_map[0] = cre->content_re->first_map;
1389
 
                if (cre->content_re->matches_empty)
1390
 
                        transition_map[0][0]=id_count+1;
1391
 
                        
1392
 
                cre -> transition_map = transition_map;
1393
 
                
1394
 
                Parser_Interface<W>::model_info->rootModel = cre;
1395
 
                
1396
 
                /* Check for notations that were used, but not defined by the end of the DTD. */
1397
 
                #if (VALIDATION_MODE == ON)
1398
 
                hash_map<int, int >::iterator j;
1399
 
                for (j=Parser_Interface<W>::model_info->GlobalNotationTable.begin(); j!=Parser_Interface<W>::model_info->GlobalNotationTable.end(); j++) {
1400
 
                        if (j->second == -1)
1401
 
                                Validity_Error(vErr_notatn);
1402
 
                }
1403
 
                #endif
1404
 
        }
1405
 
        else
1406
 
                Syntax_Error(NT_doctypedecl);   
1407
 
}
1408
 
 
1409
 
template <class B, WorkingCharacterSet W>
1410
 
void ParsingEngine<B, W>::Parse_ExternalID (char *& systemLiteral, char *& pubidLiteral){
1411
 
        int quot_start, lgth;
1412
 
        if(at_SYSTEM<B::Base>(cur())){
1413
 
                Advance(6);
1414
 
                pubidLiteral = NULL;
1415
 
                requireWS();
1416
 
                if (!AtQuote<B::Base>(cur())) Syntax_Error(NT_ExternalID);
1417
 
                quot_start = AbsPos()+1;
1418
 
                Parse_SystemLiteral (); /*  SystemLiteral */
1419
 
                lgth = AbsPos() - quot_start - 1;                       
1420
 
                systemLiteral = copy_string(GetCodeUnitPtr(quot_start),lgth);
1421
 
        }
1422
 
        else if (at_PUBLIC<B::Base>(cur())){
1423
 
                Advance(6);
1424
 
                requireWS();
1425
 
                if (!AtQuote<B::Base>(cur())) Syntax_Error(NT_ExternalID);
1426
 
                quot_start = AbsPos()+1;
1427
 
                Parse_PubidLiteral ();/*  PubidLiteral */
1428
 
                lgth = AbsPos() - quot_start - 1;                       
1429
 
                pubidLiteral = copy_string(GetCodeUnitPtr(quot_start),lgth);
1430
 
                systemLiteral = NULL;
1431
 
                if (AtChar<B::Base, '>'>(cur())) return;
1432
 
                requireWS();
1433
 
                if (AtQuote<B::Base>(cur())) {
1434
 
                        quot_start = AbsPos()+1;        
1435
 
                        Parse_SystemLiteral ();/*  SystemLiteral */
1436
 
                        lgth = AbsPos() - quot_start - 1;                       
1437
 
                        systemLiteral = copy_string(GetCodeUnitPtr(quot_start),lgth);
1438
 
                }
1439
 
        }
1440
 
        else
1441
 
                Syntax_Error(NT_ExternalID); 
1442
 
}
1443
 
 
1444
 
template <class B, WorkingCharacterSet W>
1445
 
void ParsingEngine<B, W>::Parse_SystemLiteral (){
1446
 
        unsigned char quoteCh;
1447
 
        if(AtQuote<B::Base>(cur())){
1448
 
                quoteCh = cur()[0];
1449
 
                Advance(1);
1450
 
        }       
1451
 
        ScanTo(Quote);                  
1452
 
        while (cur()[0] != quoteCh){
1453
 
                if(at_EOF())
1454
 
                        Syntax_Error(NT_SystemLiteral);
1455
 
                Advance(1);
1456
 
                ScanTo(Quote);
1457
 
        }
1458
 
        Advance(1);
1459
 
}
1460
 
 
1461
 
template <class B, WorkingCharacterSet W>
1462
 
void ParsingEngine<B, W>::Parse_PubidLiteral (){
1463
 
        unsigned char quoteCh;
1464
 
        quoteCh = cur()[0];
1465
 
        Advance(1);
1466
 
        while (at_PubidChar<B::Base>(cur()) && (cur()[0] != quoteCh)) {
1467
 
                Advance(1);
1468
 
        }
1469
 
        if (cur()[0] != quoteCh){
1470
 
                Syntax_Error(NT_PubidLiteral);
1471
 
        }
1472
 
        Advance(1);
1473
 
}
1474
 
 
1475
 
template <class B, WorkingCharacterSet W>
1476
 
void ParsingEngine<B, W>::Parse_IntSubset (){
1477
 
        
1478
 
        while(1){
1479
 
                ScanTo(NonWS);  
1480
 
                text_or_markup_start = AbsPos();
1481
 
                if (AtChar<B::Base,'%'>(cur()))
1482
 
                        Parse_PEReference();    
1483
 
                else if (at_PI_Start<B::Base>(cur())) {
1484
 
                        Parse_PI();
1485
 
                }
1486
 
                else if (at_Comment_Start<B::Base>(cur())) {
1487
 
                        Parse_Comment();
1488
 
                }
1489
 
                else if (AtChar<B::Base,'<'>(cur())){
1490
 
                        Advance(1);
1491
 
                        if(AtChar<B::Base,'!'>(cur())){
1492
 
                                Advance(1);
1493
 
                                if (at_ELEMENT<B::Base>(cur()))
1494
 
                                        Parse_Elementdecl();
1495
 
                                else if (at_ATTLIST<B::Base>(cur()))
1496
 
                                        Parse_AttlistDecl();
1497
 
                                else if (at_ENTITY<B::Base>(cur()))
1498
 
                                        Parse_Entitydecl();
1499
 
                                else if (at_NOTATION<B::Base>(cur()))
1500
 
                                        Parse_Notationdecl();
1501
 
                                else {
1502
 
                                        Syntax_Error(NT_markupdecl);            
1503
 
                                }                                                               
1504
 
                        }
1505
 
                        else
1506
 
                                Syntax_Error(NT_markupdecl); 
1507
 
                }
1508
 
                else if (AtChar<B::Base,']'>(cur())){
1509
 
                        break;
1510
 
                }
1511
 
                else
1512
 
                        Syntax_Error(NT_intSubset); 
1513
 
        }
1514
 
}
1515
 
 
1516
 
 
1517
 
template <class B, WorkingCharacterSet W>
1518
 
void ParsingEngine<B, W>::Parse_PEReference (){
1519
 
 
1520
 
        Advance(1); /* Skip "%". */
1521
 
        fprintf(stderr,"Parameter Reference has not been completed yet.\n");
1522
 
        exit(-1);
1523
 
        int nameID = Parse_Name(); 
1524
 
        if (AtChar<B::Base,';'>(cur())) {
1525
 
                Advance(1);
1526
 
                PEReference_action(GetCodeUnitPtr(text_or_markup_start), LengthFrom(text_or_markup_start));
1527
 
                PEntity_info * this_info;
1528
 
                Parser_Interface<W> * entity_parser;
1529
 
                int entityID = Parser_Interface<W>::model_info->GlobalPEntityTable[nameID]; 
1530
 
                if (entityID == 0)
1531
 
                        WF_Error(wfErr_wf_entdeclared);
1532
 
                else{
1533
 
                        this_info = Parser_Interface<W>::model_info->PEntityData[entityID-1];
1534
 
                        if (this_info->is_external){
1535
 
                                
1536
 
//                      if (Parser_Interface<W>::entity_Info->standalone != Standalone_no)
1537
 
//                              WF_Error(wfErr_NoExternalRefs);
1538
 
//                      else {
1539
 
                                        entity_parser = ParserFactory(this_info->systemLiteral, Parser_Interface<W>::model_info);
1540
 
                                        entity_parser->Parse_WF_Content();
1541
 
                                        if(!entity_parser->at_EOF())
1542
 
                                                Syntax_Error(NT_content);
1543
 
                                        entity_parser->~Parser_Interface<W>();
1544
 
//                      }
1545
 
                        }
1546
 
                        else {
1547
 
                        }
1548
 
                }
1549
 
        }
1550
 
        else
1551
 
                Syntax_Error(NT_PEReference);
1552
 
}
1553
 
 
1554
 
 
1555
 
template <class B, WorkingCharacterSet W>
1556
 
void ParsingEngine<B, W>::Parse_Elementdecl (){
1557
 
 
1558
 
        Advance(7); /* Skip "<!ELEMENT". */
1559
 
 
1560
 
    requireWS();
1561
 
        int nameID = Parse_Name();
1562
 
        int elemID = Parser_Interface<W>::model_info->getOrInsertGlobalElement(nameID);
1563
 
 
1564
 
        requireWS();
1565
 
        ContentModel * cm;
1566
 
        /* Start parsing "contentspec"*/
1567
 
        if (at_EMPTY<B::Base>(cur())) {
1568
 
        Advance(5);
1569
 
        cm = new CM_Empty();
1570
 
        Parser_Interface<W>::model_info->ContentModelData[nameID] = cm;
1571
 
        }
1572
 
    else if (at_ANY<B::Base>(cur())) {
1573
 
        Advance(3);
1574
 
        cm = new CM_Any();
1575
 
        Parser_Interface<W>::model_info->ContentModelData[nameID] = cm;
1576
 
    }
1577
 
    else {
1578
 
        if (AtChar<B::Base,'('>(cur()))
1579
 
                        Advance(1);
1580
 
                ScanTo(NonWS);
1581
 
                if (at_PCDATA<B::Base>(cur())){
1582
 
                        cm = Parse_RemainingMixed();
1583
 
                        Parser_Interface<W>::model_info->ContentModelData[nameID] = cm;
1584
 
                }
1585
 
                else{
1586
 
 
1587
 
                        CM_RegExp * cre = new CM_RegExp;
1588
 
                        cre->content_re = Parse_RemainingChildren();
1589
 
 
1590
 
                        int id_count = cre->content_re->Set_IDs(0);
1591
 
                        cre->content_re->Set_First_Map();       
1592
 
                        symbol_set_t * transition_map = new symbol_set_t[id_count+1];
1593
 
                        cre->content_re->follow_map[0] = id_count+1;
1594
 
                        
1595
 
                        cre->content_re->Set_Follow_Map(transition_map);
1596
 
                        transition_map[0] = cre->content_re->first_map;
1597
 
                        
1598
 
                        if (cre->content_re->matches_empty)
1599
 
                                transition_map[0][0]=id_count+1;
1600
 
                                
1601
 
                        cre -> transition_map = transition_map;
1602
 
                        
1603
 
                        Parser_Interface<W>::model_info->ContentModelData[nameID] = cre;
1604
 
                        cm = cre;
1605
 
                }                       
1606
 
    }
1607
 
    ScanTo(NonWS);    
1608
 
 
1609
 
        if (AtChar<B::Base,'>'>(cur())) {
1610
 
                Advance(1);
1611
 
        }
1612
 
        else
1613
 
                Syntax_Error(NT_elementdecl);
1614
 
}
1615
 
template <class B, WorkingCharacterSet W>
1616
 
ContentModel * ParsingEngine<B, W>::Parse_RemainingMixed (){
1617
 
        CM_Mixed * r = new CM_Mixed();
1618
 
        Advance(7);  /* Skip "#PCDATA". */
1619
 
    
1620
 
    if (AtChar<B::Base,')'>(cur())){
1621
 
        if (AtChar<B::Base,'*'>(cur())) {
1622
 
                Advance(2);
1623
 
                }
1624
 
                else {
1625
 
                        Advance(1);
1626
 
                }
1627
 
    }
1628
 
    else{
1629
 
        ScanTo(NonWS);
1630
 
        int k = 0;
1631
 
        while (AtChar<B::Base,'|'>(cur())){
1632
 
                        Advance(1);
1633
 
                        ScanTo(NonWS);
1634
 
                        int nameID = Parse_Name();
1635
 
                        r->elements[nameID] = ++k;
1636
 
                        ScanTo(NonWS);
1637
 
                }
1638
 
                if (at_Para_star<B::Base>(cur())) Advance(2);
1639
 
                else {
1640
 
                        Syntax_Error(NT_Mixed);
1641
 
                        exit(-1);
1642
 
        }
1643
 
    }
1644
 
    return r;
1645
 
}
1646
 
 
1647
 
 
1648
 
template <class B, WorkingCharacterSet W>
1649
 
Content_RE * ParsingEngine<B, W>::Parse_RemainingChildren (){
1650
 
        Content_RE * c1 = Parse_Cp();
1651
 
        Content_RE * r = c1;
1652
 
        ScanTo(NonWS);
1653
 
        if(AtChar<B::Base,'|'>(cur())){
1654
 
                CRE_Choice * rslt = new CRE_Choice;
1655
 
                rslt->subCMs.push_back(c1);
1656
 
                Advance(1);
1657
 
                ScanTo(NonWS);
1658
 
                rslt->subCMs.push_back(Parse_Cp());
1659
 
                ScanTo(NonWS);
1660
 
                while(!AtChar<B::Base,')'>(cur())){
1661
 
                        if(AtChar<B::Base,'|'>(cur()))
1662
 
                                Advance(1);
1663
 
                        else
1664
 
                                Syntax_Error(NT_children);
1665
 
                        ScanTo(NonWS);
1666
 
                        rslt->subCMs.push_back(Parse_Cp());
1667
 
                        ScanTo(NonWS);
1668
 
                }
1669
 
                Advance(1);
1670
 
                rslt->Compile();
1671
 
                r = rslt;
1672
 
        }
1673
 
        else if(AtChar<B::Base,','>(cur())){
1674
 
                CRE_Seq * rslt = new CRE_Seq;
1675
 
                rslt->subCMs.push_back(c1);
1676
 
                Advance(1);
1677
 
                ScanTo(NonWS);
1678
 
                rslt->subCMs.push_back(Parse_Cp());
1679
 
                ScanTo(NonWS);
1680
 
                while(!AtChar<B::Base,')'>(cur())){
1681
 
                        if(AtChar<B::Base,','>(cur()))
1682
 
                                Advance(1);
1683
 
                        else
1684
 
                                Syntax_Error(NT_children);
1685
 
                        ScanTo(NonWS);
1686
 
                        rslt->subCMs.push_back(Parse_Cp());
1687
 
                        ScanTo(NonWS);
1688
 
                }
1689
 
                Advance(1);
1690
 
                rslt->Compile();
1691
 
                r = rslt;
1692
 
        }       
1693
 
        else if(AtChar<B::Base,')'>(cur())){
1694
 
                Advance(1);
1695
 
        }
1696
 
        else
1697
 
                Syntax_Error(NT_children);
1698
 
                
1699
 
        if (AtChar<B::Base,'?'>(cur())) {
1700
 
                Advance(1);
1701
 
                r = new CRE_Opt(r);
1702
 
        }
1703
 
        else if (AtChar<B::Base,'*'>(cur())) {
1704
 
                Advance(1);
1705
 
                r = new CRE_Star(r);
1706
 
        }
1707
 
        else if (AtChar<B::Base,'+'>(cur())) {
1708
 
                Advance(1);
1709
 
                r = new CRE_Plus(r);
1710
 
        }
1711
 
 
1712
 
        return r;
1713
 
}
1714
 
 
1715
 
template <class B, WorkingCharacterSet W>
1716
 
Content_RE * ParsingEngine<B, W>::Parse_Cp (){
1717
 
        if (AtChar<B::Base,'('>(cur())){
1718
 
                Advance(1);
1719
 
                ScanTo(NonWS);
1720
 
                Parse_RemainingChildren();
1721
 
        }
1722
 
        else{
1723
 
                int nameID = Parse_Name();
1724
 
                CRE_Name * r = new CRE_Name(nameID);
1725
 
 
1726
 
                if (AtChar<B::Base,'?'>(cur())) {
1727
 
                        Advance(1);
1728
 
                        return new CRE_Opt(r);
1729
 
                }
1730
 
                else if (AtChar<B::Base,'*'>(cur())) {
1731
 
                        Advance(1);
1732
 
                        return new CRE_Star(r);
1733
 
                }
1734
 
                else if (AtChar<B::Base,'+'>(cur())) {
1735
 
                        Advance(1);
1736
 
                        return new CRE_Plus(r);
1737
 
                }
1738
 
                else return r;
1739
 
        }
1740
 
}
1741
 
 
1742
 
template <class B, WorkingCharacterSet W>
1743
 
void ParsingEngine<B, W>::Parse_AttlistDecl (){
1744
 
        
1745
 
        int old_abspos;
1746
 
        
1747
 
        int name_start;
1748
 
        int lgth;
1749
 
        
1750
 
        int elemID;
1751
 
        int attID;
1752
 
        
1753
 
        Advance(7); /* Skip "ATTLIST. */
1754
 
        requireWS();
1755
 
        
1756
 
        int nameID = Parse_Name();
1757
 
        elemID = Parser_Interface<W>::model_info->getOrInsertGlobalElement(nameID);
1758
 
        
1759
 
        old_abspos = AbsPos();
1760
 
        ScanTo(NonWS);
1761
 
        while(!AtChar<B::Base,'>'>(cur())) {
1762
 
                if(old_abspos == AbsPos())
1763
 
                Syntax_Error(NT_AttlistDecl);
1764
 
                
1765
 
                int att_nameID = Parse_Name();
1766
 
                
1767
 
                attID = Parser_Interface<W>::model_info->getOrInsertGlobalAttName(att_nameID);
1768
 
                if (attID >= LastAttOccurrence.size()) LastAttOccurrence.push_back(0);
1769
 
        ATT_info * this_info = new ATT_info;
1770
 
        this_info->globalATT_id = attID;
1771
 
        requireWS();
1772
 
        if (at_CDATA<B::Base>(cur())){
1773
 
                Advance(5);
1774
 
                this_info->attType = CDATA_att;
1775
 
        }
1776
 
        else if(at_ID<B::Base>(cur())){
1777
 
                Advance(2);
1778
 
                this_info->attType = ID_att;
1779
 
        }
1780
 
        /* Make sure to check IDREFS before IDREF*/
1781
 
        else if(at_IDREFS<B::Base>(cur())){
1782
 
                Advance(6);
1783
 
                this_info->attType = IDREFS_att;
1784
 
        }
1785
 
        else if(at_IDREF<B::Base>(cur())){
1786
 
                Advance(5);
1787
 
                this_info->attType = IDREF_att;
1788
 
        }
1789
 
        else if(at_ENTITY<B::Base>(cur())){
1790
 
                Advance(6);
1791
 
                this_info->attType = ENTITY_att;
1792
 
        }
1793
 
        else if(at_ENTITIES<B::Base>(cur())){
1794
 
                Advance(8);
1795
 
                this_info->attType = ENTITIES_att;
1796
 
        }
1797
 
        /* Make sure to check NMTOKENS before NMTOKEN*/
1798
 
        else if(at_NMTOKENS<B::Base>(cur())){
1799
 
                Advance(8);
1800
 
                this_info->attType = NMTOKENS_att;
1801
 
        }
1802
 
        else if(at_NMTOKEN<B::Base>(cur())){
1803
 
                Advance(7);
1804
 
                this_info->attType = NMTOKEN_att;
1805
 
        }
1806
 
        else if(at_NOTATION<B::Base>(cur())){ /* NotationType = 'NOTATION' S Enumeration
1807
 
                                                                         when Nmtoken = Name */
1808
 
                Advance(8);
1809
 
                        requireWS();
1810
 
                Parse_Notation(this_info);
1811
 
                this_info->attType = NOTATION_att;
1812
 
        }
1813
 
        else if(AtChar<B::Base,'('>(cur())){
1814
 
                Parse_Enumeration(this_info);
1815
 
                this_info->attType = enumeration_att;
1816
 
        }
1817
 
        else
1818
 
                Syntax_Error(NT_AttlistDecl);
1819
 
        requireWS();
1820
 
        Parse_DefaultDecl(this_info);
1821
 
 
1822
 
                ScanTo(NonWS);
1823
 
                Parser_Interface<W>::model_info->ElementAttributeData[elemID].push_back(this_info);
1824
 
        }
1825
 
 
1826
 
        Advance(1);
1827
 
}
1828
 
 
1829
 
template <class B, WorkingCharacterSet W>
1830
 
void ParsingEngine<B, W>::Parse_Notation (ATT_info * this_info){
1831
 
 
1832
 
        if(AtChar<B::Base,'('>(cur()))
1833
 
                Advance(1);
1834
 
        else
1835
 
                Syntax_Error(NT_NotationType);
1836
 
        ScanTo(NonWS);
1837
 
        
1838
 
    int notn_nameID = Parse_Name();
1839
 
 
1840
 
        /*Notation name is not in the global table!*/
1841
 
        if(Parser_Interface<W>::model_info->GlobalNotationTable[notn_nameID]==0)
1842
 
                Parser_Interface<W>::model_info->GlobalNotationTable[notn_nameID] = -1;
1843
 
        
1844
 
        ScanTo(NonWS);
1845
 
        while(AtChar<B::Base,'|'>(cur())){
1846
 
                Advance(1);
1847
 
                ScanTo(NonWS);  
1848
 
                notn_nameID = Parse_Name();
1849
 
                        
1850
 
                if(Parser_Interface<W>::model_info->GlobalNotationTable[notn_nameID]==0)
1851
 
//                      Validity_Error(vErr_notatn);
1852
 
                        Parser_Interface<W>::model_info->GlobalNotationTable[notn_nameID] = -1;
1853
 
                        
1854
 
                ScanTo(NonWS);
1855
 
        }
1856
 
        if (AtChar<B::Base,')'>(cur())) 
1857
 
                Advance(1);
1858
 
        else
1859
 
                Syntax_Error(NT_NotationType);
1860
 
}
1861
 
 
1862
 
template <class B, WorkingCharacterSet W>
1863
 
void ParsingEngine<B, W>::Parse_Enumeration (ATT_info * this_info){
1864
 
 
1865
 
        int enumCount=0;
1866
 
        if(AtChar<B::Base,'('>(cur()))
1867
 
                Advance(1);
1868
 
        else
1869
 
                Syntax_Error(NT_Enumeration);
1870
 
        ScanTo(NonWS);
1871
 
        
1872
 
        int nmtokenID = Parse_Nmtoken();
1873
 
        
1874
 
        this_info->enumValues[nmtokenID]=++(enumCount);
1875
 
        
1876
 
        ScanTo(NonWS);
1877
 
        while(AtChar<B::Base,'|'>(cur())){
1878
 
                Advance(1);
1879
 
                ScanTo(NonWS);  
1880
 
                int nmtokenID = Parse_Nmtoken();
1881
 
        
1882
 
                int enumID = this_info->enumValues[nmtokenID];
1883
 
                if(enumID==0){  
1884
 
                        this_info->enumValues[nmtokenID]=++(enumCount);
1885
 
                        enumID = enumCount;
1886
 
                }
1887
 
                else if(!StrictWellFormedness){
1888
 
                        Validity_Error(vErr_NoDuplicateTokens);
1889
 
                }
1890
 
                ScanTo(NonWS);
1891
 
        }
1892
 
        if (AtChar<B::Base,')'>(cur())) 
1893
 
                Advance(1);
1894
 
        else
1895
 
                Syntax_Error(NT_Enumeration);
1896
 
}
1897
 
 
1898
 
template <class B, WorkingCharacterSet W>
1899
 
void ParsingEngine<B, W>::Parse_DefaultDecl (ATT_info * this_info){
1900
 
        if(at_REQUIRED<B::Base>(cur())){
1901
 
                Advance(9);
1902
 
                this_info->defaultKind = REQUIRED_att;
1903
 
        }
1904
 
        else if(at_IMPLIED<B::Base>(cur())){
1905
 
                Advance(8);
1906
 
                this_info->defaultKind = IMPLIED_att;
1907
 
        }
1908
 
        else {
1909
 
                if(at_FIXED<B::Base>(cur())){
1910
 
                        Advance(6);
1911
 
                        requireWS();
1912
 
                        this_info->defaultKind = FIXED_att;
1913
 
                }
1914
 
                else this_info->defaultKind = DEFAULT_att;
1915
 
                if(AtQuote<B::Base>(cur())){
1916
 
                        int quot_start = AbsPos()+1;
1917
 
                        Parse_AttValue();
1918
 
                        /* need to normalize */
1919
 
                        this_info->defaultValueLgth = AbsPos() - quot_start - 1;
1920
 
                        
1921
 
                        this_info->defaultValue = new unsigned char[this_info->defaultValueLgth+1];
1922
 
                        memcpy(this_info->defaultValue, GetCodeUnitPtr(quot_start),this_info->defaultValueLgth); 
1923
 
                        this_info->defaultValue[this_info->defaultValueLgth] = '\0';
1924
 
                        }
1925
 
                else
1926
 
                        Syntax_Error(NT_DefaultDecl);
1927
 
        }
1928
 
}
1929
 
 
1930
 
template <class B, WorkingCharacterSet W>
1931
 
void ParsingEngine<B, W>::Parse_Entitydecl (){
1932
 
        
1933
 
        int name_start;
1934
 
        int quot_start;
1935
 
        int lgth;
1936
 
        int old_abspos;
1937
 
        char * s;
1938
 
        
1939
 
        Advance(6); /* Skip "ENTITY. */
1940
 
        requireWS();
1941
 
        
1942
 
        if (AtChar<B::Base,'%'>(cur())){
1943
 
                Advance(1);
1944
 
                requireWS();
1945
 
                
1946
 
                int nameID = Parse_Name();
1947
 
                PEntity_info * this_info = new PEntity_info;
1948
 
                int entityID = Parser_Interface<W>::model_info->GlobalPEntityTable[nameID];
1949
 
                if(entityID==0){        
1950
 
                        Parser_Interface<W>::model_info->GlobalPEntityTable[nameID]=++(Parser_Interface<W>::model_info->globalPEntityCount);
1951
 
                        entityID = Parser_Interface<W>::model_info->globalPEntityCount;
1952
 
                        this_info->globalPEntity_id = entityID;
1953
 
                }
1954
 
                else
1955
 
                        printf("Warning: Entity definition already exist!\n");
1956
 
        
1957
 
                requireWS();
1958
 
                if(AtQuote<B::Base>(cur())){
1959
 
                Parse_PEntityValue(this_info);
1960
 
                this_info->is_external = false;
1961
 
        }
1962
 
        else {
1963
 
                Parse_ExternalID(this_info->systemLiteral, this_info->pubidLiteral);
1964
 
                this_info->is_external = true;
1965
 
                if (this_info->systemLiteral == NULL) Syntax_Error(NT_EntityDecl);
1966
 
        }
1967
 
        Parser_Interface<W>::model_info->PEntityData.push_back(this_info);
1968
 
        }
1969
 
        else{
1970
 
                int nameID = Parse_Name();
1971
 
        
1972
 
                GEntity_info * this_info = new GEntity_info();
1973
 
                int entityID = Parser_Interface<W>::model_info->GlobalGEntityTable[nameID];
1974
 
                if(entityID==0){        
1975
 
                        Parser_Interface<W>::model_info->GlobalGEntityTable[nameID]=++(Parser_Interface<W>::model_info->globalGEntityCount);
1976
 
                        entityID = Parser_Interface<W>::model_info->globalGEntityCount;
1977
 
                        this_info->globalGEntity_id = entityID;
1978
 
                }
1979
 
                else
1980
 
                        printf("Warning: Entity definition already exists!\n");
1981
 
                        
1982
 
                requireWS();
1983
 
                
1984
 
                if(AtQuote<B::Base>(cur())){
1985
 
                Parse_GEntityValue(this_info);                  
1986
 
                this_info->is_external = false;
1987
 
        }
1988
 
        else {
1989
 
                Parse_ExternalID(this_info->systemLiteral, this_info->pubidLiteral);
1990
 
                this_info->is_external = true;
1991
 
                if (this_info->systemLiteral == NULL) Syntax_Error(NT_EntityDecl);
1992
 
                        old_abspos = AbsPos();
1993
 
                        ScanTo(NonWS);
1994
 
                if(at_NDATA<B::Base>(cur())){
1995
 
                        if(old_abspos == AbsPos())
1996
 
                                Syntax_Error(NT_EntityDecl);
1997
 
                        else
1998
 
                                Advance(5);
1999
 
                        requireWS();
2000
 
                        name_start = AbsPos();
2001
 
                        int nameID = Parse_Name();
2002
 
                        lgth = AbsPos() - name_start;
2003
 
                                this_info->NDataName = copy_string(GetCodeUnitPtr(name_start),lgth);
2004
 
                }
2005
 
                }
2006
 
        Parser_Interface<W>::model_info->GEntityData.push_back(this_info);
2007
 
        }
2008
 
        ScanTo(NonWS);
2009
 
        if (AtChar<B::Base,'>'>(cur())){
2010
 
                Advance(1);
2011
 
        }
2012
 
        else
2013
 
                Syntax_Error(NT_EntityDecl);
2014
 
}
2015
 
 
2016
 
template <class B, WorkingCharacterSet W>
2017
 
void ParsingEngine<B, W>::Parse_Notationdecl (){
2018
 
 
2019
 
        int old_abspos;
2020
 
        Advance(8); /* Skip "NOTATION. */
2021
 
        requireWS();
2022
 
        
2023
 
        int nameID = Parse_Name();
2024
 
 
2025
 
        int notationID = Parser_Interface<W>::model_info->GlobalNotationTable[nameID];
2026
 
        /* notationID == -1: used but not yet defined; == 0: new, > 0 prev. defined */
2027
 
        if(notationID <= 0){    
2028
 
                Parser_Interface<W>::model_info->GlobalNotationTable[nameID]=++(Parser_Interface<W>::model_info->globalNotationCount);
2029
 
                notationID = Parser_Interface<W>::model_info->globalNotationCount;
2030
 
        }
2031
 
        else /*Duplicate notation name!*/
2032
 
                Validity_Error(vErr_NoDuplicateTokens);
2033
 
        Notation_info * this_info = new Notation_info;
2034
 
        ScanTo(NonWS);          
2035
 
    Parse_ExternalID(this_info->systemLiteral, this_info->pubidLiteral);
2036
 
        ScanTo(NonWS);
2037
 
        if (AtChar<B::Base,'>'>(cur())) {
2038
 
                Advance(1);
2039
 
        }
2040
 
        else
2041
 
                Syntax_Error(NT_NotationDecl);
2042
 
}
2043
 
 
2044
 
template <class B, WorkingCharacterSet W>
2045
 
void ParsingEngine<B, W>::requireWS(){
2046
 
        
2047
 
    int old_abspos = AbsPos();  
2048
 
    ScanTo(NonWS);
2049
 
    if(old_abspos == AbsPos())
2050
 
        Syntax_Error(NT_S);
2051
 
}
2052
 
 
2053
 
template <class B, WorkingCharacterSet W>
2054
 
void ParsingEngine<B, W>::Parse_AttValue(){
2055
 
        
2056
 
        int     quoteCh = cur()[0];
2057
 
        Advance(1); /* Skip " or ' */
2058
 
 
2059
 
        ScanTo(Quote);                  
2060
 
        while (cur()[0] != quoteCh){
2061
 
                if (at_CharRef_Start<B::Base>(cur())){
2062
 
                        Parse_CharRef();
2063
 
                        ScanTo(Quote);
2064
 
                }
2065
 
                else if (AtChar<B::Base,'&'>(cur())){
2066
 
                        Parse_EntityRef();
2067
 
                        ScanTo(Quote);
2068
 
                }
2069
 
                else if (AtQuote<B::Base>(cur())) {
2070
 
                        Advance(1);
2071
 
                        ScanTo(Quote);
2072
 
                }
2073
 
                else /* if (AtChar<B::Base,'<'>(cur())) */
2074
 
                        WF_Error(wfErr_CleanAttrVals);
2075
 
        }
2076
 
        Advance(1);
2077
 
}
2078
 
 
2079
 
template <class B, WorkingCharacterSet W>
2080
 
void ParsingEngine<B, W>::Parse_GEntityValue(GEntity_info * this_info){
2081
 
        
2082
 
        int     quoteCh = cur()[0];
2083
 
        Advance(1); /* Skip " or ' */
2084
 
        this_info->is_simple = true;
2085
 
        int quot_start = AbsPos();
2086
 
        char * replText;
2087
 
        ScanTo(Quote);          
2088
 
        replText = copy_string(GetCodeUnitPtr(quot_start),AbsPos()-quot_start);
2089
 
        while (cur()[0] != quoteCh){
2090
 
                if (at_CharRef_Start<B::Base>(cur())){
2091
 
                        strcat (replText,Replace_CharRef());
2092
 
                        quot_start = AbsPos();
2093
 
                        ScanTo(Quote);
2094
 
                }
2095
 
                else if (AtQuote<B::Base>(cur())) {
2096
 
                        quot_start = AbsPos();
2097
 
                        Advance(1);
2098
 
                        ScanTo(Quote);
2099
 
                }
2100
 
                else if (at_EOF()) {
2101
 
                        Syntax_Error(NT_EntityValue);
2102
 
                }
2103
 
                else { /* '<' or '&' found */
2104
 
                        quot_start = AbsPos();
2105
 
                        Advance(1);
2106
 
                        ScanTo(Quote);
2107
 
                        this_info->is_simple = false;                   
2108
 
                }
2109
 
                replText = cat_string (replText,(char *)GetCodeUnitPtr(quot_start), strlen(replText), AbsPos()-quot_start);
2110
 
        }
2111
 
        this_info->ReplacementText = replText;
2112
 
        Advance(1);
2113
 
}
2114
 
 
2115
 
template <class B, WorkingCharacterSet W>
2116
 
char * ParsingEngine<B, W>::Replace_EntityRef(bool& is_simple){
2117
 
        Advance(1);
2118
 
        int nameID = Parse_Name(); 
2119
 
        if (AtChar<B::Base,';'>(cur()))
2120
 
                Advance(1);
2121
 
        else
2122
 
                Syntax_Error(NT_EntityValue);
2123
 
        int entityID = Parser_Interface<W>::model_info->GlobalGEntityTable[nameID]; 
2124
 
        if (entityID == 0)
2125
 
                WF_Error(wfErr_wf_entdeclared);
2126
 
        else{
2127
 
                if (Parser_Interface<W>::model_info->GEntityData[entityID-1]->is_simple == false)
2128
 
                        is_simple = false;
2129
 
                return Parser_Interface<W>::model_info->GEntityData[entityID-1]->ReplacementText;
2130
 
        }
2131
 
        
2132
 
}
2133
 
 
2134
 
template <class B, WorkingCharacterSet W>
2135
 
void ParsingEngine<B, W>::Parse_PEntityValue(PEntity_info * this_info){
2136
 
        fprintf(stderr,"parsing of parameter entity value has not been completed yet.\n");
2137
 
        exit(-1);
2138
 
}
2139
 
 
2140
 
template <class B, WorkingCharacterSet W>
2141
 
char * ParsingEngine<B, W>::Replace_CharRef(){
2142
 
        Advance(2);
2143
 
        fprintf(stderr,"Replacement of Character Reference has not been completed yet.\n");
2144
 
        exit(-1);
2145
 
}
2146
 
 
2147
 
template <class B, WorkingCharacterSet W>
2148
 
void ParsingEngine<B, W>::Parse_Prolog(){
2149
 
        ScanTo(NonWS);
2150
 
        int old_pos = AbsPos();
2151
 
        while (!at_DOCTYPE_start<B::Base>(cur())) {             
2152
 
                text_or_markup_start = AbsPos();
2153
 
                if (at_Comment_Start<B::Base>(cur())) 
2154
 
                        Parse_Comment();
2155
 
                else if (at_PI_Start<B::Base>(cur()))
2156
 
                                Parse_PI();
2157
 
                else{
2158
 
                        Prolog_action(GetCodeUnitPtr(old_pos), LengthFrom(old_pos));
2159
 
                        return;
2160
 
                }
2161
 
                ScanTo(NonWS);
2162
 
        }
2163
 
        Parse_DocType();
2164
 
        ScanTo(NonWS);
2165
 
        while(at_Comment_Start<B::Base>(cur()) || at_PI_Start<B::Base>(cur()) ){                
2166
 
                text_or_markup_start = AbsPos();
2167
 
                if (at_Comment_Start<B::Base>(cur()))
2168
 
                        Parse_Comment();
2169
 
                else 
2170
 
                        Parse_PI();
2171
 
                ScanTo(NonWS);
2172
 
        }
2173
 
        Prolog_action(GetCodeUnitPtr(old_pos), LengthFrom(old_pos));
2174
 
}
2175
 
 
2176
 
template <class B, WorkingCharacterSet W>
2177
 
void ParsingEngine<B, W>::Parse_ExtSubsetDecl() {
2178
 
        ScanTo(NonWS);
2179
 
        int start_pos=AbsPos();
2180
 
        while(!at_EOF()){
2181
 
                if(at_condSect_start<B::Base>(cur())){          
2182
 
                        Advance(3);
2183
 
                        ScanTo(NonWS);
2184
 
                        if (at_INCLUDE<B::Base>(cur())){
2185
 
                                Advance(7);
2186
 
                                ScanTo(NonWS);
2187
 
                                if(AtChar<B::Base,'['>(cur())){
2188
 
                                        Advance(1);
2189
 
                                        Parse_ExtSubsetDecl();
2190
 
                                        if(at_CDATA_End<B::Base>(cur()))
2191
 
                                                Advance(3);
2192
 
                                        else Syntax_Error(NT_includeSect);
2193
 
                                }
2194
 
                                else Syntax_Error(NT_includeSect);
2195
 
                        }
2196
 
                        else if (at_IGNORE<B::Base>(cur())){
2197
 
                                Advance(6);
2198
 
                                ScanTo(NonWS);          
2199
 
                                if(AtChar<B::Base,'['>(cur())){
2200
 
                                        int section_depth=1;
2201
 
                                        Advance(1);
2202
 
                                        while(!at_EOF()){
2203
 
                                                ScanTextTo(MarkupStart);
2204
 
                                                if(at_condSect_start<B::Base>(cur())){
2205
 
                                                        Advance(3);
2206
 
                                                        section_depth++;
2207
 
                                                }
2208
 
                                                else if(at_CDATA_End<B::Base>(cur())){
2209
 
                                                        Advance(3);
2210
 
                                                        section_depth--;
2211
 
                                                }
2212
 
                                                else
2213
 
                                                        Advance(1);
2214
 
                                                if(section_depth==0) return;                                    
2215
 
                                        }
2216
 
                                        Syntax_Error(NT_ignoreSectContents);    
2217
 
                                }
2218
 
                                else Syntax_Error(NT_ignoreSect);
2219
 
                        }
2220
 
                        else Syntax_Error(NT_conditionalSect);
2221
 
                }
2222
 
                else if (AtChar<B::Base,'%'>(cur()))
2223
 
                        Parse_PEReference();    
2224
 
                else if (at_PI_Start<B::Base>(cur())) {
2225
 
                        Parse_PI();
2226
 
                }
2227
 
                else if (at_Comment_Start<B::Base>(cur())) {
2228
 
                        Parse_Comment();
2229
 
                }
2230
 
                else if (AtChar<B::Base,'<'>(cur())){
2231
 
                        Advance(1);
2232
 
 
2233
 
                        if(AtChar<B::Base,'!'>(cur())){
2234
 
                                Advance(1);
2235
 
                                if(at_ELEMENT<B::Base>(cur()))
2236
 
                                        Parse_Elementdecl();
2237
 
                                else if(at_ATTLIST<B::Base>(cur()))
2238
 
                                        Parse_AttlistDecl();
2239
 
                                else if(at_ENTITY<B::Base>(cur()))
2240
 
                                        Parse_Entitydecl();
2241
 
                                else if(at_NOTATION<B::Base>(cur()))
2242
 
                                        Parse_Notationdecl();                                   
2243
 
                                else{
2244
 
                                        Syntax_Error(NT_markupdecl);    
2245
 
                                }                                                               
2246
 
                        }
2247
 
                        else
2248
 
                                Syntax_Error(NT_markupdecl); 
2249
 
                }
2250
 
                else
2251
 
                        Syntax_Error(NT_extSubsetDecl); 
2252
 
                ScanTo(NonWS);
2253
 
        }
2254
 
        ExtSubsetDecl_action(GetCodeUnitPtr(start_pos), LengthFrom(start_pos));
2255
 
}
2256
 
 
2257
 
/* Parse a valid start or empty element tag. */
2258
 
template <class B, WorkingCharacterSet W>
2259
 
inline int ParsingEngine<B, W>::Parse_ValidStartTag (bool& is_emptyStartTag){
2260
 
        int att_name_start;
2261
 
        int att_val_start;
2262
 
        int att_name_end, att_val_end;
2263
 
        unsigned char quoteCh;
2264
 
        Advance(1);
2265
 
 
2266
 
        int nameID = Parse_Name();  
2267
 
        int elemID = Parser_Interface<W>::model_info->GlobalElementTable[nameID];
2268
 
        if(elemID==0)
2269
 
                        Validity_Error(vErr_elementvalid);
2270
 
        
2271
 
        ElementName_action(GetCodeUnitPtr(text_or_markup_start+1), LengthFrom(text_or_markup_start+1));
2272
 
        /* The following test optimizes the most common case of a
2273
 
        start tag with no attributes.  */
2274
 
        if (AtChar<B::Base,'>'>(cur())) {
2275
 
                Advance(1);
2276
 
                StartTag_action(GetCodeUnitPtr(text_or_markup_start), LengthFrom(text_or_markup_start));
2277
 
        }
2278
 
        else {
2279
 
                ScanTo(NonWS);
2280
 
                if (AtChar<B::Base,'>'>(cur())) {
2281
 
                        Advance(1);
2282
 
                        StartTag_action(GetCodeUnitPtr(text_or_markup_start), LengthFrom(text_or_markup_start));
2283
 
                }
2284
 
                else if (at_EmptyElementDelim<B::Base>(cur())) {
2285
 
                        Advance(2);
2286
 
                        is_emptyStartTag = true;
2287
 
                        EmptyElement_action(GetCodeUnitPtr(text_or_markup_start), LengthFrom(text_or_markup_start));
2288
 
                }
2289
 
                else do {
2290
 
                        /* Must be an attribute-value pair or error. */
2291
 
                        att_name_start = AbsPos();
2292
 
                        int att_nameID = Parse_Name();
2293
 
                        #if (not defined(OMISSION)) or (OMISSION != ATTRIBUTE_UNIQUENESS) 
2294
 
                        int attID = Parser_Interface<W>::model_info->getOrInsertGlobalAttName(att_nameID);
2295
 
                        if (attID >= LastAttOccurrence.size()) LastAttOccurrence.push_back(0);
2296
 
                        else {
2297
 
                                if (LastAttOccurrence[attID] > text_or_markup_start) {
2298
 
                                        WF_Error(wfErr_uniqattspec); /* Duplicate attribute. */
2299
 
                                        break;
2300
 
                                }                       
2301
 
                        }
2302
 
                        LastAttOccurrence[attID] = att_name_start;
2303
 
                        #endif
2304
 
                        /* The following optimized tests handle the frequently occurring 
2305
 
                        case that there are no blanks on either side of the equals sign.
2306
 
                        In many cases, the very first test handles 100% of actual
2307
 
                        attribute-value pairs encountered. */
2308
 
                        if (at_EqualsQuote<B::Base>(cur())) Advance(1); 
2309
 
                        else {
2310
 
                                ScanTo(NonWS);
2311
 
                                if (!AtChar<B::Base,'='>(cur())) {
2312
 
                                        Syntax_Error(NT_STag); 
2313
 
                                        break;
2314
 
                                }
2315
 
                                Advance(1); 
2316
 
                                ScanTo(NonWS);
2317
 
                                if (!AtQuote<B::Base>(cur())) {
2318
 
                                        Syntax_Error(NT_STag); 
2319
 
                                        break;
2320
 
                                }
2321
 
                        }
2322
 
                        att_val_start = AbsPos()+1;
2323
 
                        Parse_AttValue();
2324
 
                        att_val_end = AbsPos()-1;
2325
 
                        if (at_xmlns<B::Base>(cur()+att_name_start-AbsPos())) {
2326
 
                                Namespace_action(GetCodeUnitPtr(att_name_start), att_name_end - att_name_start,
2327
 
                                                 GetCodeUnitPtr(att_val_start), att_val_end - att_val_start);
2328
 
                        }
2329
 
                        else {
2330
 
                                AttributeValue_action(GetCodeUnitPtr(att_name_start), att_name_end - att_name_start,
2331
 
                                                 GetCodeUnitPtr(att_val_start), att_val_end - att_val_start);
2332
 
                        }
2333
 
                        /* Now check for end or repeat. Avoid whitespace scan if possible.*/
2334
 
                        if (AtChar<B::Base,'>'>(cur())) {
2335
 
                                Advance(1);
2336
 
                                StartTag_action(GetCodeUnitPtr(text_or_markup_start), LengthFrom(text_or_markup_start));
2337
 
                                break;
2338
 
                        }
2339
 
                        else if (at_EmptyElementDelim<B::Base>(cur())) {
2340
 
                                Advance(2);
2341
 
                                is_emptyStartTag = true;        
2342
 
                                EmptyElement_action(GetCodeUnitPtr(text_or_markup_start), LengthFrom(text_or_markup_start));
2343
 
                                break;
2344
 
                        }
2345
 
                        ScanTo(NonWS);
2346
 
                        if (AtChar<B::Base,'>'>(cur())) {
2347
 
                                Advance(1);
2348
 
                                StartTag_action(GetCodeUnitPtr(text_or_markup_start), LengthFrom(text_or_markup_start));
2349
 
                                break;
2350
 
                        }
2351
 
                        else if (at_EmptyElementDelim<B::Base>(cur())) {
2352
 
                                Advance(2);
2353
 
                                is_emptyStartTag = true;
2354
 
                                EmptyElement_action(GetCodeUnitPtr(text_or_markup_start), LengthFrom(text_or_markup_start));
2355
 
                                break;
2356
 
                        }
2357
 
                        else if (AbsPos() == att_val_end + 1) { 
2358
 
                                /* No WS following att value */
2359
 
                                Syntax_Error(NT_STag);
2360
 
                                break;
2361
 
                        }
2362
 
                } while (1);
2363
 
        }
2364
 
        return nameID;
2365
 
}
2366
 
 
2367
 
template <class B, WorkingCharacterSet W>
2368
 
int ParsingEngine<B, W>::Parse_ValidElement() {
2369
 
        bool is_emptyStartTag = false;
2370
 
        int nameID = Parse_ValidStartTag(is_emptyStartTag);
2371
 
#ifdef DEBUG
2372
 
        printf("Parse_ValidElement: nameID = %d, name = %s, is_emptyStartTag=%i\n",nameID, Parser_Interface<W>::model_info->symbol_table->Get_UTF8_name(nameID), is_emptyStartTag);
2373
 
#endif
2374
 
        ContentModel * cm = Parser_Interface<W>::model_info->ContentModelData[nameID];
2375
 
        switch (cm->cm_type) {
2376
 
                case cm_Empty:
2377
 
                        if (!is_emptyStartTag) {
2378
 
                                if (at_EndTag_Start<B::Base>(cur())) {
2379
 
                                        Parse_WF_EndTag(nameID);
2380
 
                                }
2381
 
                                else {
2382
 
                                        Validity_Error(vErr_elementvalid);
2383
 
                                }
2384
 
                        }
2385
 
                        break;
2386
 
                case cm_Any:            
2387
 
                        if (!is_emptyStartTag) {
2388
 
                                Parse_AnyContent();
2389
 
                                Parse_WF_EndTag(nameID);
2390
 
                        }
2391
 
                        break;
2392
 
                case cm_Mixed:          
2393
 
                        if (!is_emptyStartTag) {
2394
 
                                Parse_MixedContent(((CM_Mixed *) cm)->elements);
2395
 
                                Parse_WF_EndTag(nameID);
2396
 
                        }
2397
 
                        break;
2398
 
                case cm_RegExp:
2399
 
                        CM_RegExp * cre = (CM_RegExp *) cm;
2400
 
                        int content_state = 0;
2401
 
                        if (!is_emptyStartTag) {
2402
 
                                Parse_ValidContent(cre, content_state);
2403
 
                                #ifdef DEBUG
2404
 
                                printf("Final content_state = %i, nameID = %i\n", content_state, nameID);
2405
 
                                #endif
2406
 
                                Parse_WF_EndTag(nameID);                
2407
 
                        }
2408
 
                        if (cre->transition_map[content_state][0]==0) {
2409
 
                                Validity_Error(vErr_elementvalid);
2410
 
                        }
2411
 
        }
2412
 
        return nameID;
2413
 
}
2414
 
 
2415
 
template <class B, WorkingCharacterSet W>
2416
 
void ParsingEngine<B, W>::Parse_ValidContent(CM_RegExp * cre, int & cur_state) {
2417
 
        do {
2418
 
                ScanTo(NonWS);
2419
 
                /* If non-null report WS  WS_action()? */
2420
 
                text_or_markup_start = AbsPos();
2421
 
                if (at_EndTag_Start<B::Base>(cur())) {
2422
 
                        break;
2423
 
                }
2424
 
                else if (at_ElementTag_Start<B::Base>(cur())) {
2425
 
                        int nameID = Parse_ValidElement();
2426
 
#ifdef DEBUG
2427
 
                        printf("Content model state transition %i", cur_state);
2428
 
#endif
2429
 
                        cur_state = cre->transition_map[cur_state][nameID];
2430
 
#ifdef DEBUG
2431
 
                        printf("-> %i\n", cur_state);
2432
 
#endif
2433
 
                }
2434
 
                else if (at_Comment_Start<B::Base>(cur())) {
2435
 
                        Parse_Comment();
2436
 
                }
2437
 
                else if (at_PI_Start<B::Base>(cur())) {
2438
 
                        Parse_PI();
2439
 
                }
2440
 
                else if (AtChar<B::Base,'&'>(cur())) {
2441
 
                        Parse_ValidEntityRef(cre, cur_state);
2442
 
#ifdef DEBUG
2443
 
                        printf("EntityRef complete, cur_state = %i\n", cur_state);
2444
 
#endif
2445
 
                        
2446
 
                }
2447
 
                else if (at_EOF()) {
2448
 
                        break;
2449
 
                }
2450
 
                else if (AtChar<B::Base,'<'>(cur())) {
2451
 
                        Syntax_Error(NT_markupdecl);
2452
 
                }
2453
 
                else {
2454
 
                        Validity_Error(vErr_elementvalid);
2455
 
                }
2456
 
        } while(1);
2457
 
}
2458
 
 
2459
 
 
2460
 
template <class B, WorkingCharacterSet W>
2461
 
void ParsingEngine<B, W>::Parse_AnyContent() {
2462
 
        do {
2463
 
                text_or_markup_start = AbsPos();
2464
 
                ScanTextTo(MarkupStart); /* '<', '&', or ']' for ']]>' test */
2465
 
                if (at_ElementTag_Start<B::Base>(cur())) {
2466
 
                        text_if_nonnull_action(false);
2467
 
                        int nameID = Parse_ValidElement();
2468
 
                }
2469
 
                else if (at_EndTag_Start<B::Base>(cur())) {
2470
 
                        text_if_nonnull_action(false);
2471
 
                        return;
2472
 
                }
2473
 
                else if (at_Comment_Start<B::Base>(cur())) {
2474
 
                        text_if_nonnull_action(false);
2475
 
                        Parse_Comment();
2476
 
                }
2477
 
                else if (at_CharRef_Start<B::Base>(cur())) {
2478
 
                        text_if_nonnull_action(true);
2479
 
                        Parse_CharRef();
2480
 
                }
2481
 
                else if (AtChar<B::Base,'&'>(cur())) {
2482
 
                        text_if_nonnull_action(true);
2483
 
                        Parse_EntityRef_inAnyContent();
2484
 
                }
2485
 
                else if (at_CDATA_Start<B::Base>(cur())) {
2486
 
                        text_if_nonnull_action(true);
2487
 
                        Parse_CDATA();
2488
 
                }
2489
 
                else if (at_PI_Start<B::Base>(cur())) {
2490
 
                        text_if_nonnull_action(false);
2491
 
                        Parse_PI();
2492
 
                }
2493
 
                else if (at_CDATA_End<B::Base>(cur())) {
2494
 
                        text_if_nonnull_action(true);
2495
 
                        Advance(3);
2496
 
                        Syntax_Error(NT_CharData);
2497
 
                }
2498
 
                else if (at_EOF()) {
2499
 
                        text_if_nonnull_action(false);
2500
 
                        return;
2501
 
                }
2502
 
                else if (AtChar<B::Base,'<'>(cur())) {
2503
 
                        Syntax_Error(NT_markupdecl);
2504
 
                }
2505
 
                else {
2506
 
                        Advance(1);
2507
 
                        continue;
2508
 
                }
2509
 
        } while (1);
2510
 
}
2511
 
template <class B, WorkingCharacterSet W>
2512
 
void ParsingEngine<B, W>::Parse_MixedContent(symbol_set_t elems) {
2513
 
        do {
2514
 
                text_or_markup_start = AbsPos();
2515
 
                ScanTextTo(MarkupStart); /* '<', '&', or ']' for ']]>' test */
2516
 
/*              if (AtChar<B::Base,'<'>(cur())) {
2517
 
                        text_if_nonnull_action();
2518
 
                        Parse_Markup<B, W>();
2519
 
                }*/
2520
 
                if (at_ElementTag_Start<B::Base>(cur())) {
2521
 
                        text_if_nonnull_action(false);
2522
 
                        int nameID = Parse_ValidElement();
2523
 
                        if (elems[nameID] == 0) {
2524
 
                                Validity_Error(vErr_elementvalid);
2525
 
                        }
2526
 
                }
2527
 
                else if (at_EndTag_Start<B::Base>(cur())) {
2528
 
                        text_if_nonnull_action(false);
2529
 
                        return;
2530
 
                }
2531
 
                else if (at_Comment_Start<B::Base>(cur())) {
2532
 
                        text_if_nonnull_action(false);
2533
 
                        Parse_Comment();
2534
 
                }
2535
 
                else if (at_CharRef_Start<B::Base>(cur())) {
2536
 
                        text_if_nonnull_action(true);
2537
 
                        Parse_CharRef();
2538
 
                }
2539
 
                else if (AtChar<B::Base,'&'>(cur())) {
2540
 
                        text_if_nonnull_action(true);
2541
 
                        Parse_EntityRef_inMixed(elems);
2542
 
                }
2543
 
                else if (at_CDATA_Start<B::Base>(cur())) {
2544
 
                        text_if_nonnull_action(true);
2545
 
                        Parse_CDATA();
2546
 
                }
2547
 
                else if (at_PI_Start<B::Base>(cur())) {
2548
 
                        text_if_nonnull_action(false);
2549
 
                        Parse_PI();
2550
 
                }
2551
 
                else if (at_CDATA_End<B::Base>(cur())) {
2552
 
                        text_if_nonnull_action(true);
2553
 
                        Advance(3);
2554
 
                        Syntax_Error(NT_CharData);
2555
 
                }
2556
 
                else if (at_EOF()) {
2557
 
                        text_if_nonnull_action(false);
2558
 
                        return;
2559
 
                }
2560
 
                else if (AtChar<B::Base,'<'>(cur())) {
2561
 
                        Syntax_Error(NT_markupdecl);
2562
 
                }
2563
 
                else {
2564
 
                        Advance(1);
2565
 
                        continue;
2566
 
                }
2567
 
        } while (1);
2568
 
}
2569
 
 
2570
 
 
2571
 
template <class B, WorkingCharacterSet W>
2572
 
int ParsingEngine<B, W>::Parse_Name() {
2573
 
        int name_pos = AbsPos();
2574
 
        ScanTo(NameFollow);
2575
 
        int lgth = AbsPos()-name_pos;
2576
 
        int nameID = Parser_Interface<W>::model_info->symbol_table->ASCII_Lookup_or_Insert_Name(&((char *) x8data)[buffer_rel_pos-lgth], lgth);
2577
 
        if (nameID != 0) return nameID;
2578
 
        else {
2579
 
                int u8_lgth = byteplex->UTF8_Length(name_pos, lgth);
2580
 
                char * u8_ptr = Parser_Interface<W>::model_info->symbol_table->ReserveSymbolSpace(u8_lgth);
2581
 
                byteplex->to_UTF8(name_pos, lgth, u8_ptr);
2582
 
                return Parser_Interface<W>::model_info->symbol_table->LookupOrInsertReserved();
2583
 
        }
2584
 
}
2585
 
 
2586
 
// template <>
2587
 
// int ParsingEngine< X8_Buffer<EBCDIC>, UTF_8 >::Parse_Name() {
2588
 
//      int name_pos = AbsPos();
2589
 
//      ScanTo(NameFollow);
2590
 
//      int lgth = AbsPos()-name_pos;
2591
 
// //   int nameID = local_EBCDIC_table->Lookup_or_Insert(GetCodeUnitPtr(name_pos), lgth);
2592
 
// //   if (nameID != 0) return nameID;
2593
 
// //   else {
2594
 
//              int u8_lgth = byteplex->UTF8_Length(name_pos, lgth);
2595
 
//              char * u8_ptr = Parser_Interface<UTF_8>::model_info->symbol_table->ReserveSymbolSpace(u8_lgth);
2596
 
//              byteplex->to_UTF8(name_pos, lgth, u8_ptr);
2597
 
//              return Parser_Interface<UTF_8>::model_info->symbol_table->LookupOrInsertReserved();
2598
 
// //   }
2599
 
// }
2600
 
 
2601
 
// template <WorkingCharacterSet W>
2602
 
// inline int ParsingEngine<UTF8_Buffer, W>::Parse_Name() {
2603
 
//      int name_pos = AbsPos();
2604
 
//      ScanTo(NameFollow);
2605
 
//      int lgth = AbsPos()-name_pos;
2606
 
//      return Parser_Interface<UTF_8>::model_info->symbol_table->UTF8_Lookup_or_Insert_Name(&((char *)x8data)[buffer_rel_pos-lgth], lgth);
2607
 
// }
2608
 
 
2609
 
template <>
2610
 
inline int ParsingEngine<UTF8_Buffer, UTF_8>::Parse_Name() {
2611
 
        int name_pos = AbsPos();
2612
 
        ScanTo(NameFollow);
2613
 
        int lgth = AbsPos()-name_pos;
2614
 
        return Parser_Interface<UTF_8>::model_info->symbol_table->UTF8_Lookup_or_Insert_Name(&((char *)x8data)[buffer_rel_pos-lgth], lgth);
2615
 
}
2616
 
 
2617
 
template <class B, WorkingCharacterSet W>
2618
 
int ParsingEngine<B, W>::Parse_Nmtoken() {
2619
 
        int name_pos = AbsPos();
2620
 
        ScanTo(NameFollow);
2621
 
        int lgth = AbsPos()-name_pos;
2622
 
        int nameID = Parser_Interface<W>::model_info->symbol_table->ASCII_Lookup_or_Insert_Nmtoken(&((char *) x8data)[buffer_rel_pos-lgth], lgth);
2623
 
        if (nameID != 0) return nameID;
2624
 
        else {
2625
 
                int u8_lgth = byteplex->UTF8_Length(name_pos, lgth);
2626
 
                char * u8_ptr = Parser_Interface<W>::model_info->symbol_table->ReserveSymbolSpace(u8_lgth);
2627
 
                byteplex->to_UTF8(name_pos, lgth, u8_ptr);
2628
 
                return Parser_Interface<W>::model_info->symbol_table->LookupOrInsertReserved_nmtoken();
2629
 
        }
2630
 
}
2631
 
 
2632
 
/*template <>
2633
 
int ParsingEngine< X8_Buffer<EBCDIC>, UTF_8 >::Parse_Nmtoken() {
2634
 
        int name_pos = AbsPos();
2635
 
        ScanTo(NameFollow);
2636
 
        int lgth = AbsPos()-name_pos;
2637
 
//      int nameID = local_EBCDIC_table->Lookup_or_Insert(GetCodeUnitPtr(name_pos), lgth);
2638
 
//      if (nameID != 0) return nameID;
2639
 
//      else {
2640
 
                int u8_lgth = byteplex->UTF8_Length(name_pos, lgth);
2641
 
                char * u8_ptr = Parser_Interface<UTF_8>::model_info->symbol_table->ReserveSymbolSpace(u8_lgth);
2642
 
                byteplex->to_UTF8(name_pos, lgth, u8_ptr);
2643
 
                return Parser_Interface<UTF_8>::model_info->symbol_table->LookupOrInsertReserved_nmtoken();
2644
 
//      }
2645
 
}*/
2646
 
// template <>
2647
 
// int ParsingEngine<UTF8_Buffer, UTF_8>::Parse_Nmtoken() {
2648
 
//      int name_pos = AbsPos();
2649
 
//      ScanTo(NameFollow);
2650
 
//      int lgth = AbsPos()-name_pos;
2651
 
//      return Parser_Interface<UTF_8>::model_info->symbol_table->UTF8_Lookup_or_Insert_Nmtoken(&((char *)x8data)[buffer_rel_pos-lgth], lgth);
2652
 
// }
2653
 
 
2654
 
template <class B, WorkingCharacterSet W>
2655
 
void ParsingEngine<B, W>::Parse_DocumentContent() {
2656
 
#if (VALIDATION_MODE == ON)
2657
 
        int cur_state = 0;
2658
 
        Parse_ValidContent(Parser_Interface<W>::model_info->rootModel, cur_state);
2659
 
        if (Parser_Interface<W>::model_info->rootModel->transition_map[cur_state][0]==0) {
2660
 
                Validity_Error(vErr_elementvalid);
2661
 
        }
2662
 
#endif
2663
 
#if (VALIDATION_MODE == OFF)
2664
 
        Parse_WF_Element();
2665
 
        ScanTo(NonWS);
2666
 
        while(at_Comment_Start<B::Base>(cur()) || at_PI_Start<B::Base>(cur()) ){
2667
 
                if (at_Comment_Start<B::Base>(cur()))
2668
 
                        Parse_Comment();
2669
 
                else 
2670
 
                        Parse_PI();
2671
 
                ScanTo(NonWS);
2672
 
        }
2673
 
        if (!at_EOF()) {
2674
 
                Syntax_Error(NT_document);
2675
 
        }       
2676
 
#endif
2677
 
        Parser_Interface<W>::DocumentEnd_action();      
2678
 
}
2679
 
 
2680
 
#ifdef MARKUP_PASS_CONTROL
2681
 
// Test routine as an alternative to MarkupPass.
2682
 
template <class B, WorkingCharacterSet W>
2683
 
void ParsingEngine<B, W>::ParseContent() {
2684
 
        int start_code = 0;
2685
 
        int end_code = 0;
2686
 
        int charref_code = 0;
2687
 
        int general_ref_code = 0;
2688
 
        DocumentStart_action(); 
2689
 
        bool is_emptyStartTag = false;
2690
 
        do {
2691
 
                text_or_markup_start = AbsPos();
2692
 
                ScanTo(MarkupStart); /* '<', '&', or ']' for 0b11']]>' test */
2693
 
/*              if (AtChar<B::Base,'<'>(cur())) {
2694
 
                        text_if_nonnull_action();
2695
 
                        Parse_Markup<B, W>();
2696
 
                }*/
2697
 
                if (at_EndTag_Start<B::Base>(cur())) {
2698
 
                        end_code |= AbsPos();
2699
 
                }
2700
 
                else if (AtChar<B::Base,'<'>(cur())) {
2701
 
                        start_code += AbsPos();
2702
 
                }
2703
 
                else if (at_CharRef_Start<B::Base>(cur())) {
2704
 
                        charref_code += 1;
2705
 
                }
2706
 
                else  if (AtChar<B::Base,'&'>(cur())) {
2707
 
                        general_ref_code += 1;
2708
 
                }
2709
 
                else if (at_EOF()) break;
2710
 
                Advance(1);
2711
 
        } while (1);
2712
 
        printf("Start_code: %i\n", start_code);
2713
 
        printf("End_code: %i\n", end_code);
2714
 
        printf("general_ref_code: %i\n", general_ref_code);
2715
 
        printf("charref_code: %i\n", charref_code);
2716
 
        DocumentEnd_action();   
2717
 
}
2718
 
#endif
2719
 
 
2720
 
#ifdef MARKUP_SORTING
2721
 
// Two-bit markup classification codes, read little-endian from the [&#/]
// stream (see GetBitPair): the low bit belongs to the markup-start position
// itself and the high bit to the position after it.
// NOTE(review): the per-code character patterns below are inferred from the
// enumerator names and the [&#/] description - confirm against the stream
// definition.
enum MarkupSortCodes {
  StartTagTwoBitCode = 0,  // neither position flagged         (e.g. "<x")
  GeneralRefCode = 1,      // only the first position flagged  (e.g. "&x")
  EndTagTwoBitCode = 2,    // only the second position flagged (e.g. "</")
  CharRefCode = 3          // both positions flagged           (e.g. "&#")
};
2728
 
 
2729
 
 
2730
 
static inline int GetBitPair(SIMD_type * stream, int bit_posn) {
2731
 
        return bitstream_segment_from(stream, bit_posn) & 3;
2732
 
}
2733
 
 
2734
 
template <class B, WorkingCharacterSet W>
2735
 
void ParsingEngine<B, W>::ParseContent() {
2736
 
/*vector<int> MarkupPositions[4];*/
2737
 
int MarkupPositions[4][BUFFER_SIZE];
2738
 
int MarkupCounts[4];
2739
 
        int start_code = 0;
2740
 
        int end_code = 0;
2741
 
        int charref_code = 0;
2742
 
        int general_ref_code = 0;
2743
 
 
2744
 
        DocumentStart_action(); 
2745
 
        bool is_emptyStartTag = false;
2746
 
                for (int i = 0; i < 4; i++) MarkupCounts[i] = 0;
2747
 
                text_or_markup_start = AbsPos();
2748
 
        do {
2749
 
                unsigned long segment = bitstream_segment_from(buf->item_stream[MarkupStart], buffer_rel_pos);
2750
 
//printf("buffer_rel_pos = %i, segment = %x\n", buffer_rel_pos, segment);
2751
 
                if (segment != 0) {
2752
 
                        buffer_rel_pos += cfzl(segment);
2753
 
                text_or_markup_start = AbsPos();
2754
 
                        int markup_code = GetBitPair(buf->item_stream[AmpHashSlash], buffer_rel_pos);
2755
 
                        MarkupPositions[markup_code][MarkupCounts[markup_code]] = AbsPos();
2756
 
                        MarkupCounts[markup_code]++;
2757
 
                        Advance(1);
2758
 
                }
2759
 
                else {
2760
 
                        buffer_rel_pos = (buffer_rel_pos & -8) + 8*sizeof(unsigned long);
2761
 
// printf("buffer_rel_pos = %i, segment = %x\n", buffer_rel_pos, segment);
2762
 
 
2763
 
                        if (buffer_rel_pos >= buffer_limit_pos) {
2764
 
/*                              for (int i = 0; i < MarkupCounts[StartTagTwoBitCode]; i++) {
2765
 
                                        start_code += MarkupPositions[StartTagTwoBitCode][i];
2766
 
                                }
2767
 
                                for (int i = 0; i < MarkupCounts[EndTagTwoBitCode]; i++) {
2768
 
                                        end_code |= MarkupPositions[EndTagTwoBitCode][i];
2769
 
                                }
2770
 
                                for (int i = 0; i < MarkupCounts[GeneralRefCode]; i++) {
2771
 
                                        general_ref_code += 1;
2772
 
                                }
2773
 
                                for (int i = 0; i < MarkupCounts[CharRefCode]; i++) {
2774
 
                                        charref_code += 1;
2775
 
                                }*/
2776
 
/*      printf("Start_code: %i\n", start_code);
2777
 
        printf("End_code: %i\n", end_code);
2778
 
        printf("general_ref_code: %i\n", general_ref_code);
2779
 
        printf("charref_code: %i\n", charref_code);*/
2780
 
                                for (int i = 0; i < 4; i++) MarkupCounts[i] = 0;
2781
 
                                if (buffer_rel_pos >= BUFFER_SIZE) {
2782
 
                                        AdjustBufferEndForIncompleteSequences();
2783
 
                                        Parser_Interface<W>::FinalizeBuffer_action();
2784
 
                                        AdvanceBuffers();
2785
 
                                }
2786
 
                                else break;
2787
 
                        }
2788
 
                        
2789
 
                }
2790
 
 
2791
 
        } while (1);
2792
 
/*      vector<int>::iterator i;
2793
 
        for (i = MarkupPositions[StartTagTwoBitCode].begin(); i != MarkupPositions[StartTagTwoBitCode].end(); i++) {
2794
 
                start_code += *i;
2795
 
        }
2796
 
        for (i = MarkupPositions[EndTagTwoBitCode].begin(); i != MarkupPositions[EndTagTwoBitCode].end(); i++) {
2797
 
                end_code |= *i;
2798
 
        }
2799
 
        for (i = MarkupPositions[GeneralRefCode].begin(); i != MarkupPositions[GeneralRefCode].end(); i++) {
2800
 
                general_ref_code += 1;
2801
 
        }
2802
 
        for (i = MarkupPositions[CharRefCode].begin(); i != MarkupPositions[CharRefCode].end(); i++) {
2803
 
                charref_code += 1;
2804
 
        }*/
2805
 
        printf("Start_code: %i\n", start_code);
2806
 
        printf("End_code: %i\n", end_code);
2807
 
        printf("general_ref_code: %i\n", general_ref_code);
2808
 
        printf("charref_code: %i\n", charref_code);
2809
 
        DocumentEnd_action();   
2810
 
}
2811
 
 
2812
 
#endif
2813
 
 
2814
 
 
2815
 
 
2816