/xmlbench/trunk

To get this branch, use:
bzr branch http://darksoft.org/webbzr/xmlbench/trunk
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
/*  engine.h - parabix parsing engine
    Copyright (c) 2007, 2008 Robert D. Cameron and Dan Lin
    Licensed to the public under the Open Software License 3.0.
    Licensed to International Characters, Inc., under the Academic
    Free License 3.0.

*/
#ifndef ENGINE_H
#define ENGINE_H

/*  Strings handed to the application by Parabix are expressed in the
    WorkingCharacterSet.  The value is fixed by template instantiation,
    and normally a single WorkingCharacterSet is chosen for a program.
    It is nevertheless possible to compile several parsers, each with a
    different WorkingCharacterSet parameter, into one run-time code base.

    The enumerators keep their implicit values (UTF_8 == 0, UTF_16 == 1,
    UTF_32 == 2).
*/

enum WorkingCharacterSet {
	UTF_8,
	UTF_16,
	UTF_32
};

#include <vector>

#include "xmlmodel.h"
#include "xmldecl.h"
#include "byteplex.h"
#include "bitlex.h"
#include "xml_error.h"
#include "contentmodel.h"
#include "symtab.h"

#define min(x,y) ((x) <(y) ?(x) :(y) )
/* Parser_Interface<W> is the abstract interface to an XML parser whose
   working character set is W.  It declares static factory functions,
   pure virtual parsing entry points, document queries and the action
   routines.  ParsingEngine, the principal class for parsing XML data,
   derives from it.  */

template <WorkingCharacterSet W>
class Parser_Interface {
public:
	virtual ~Parser_Interface();
	/* Factory functions returning a parser through the abstract interface.
	   The first two overloads take a file name, the second one together
	   with a Model_Info.
	   NOTE(review): ownership of the returned parser and of m is not
	   visible in this header -- confirm against the definitions. */
	static Parser_Interface * ParserFactory(const char * filename);
	static Parser_Interface * ParserFactory(const char * filename, Model_Info * m);
	// Constructor for a subsidiary parser for internal entities.
	static Parser_Interface * ParserFactory(const char * byte_buffer, int byte_count, Entity_Info * e, Model_Info * m);
	/* Pure virtual parsing entry points; every concrete parser must
	   implement all of them. */
	virtual void ParseContent() = 0;
	virtual void Parse_DocumentContent() = 0;
	virtual void Parse_WF_Content() = 0;
	virtual void Parse_AnyContent() = 0;
	virtual void Parse_MixedContent(symbol_set_t elems) = 0;
	/* cur_state is taken by non-const reference, so an implementation
	   may update the caller's variable. */
	virtual void Parse_ValidContent(CM_RegExp * cre, int & cur_state) = 0;
	virtual bool at_EOF() const = 0;
	virtual void Parse_ExtSubsetDecl() = 0;
	virtual void Parse_Prolog() = 0;
	/* Non-virtual queries about the document being parsed.
	   NOTE(review): presumably answered from entity_Info -- confirm in
	   the implementation. */
	bool has_ByteOrderMark();
	XML_version get_version();
	XML_standalone standalone_status();
	bool has_EncodingDecl();
	unsigned char * get_Encoding();
	/* Public data members; who owns the pointed-to objects is not
	   visible in this header. */
	Model_Info * model_info;
	Entity_Info * entity_Info;


	/* The action routines below are non-virtual member functions that are
	   declared here but not defined in this header.
	   NOTE(review): item/lgth presumably give the start of the recognized
	   item and its length -- confirm the units against the callers. */

	/* Action routine for document start. */
	void DocumentStart_action();

	/* Action routine for document end. */
	void DocumentEnd_action();

	/* Action routine for an XML comment in "<!--"  "-->" brackets. */
	void Comment_action(unsigned char * item, int lgth);

	/* Action routine called upon recognizing "<![CDATA[" to start a CDATA section. */
	void CDATA_start_action(unsigned char * CDATA_ptr);

	/* Action routine called upon recognizing "]]>" to end a CDATA section. */
	void CDATA_end_action(unsigned char * CDATA_end_ptr);

	/* Action routine for an XML processing instruction enclosed in "<?" and "?>" brackets. */
	void PI_action(unsigned char * item, int lgth);

	/* Action routine for an empty element enclosed in "<" and "/>" brackets. */
	void EmptyElement_action(unsigned char * item, int lgth);

	/* Action routine for a start tag enclosed in "<" and ">" brackets. */
	void StartTag_action(unsigned char * item, int lgth);

	/* Action routine for an end tag enclosed in "</" and ">" brackets. */
	void EndTag_action(unsigned char * item, int lgth);

	/* Action routine for an error item. */
	void Error_action(unsigned char * item, int lgth);

	/* Action routine for a text item.
	   NOTE(review): `more` presumably signals that further text belonging
	   to the same item follows -- confirm against the callers. */
	void Text_action(unsigned char * item, int lgth, bool more);

	/* Action routine for a character or entity reference. */
	void Reference_action(unsigned char * item, int lgth);

	/* Action routine for an element name occurring immediately after the
	opening "<" of a start tag or empty element tag. */
	void ElementName_action(unsigned char * item, int lgth);

	/* Action routine for a processing instruction target name occurring immediately
	after the opening "<?" of a processing instruction. */
	void PI_Target_action(unsigned char * item, int lgth);

	/* Action routine for an individual attribute/value pair occurring in
	an element start tag or an empty element tag. */
	void AttributeValue_action(unsigned char * name, int name_lgth,
				   unsigned char * val, int val_lgth);

	/* Action routine for an individual namespace binding occurring in
	an element start tag or an empty element tag.
	Note that the integer parameters here are named *_end, not *_lgth as
	in AttributeValue_action. */
	void Namespace_action(unsigned char * name, int name_end,
			      unsigned char * URI, int URI_end);

	/* Action routine for end of buffer events. */
	void FinalizeBuffer_action();

	/* Document type actions. */
	void Doctype_action(unsigned char * item, int lgth);
	void PEReference_action(unsigned char * item, int lgth);

	/* Action routine for the prolog. */
	void Prolog_action(unsigned char * item, int lgth);

	/* Action routine for an external subset declaration. */
	void ExtSubsetDecl_action(unsigned char * item, int lgth);

};

/* ParsingEngine<B, W> is the parser class template derived from
   Parser_Interface<W>.
     B - a type providing the member B::Base, which is used as the
         template argument of XML_Decl_Parser.
     W - the working character set, passed through to Parser_Interface.
   Several member functions below (ParseContent, at_EOF, Parse_Prolog,
   Parse_DocumentContent, Parse_WF_Content, Parse_ValidContent,
   Parse_AnyContent, Parse_MixedContent, Parse_ExtSubsetDecl) match pure
   virtual functions of Parser_Interface<W> and therefore override them,
   although they are not re-marked `virtual` here. */
template <class B, WorkingCharacterSet W>
class ParsingEngine : public Parser_Interface<W> {
public:
	/* NOTE(review): e, m and b presumably initialise entity_Info,
	   model_info and byteplex, and is_external presumably distinguishes
	   external from internal entities -- confirm against the constructor
	   definition. */
	ParsingEngine(Entity_Info * e, Model_Info * m, Byteplex * b, bool is_external);
	virtual ~ParsingEngine();
	/* Overrides Parser_Interface<W>::ParseContent. */
	void ParseContent();
protected:
	bool StrictWellFormedness;

	/* Qualified with std:: so that this header does not depend on a
	   using-directive leaking in from another header. */
	std::vector<int> LastAttOccurrence;
	XML_Decl_Parser<B::Base> * decl_parser;

	int text_or_markup_start;
	/* Getters for current point/position information. */
	int AbsPos() const;
	int LengthFrom(int start_pos) const;
	int BufferRelPos() const;
	unsigned char * cur() const;
	unsigned char * GetCodeUnitPtr(int pos);

	/* Overrides Parser_Interface<W>::at_EOF.  It is public in the base
	   class and protected here, so outside code reaches it through a
	   Parser_Interface pointer or reference. */
	bool at_EOF () const;
	/* Internal helper for text action. */
	void text_if_nonnull_action(bool more);
	/* Mutators that advance the input. */
	void Advance(int n);
	void ScanTo(int lex_item);
	void ScanTextTo(int lex_item);  // Specialized version.
	void AdjustBufferEndForIncompleteSequences();
	void AdvanceBuffers();

	/* Error reporting routines. */
	void WF_Error (XML_Constraint errCode);
	void Validity_Error (XML_Constraint errCode);
	void Syntax_Error (XML_NonTerminal errNT);

	/* Parsing routines. */
	void Parse_Comment ();
	void Parse_StartTag ();
	void Parse_EndTag ();
	void Parse_CDATA ();
	void Parse_PI ();
	void Parse_CharRef ();
	void Parse_EntityRef ();
	void Parse_EntityRef_inMixed(symbol_set_t elems);
	void Parse_EntityRef_inAnyContent();

	/* Parsing routines for the document type declaration. */
	void Parse_DocType ();
	/* Both literals are passed by reference to pointer, so the routine
	   can set the caller's pointers.
	   NOTE(review): ownership of the resulting strings is not visible
	   here -- confirm against the definition. */
	void Parse_ExternalID (char *& SystemLiteral, char *& PubidLiteral);
	void Parse_SystemLiteral ();
	void Parse_PubidLiteral ();
	void Parse_IntSubset ();
	void Parse_PEReference ();
	void Parse_Elementdecl ();
	ContentModel * Parse_RemainingMixed ();
	Content_RE * Parse_RemainingChildren ();
	Content_RE * Parse_Cp();
	void Parse_AttlistDecl ();
	void Parse_Notation (ATT_info * this_info);
	void Parse_Enumeration (ATT_info * this_info);
	void Parse_DefaultDecl (ATT_info * this_info);
	void Parse_Entitydecl ();
	void Parse_Notationdecl ();
	void requireWS ();
	void Parse_AttValue ();
	void Parse_GEntityValue(GEntity_info * this_info);
	void Parse_PEntityValue(PEntity_info * this_info);
	/* is_simple is an output flag written through the reference.
	   NOTE(review): its exact meaning and the ownership of the returned
	   buffers are not visible in this header. */
	char * Replace_EntityRef(bool& is_simple);
	char * Replace_CharRef();
	/* Override Parser_Interface<W>::Parse_Prolog and
	   Parser_Interface<W>::Parse_DocumentContent. */
	void Parse_Prolog();
	void Parse_DocumentContent();

	/* Well-formedness parsing.  Parse_WF_Content overrides the
	   Parser_Interface<W> function of the same name. */
	void Parse_WF_Element();
	void Parse_WF_Content();
	/* is_empty is an output flag written through the reference.
	   NOTE(review): the int result is presumably the name ID later
	   passed to Parse_WF_EndTag -- confirm against the definitions. */
	int Parse_WF_StartTag(bool& is_empty);
	void Parse_WF_EndTag(int nameID);

	/* Validating parsing.  Parse_ValidContent, Parse_AnyContent and
	   Parse_MixedContent override the Parser_Interface<W> functions of
	   the same names; cur_state is taken by non-const reference. */
	void Parse_ValidEntityRef(CM_RegExp * cre, int & cur_state);
	int Parse_ValidElement();
	void Parse_ValidContent(CM_RegExp * cre, int & cur_state);
	void Parse_AnyContent();
	void Parse_MixedContent(symbol_set_t elems);

	int Parse_ValidStartTag(bool& is_empty);

	int Parse_Nmtoken();
	int Parse_Name();

	/* Parsing routine for external entities; overrides
	   Parser_Interface<W>::Parse_ExtSubsetDecl. */
	void Parse_ExtSubsetDecl ();

protected:
	/* Co-classes */

	Byteplex * byteplex;
	Bitplex * bitplex;
	Lexer_Interface * lexer;
	/* Parallel data streams for current buffer full of XML data. */
	BytePack * x8data;
	LexicalStreamSet * buf;

	/* Buffer position bookkeeping.
	   NOTE(review): presumably the absolute position of the buffer start,
	   the current position relative to it, and the limit of valid data --
	   confirm against the implementation. */
	int buffer_base_pos;
	int buffer_rel_pos;
	int buffer_limit_pos;
};


#endif