1
/* engine.h - parabix parsing engine
2
Copyright (c) 2007, 2008 Robert D. Cameron and Dan Lin
3
Licensed to the public under the Open Software License 3.0.
4
Licensed to International Characters, Inc., under the Academic
11
/* Parabix provides strings to the application using the WorkingCharacterSet.
12
This parameter is set by template instantiation; normally only one
13
WorkingCharacterSet parameter is chosen. However, it is possible
14
to compile different parsers having different WorkingCharacterSet
15
parameters within one run-time code base.
18
// Selector for the character set in which strings are handed to the
// application; used as a non-type template parameter by the parser classes.
// Enumerator order (and therefore the underlying values 0, 1, 2) is preserved.
enum WorkingCharacterSet {
    UTF_8,
    UTF_16,
    UTF_32
};
24
#include "xml_error.h"
25
#include "contentmodel.h"
28
#define min(x,y) ((x) <(y) ?(x) :(y) )
29
/* A ParsingEngine is the principal class for parsing XML
32
// Parser_Interface<W>: abstract parser interface parameterized by the
// WorkingCharacterSet W.  It declares static factory functions, a set of
// pure virtual parsing routines that concrete engines must implement,
// a few document-property accessors, and the action routines listed below.
template <WorkingCharacterSet W>
33
class Parser_Interface {
35
// Virtual destructor: the class declares virtual functions and the factories
// below return Parser_Interface pointers.
virtual ~Parser_Interface();
36
// Factory taking a file name.
// NOTE(review): ownership of the returned pointer is not visible here.
static Parser_Interface * ParserFactory(const char * filename);
37
// Factory taking a file name together with a Model_Info.
static Parser_Interface * ParserFactory(const char * filename, Model_Info * m);
38
// Constructor for a subsidiary parser for internal entities.
39
// Takes a byte buffer and its byte count, plus Entity_Info and Model_Info.
static Parser_Interface * ParserFactory(const char * byte_buffer, int byte_count, Entity_Info * e, Model_Info * m);
40
// Pure virtual parsing routines; each must be supplied by a concrete parser.
virtual void ParseContent() = 0;
41
virtual void Parse_DocumentContent() = 0;
42
virtual void Parse_WF_Content() = 0;
43
virtual void Parse_AnyContent() = 0;
44
virtual void Parse_MixedContent(symbol_set_t elems) = 0;
45
// cur_state is passed by non-const reference, so the callee may update it.
virtual void Parse_ValidContent(CM_RegExp * cre, int & cur_state) = 0;
46
virtual bool at_EOF() const = 0;
47
virtual void Parse_ExtSubsetDecl() = 0;
48
virtual void Parse_Prolog() = 0;
49
// Non-virtual accessors.  NOTE(review): judging by their names these report
// byte-order-mark, version, standalone and encoding details of the input;
// confirm against the implementation.
bool has_ByteOrderMark();
50
XML_version get_version();
51
XML_standalone standalone_status();
52
bool has_EncodingDecl();
53
unsigned char * get_Encoding();
54
// Pointers to the model and entity information objects.
// NOTE(review): ownership/lifetime is not visible in this header.
Model_Info * model_info;
55
Entity_Info * entity_Info;
58
/* Action routine for document start. */
59
void DocumentStart_action();
61
/* Action routine for document end. */
62
void DocumentEnd_action();
64
/* Action routine for an XML comment in "<!--" "-->" brackets. */
65
void Comment_action(unsigned char * item, int lgth);
67
/* Action routine called upon recognizing "<![CDATA[" to start a CDATA section. */
68
void CDATA_start_action(unsigned char * CDATA_ptr);
70
/* Action routine called upon recognizing "]]>" to end a CDATA section. */
71
void CDATA_end_action(unsigned char * CDATA_end_ptr);
73
/* Action routine for an XML processing instruction enclosed in "<?" and "?>" brackets. */
74
void PI_action(unsigned char * item, int lgth);
76
/* Action routine for an empty element enclosed in "<" and "/>" brackets. */
77
void EmptyElement_action(unsigned char * item, int lgth);
79
/* Action routine for a start tag enclosed in "<" and ">" brackets. */
80
void StartTag_action(unsigned char * item, int lgth);
82
/* Action routine for an end tag enclosed in "</" and ">" brackets. */
83
void EndTag_action(unsigned char * item, int lgth);
85
/* Action routine for an error item. */
86
void Error_action(unsigned char * item, int lgth);
88
/* Action routine for a text item.
   NOTE(review): `more` presumably indicates that further text for the same
   item follows; confirm against the implementation. */
89
void Text_action(unsigned char * item, int lgth, bool more);
91
/* Action routine for a character or entity reference. */
92
void Reference_action(unsigned char * item, int lgth);
94
/* Action routine for an element name occurring immediately after the
95
opening "<" of a start tag or empty element tag. */
96
void ElementName_action(unsigned char * item, int lgth);
98
/* Action routine for a processing instruction target name occurring immediately
99
after the opening "<?" of a processing instruction. */
100
void PI_Target_action(unsigned char * item, int lgth);
102
/* Action routine for an individual attribute/value pair occurring in
103
an element start tag or an empty element tag. */
104
void AttributeValue_action(unsigned char * name, int name_lgth,
105
unsigned char * val, int val_lgth);
107
/* Action routine for an individual namespace binding occurring in
108
an element start tag or an empty element tag.
   NOTE(review): the int parameters are named *_end here, whereas
   AttributeValue_action uses *_lgth; confirm whether they are end
   positions or lengths. */
109
void Namespace_action(unsigned char * name, int name_end,
110
unsigned char * URI, int URI_end);
112
/* Action routine for end of buffer events. */
113
void FinalizeBuffer_action();
115
/* Document Type actions. */
116
void Doctype_action(unsigned char * item, int lgth);
117
void PEReference_action(unsigned char * item, int lgth);
119
void Prolog_action(unsigned char * item, int lgth);
121
void ExtSubsetDecl_action(unsigned char * item, int lgth);
125
template <class B, WorkingCharacterSet W>
126
class ParsingEngine : public Parser_Interface<W> {
128
ParsingEngine(Entity_Info * e, Model_Info * m, Byteplex * b, bool is_external);
129
virtual ~ParsingEngine();
132
bool StrictWellFormedness;
134
vector<int> LastAttOccurrence;
135
XML_Decl_Parser<B::Base> * decl_parser;
137
int text_or_markup_start;
138
/* Getters for current point/position information. */
140
int LengthFrom(int start_pos) const;
141
int BufferRelPos() const;
142
unsigned char * cur() const;
143
unsigned char * GetCodeUnitPtr(int pos);
145
bool at_EOF () const;
146
/*Internal helper for text action*/
147
void text_if_nonnull_action(bool more);
148
/* Mutators that advance the input. */
150
void ScanTo(int lex_item);
151
void ScanTextTo(int lex_item); // Specialized version.
152
void AdjustBufferEndForIncompleteSequences();
153
void AdvanceBuffers();
154
/* Parsing routines. */
156
void WF_Error (XML_Constraint errCode);
157
void Validity_Error (XML_Constraint errCode);
158
void Syntax_Error (XML_NonTerminal errNT);
160
void Parse_Comment ();
161
void Parse_StartTag ();
162
void Parse_EndTag ();
165
void Parse_CharRef ();
166
void Parse_EntityRef ();
167
void Parse_EntityRef_inMixed(symbol_set_t elems);
168
void Parse_EntityRef_inAnyContent();
170
/* Parsing routine for Document Type*/
171
void Parse_DocType ();
172
void Parse_ExternalID (char *& SystemLiteral, char *& PubidLiteral);
173
void Parse_SystemLiteral ();
174
void Parse_PubidLiteral ();
175
void Parse_IntSubset ();
176
void Parse_PEReference ();
177
void Parse_Elementdecl ();
178
ContentModel * Parse_RemainingMixed ();
179
Content_RE * Parse_RemainingChildren ();
180
Content_RE * Parse_Cp();
181
void Parse_AttlistDecl ();
182
void Parse_Notation (ATT_info * this_info);
183
void Parse_Enumeration (ATT_info * this_info);
184
void Parse_DefaultDecl (ATT_info * this_info);
185
void Parse_Entitydecl ();
186
void Parse_Notationdecl ();
188
void Parse_AttValue ();
189
void Parse_GEntityValue(GEntity_info * this_info);
190
void Parse_PEntityValue(PEntity_info * this_info);
191
char * Replace_EntityRef(bool& is_simple);
192
char * Replace_CharRef();
194
void Parse_DocumentContent();
196
void Parse_WF_Element();
197
void Parse_WF_Content();
198
int Parse_WF_StartTag(bool& is_empty);
199
void Parse_WF_EndTag(int nameID);
201
void Parse_ValidEntityRef(CM_RegExp * cre, int & cur_state);
202
int Parse_ValidElement();
203
void Parse_ValidContent(CM_RegExp * cre, int & cur_state);
204
void Parse_AnyContent();
205
void Parse_MixedContent(symbol_set_t elems);
207
int Parse_ValidStartTag(bool& is_empty);
212
/*Parsing routine for external entities*/
213
void Parse_ExtSubsetDecl ();
220
Lexer_Interface * lexer;
221
/* Parallel data streams for current buffer full of XML data. */
223
LexicalStreamSet * buf;
227
int buffer_limit_pos;