/xmlbench/trunk : revision 2

To get this branch, use:

bzr branch
http://darksoft.org/webbzr/xmlbench/trunk

« back to all changes in this revision

Viewing changes to parse/parabix.20090922/src/byteplex.h

Committer: Suren A. Chilingaryan
Date: 2009-09-23 17:13:04 UTC
Revision ID: csa@dside.dyndns.org-20090923171304-osvtr4zqb29h11kd

Intel, Tango, Phobos, and RapidXML parsers; Memory benchmark scripts

files added:
create/intel_dom2

create/intel_dom2.java

parse/intel_dom2

parse/intel_dom2.java

parse/intel_sax2

parse/intel_sax2.java

parse/parabix.20090922

parse/parabix.20090922/lib

parse/parabix.20090922/lib/altivec_simd.h

parse/parabix.20090922/lib/lib_simd.h

parse/parabix.20090922/lib/sse_simd.h

parse/parabix.20090922/lib/sse_simd_t.h

parse/parabix.20090922/lib/stdint.h

parse/parabix.20090922/src

parse/parabix.20090922/src/bitlex.c

parse/parabix.20090922/src/bitlex.h

parse/parabix.20090922/src/bitplex.c

parse/parabix.20090922/src/bitplex.h

parse/parabix.20090922/src/bytelex.h

parse/parabix.20090922/src/byteplex.c

parse/parabix.20090922/src/byteplex.h

parse/parabix.20090922/src/charsets

parse/parabix.20090922/src/charsets/ASCII_EBCDIC.h

parse/parabix.20090922/src/charsets/ASCII_EBCDIC.py

parse/parabix.20090922/src/contentmodel.c

parse/parabix.20090922/src/contentmodel.h

parse/parabix.20090922/src/engine.c

parse/parabix.20090922/src/engine.h

parse/parabix.20090922/src/multiliteral.h

parse/parabix.20090922/src/namechars.h

parse/parabix.20090922/src/stringpool.h

parse/parabix.20090922/src/symtab.c

parse/parabix.20090922/src/symtab.h

parse/parabix.20090922/src/xml_chars.py

parse/parabix.20090922/src/xml_error.c

parse/parabix.20090922/src/xml_error.h

parse/parabix.20090922/src/xmldecl.c

parse/parabix.20090922/src/xmldecl.h

parse/parabix.20090922/src/xmlmodel.c

parse/parabix.20090922/src/xmlmodel.h

parse/phobos-dom.d

parse/phobos-sax.d

parse/rapidxml.cpp

parse/tango-dom.d

parse/tango-pull.d

parse/tango-sax.d

parse/tools.c

parse/tools.d

results.memory

results.memory/xsl.odt.expat-sablotron-dom.dummy

results.memory/xsl.odt.expat-sablotron-dom.parse

results.memory/xsl.odt.expat-sablotron-dom.real

results.memory/xsl.odt.intel-dom.dummy

results.memory/xsl.odt.intel-dom.parse

results.memory/xsl.odt.intel-dom.real

results.memory/xsl.odt.libxml.dummy

results.memory/xsl.odt.libxml.parse

results.memory/xsl.odt.libxml.real

results.memory/xsl.odt.oracle-dom.dummy

results.memory/xsl.odt.oracle-dom.parse

results.memory/xsl.odt.oracle-dom.real

results.memory/xsl.odt.sun_dom2.dummy

results.memory/xsl.odt.sun_dom2.parse

results.memory/xsl.odt.sun_dom2.real

results.memory/xsl.odt.xerces-dom.dummy

results.memory/xsl.odt.xerces-dom.parse

results.memory/xsl.odt.xerces-dom.real

results.memory/xsl.rdf.expat-sablotron-dom.dummy

results.memory/xsl.rdf.expat-sablotron-dom.parse

results.memory/xsl.rdf.expat-sablotron-dom.real

results.memory/xsl.rdf.intel-dom.dummy

results.memory/xsl.rdf.intel-dom.parse

results.memory/xsl.rdf.intel-dom.real

results.memory/xsl.rdf.libxml.dummy

results.memory/xsl.rdf.libxml.parse

results.memory/xsl.rdf.libxml.real

results.memory/xsl.rdf.oracle-dom.dummy

results.memory/xsl.rdf.oracle-dom.parse

results.memory/xsl.rdf.oracle-dom.real

results.memory/xsl.rdf.sun_dom2.dummy

results.memory/xsl.rdf.sun_dom2.parse

results.memory/xsl.rdf.sun_dom2.real

results.memory/xsl.rdf.xerces-dom.dummy

results.memory/xsl.rdf.xerces-dom.parse

results.memory/xsl.rdf.xerces-dom.real

results.memory/xsl.xmlgen4.expat-sablotron-dom.dummy

results.memory/xsl.xmlgen4.expat-sablotron-dom.parse

results.memory/xsl.xmlgen4.expat-sablotron-dom.real

results.memory/xsl.xmlgen4.intel-dom.dummy

results.memory/xsl.xmlgen4.intel-dom.parse

results.memory/xsl.xmlgen4.intel-dom.real

results.memory/xsl.xmlgen4.libxml.dummy

results.memory/xsl.xmlgen4.libxml.parse

results.memory/xsl.xmlgen4.libxml.real

results.memory/xsl.xmlgen4.oracle-dom.dummy

results.memory/xsl.xmlgen4.oracle-dom.parse

results.memory/xsl.xmlgen4.oracle-dom.real

results.memory/xsl.xmlgen4.sun_dom2.dummy

results.memory/xsl.xmlgen4.sun_dom2.parse

results.memory/xsl.xmlgen4.sun_dom2.real

results.memory/xsl.xmlgen4.xerces-dom.dummy

results.memory/xsl.xmlgen4.xerces-dom.parse

results.memory/xsl.xmlgen4.xerces-dom.real

results.memory/xsl.xmlgen64M.expat-sablotron-dom.dummy

results.memory/xsl.xmlgen64M.expat-sablotron-dom.parse

results.memory/xsl.xmlgen64M.expat-sablotron-dom.real

results.memory/xsl.xmlgen64M.intel-dom.dummy

results.memory/xsl.xmlgen64M.intel-dom.parse

results.memory/xsl.xmlgen64M.intel-dom.real

results.memory/xsl.xmlgen64M.libxml.dummy

results.memory/xsl.xmlgen64M.libxml.parse

results.memory/xsl.xmlgen64M.libxml.real

results.memory/xsl.xmlgen64M.oracle-dom.dummy

results.memory/xsl.xmlgen64M.oracle-dom.parse

results.memory/xsl.xmlgen64M.oracle-dom.real

results.memory/xsl.xmlgen64M.sun_dom2.dummy

results.memory/xsl.xmlgen64M.sun_dom2.parse

results.memory/xsl.xmlgen64M.sun_dom2.real

results.memory/xsl.xmlgen64M.xerces-dom.dummy

results.memory/xsl.xmlgen64M.xerces-dom.parse

results.memory/xsl.xmlgen64M.xerces-dom.real

results.memsum

results.memsum/parse.xmlgen256.xerces-dom.dummy

results.memsum/parse.xmlgen256.xerces-dom.real

results.memsum/xsl.xmlgen256.libxml.dummy

results.memsum/xsl.xmlgen256.libxml.parse

results.memsum/xsl.xmlgen256.libxml.real

results.memsum/xsl.xmlgen256.xerces-dom.dummy

results.memsum/xsl.xmlgen256.xerces-dom.parse

results.memsum/xsl.xmlgen256.xerces-dom.real

results.memsum/xsl.xmlgen8192.libxml.dummy

results.memsum/xsl.xmlgen8192.libxml.parse

results.memsum/xsl.xmlgen8192.libxml.real

runbench.memory

runbench.memsum

sample.results/2009.02.16/gcc/results.tcmalloc

validate/intel_dom2

validate/intel_dom2.java

xsl/intel_dom2

xsl/intel_dom2.java

files removed:
create/intel-dom

parse/asmxml

parse/intel-dom

parse/intel-sax

parse/parabix

parse/parabix.20090211

parse/parabix.20090211/lib

parse/parabix.20090211/lib/altivec_simd.h

parse/parabix.20090211/lib/lib_simd.h

parse/parabix.20090211/lib/sse_simd.h

parse/parabix.20090211/lib/sse_simd_t.h

parse/parabix.20090211/lib/stdint.h

parse/parabix.20090211/src

parse/parabix.20090211/src/bitlex.c

parse/parabix.20090211/src/bitlex.h

parse/parabix.20090211/src/bitplex.c

parse/parabix.20090211/src/bitplex.h

parse/parabix.20090211/src/bytelex.h

parse/parabix.20090211/src/byteplex.c

parse/parabix.20090211/src/byteplex.h

parse/parabix.20090211/src/charsets

parse/parabix.20090211/src/charsets/ASCII_EBCDIC.h

parse/parabix.20090211/src/contentmodel.c

parse/parabix.20090211/src/contentmodel.h

parse/parabix.20090211/src/engine.c

parse/parabix.20090211/src/engine.h

parse/parabix.20090211/src/multiliteral.h

parse/parabix.20090211/src/namechars.h

parse/parabix.20090211/src/stringpool.h

parse/parabix.20090211/src/symtab.c

parse/parabix.20090211/src/symtab.h

parse/parabix.20090211/src/xml_chars.py

parse/parabix.20090211/src/xml_error.c

parse/parabix.20090211/src/xml_error.h

parse/parabix.20090211/src/xmldecl.c

parse/parabix.20090211/src/xmldecl.h

parse/parabix.20090211/src/xmlmodel.c

parse/parabix.20090211/src/xmlmodel.h

validate/intel-dom

validate/xerces-sax.cpp

xsl/html.intel

xsl/html.intel.lint

xsl/html.libxml

xsl/html.libxml.lint

xsl/intel-dom

xsl/xerces-sax.cpp

files modified:
.bzrignore

Makefile.in

README

ToDo

create/Makefile

create/expat-cslxml-dom.cpp

create/expat-sablotron-dom.cpp

create/intel-dom.cpp

create/libxml-dom.c

create/libxml.c

create/oracle-dom.c

create/oracle-dom.cpp

create/oracle_dom2

create/qt-dom.cpp

create/sun_dom2

create/sun_dom2.java

create/tools.h

create/xerces-dom.cpp

parse/Makefile

parse/oracle_dom2

parse/oracle_sax2

parse/sun_dom2

parse/sun_dom2.java

parse/sun_sax2

parse/sun_sax2.java

parse/tools.h

parse/xerces-dom.cpp

runbench.1

runbench.compile

security/libxml.c

security/libxml1.c

security/sun_dom2

security/tools.h

security/xerces-dom.cpp

validate/Makefile

validate/bench.java

validate/intel-dom.cpp

validate/libxml.c

validate/sun_dom2

validate/sun_dom2.java

validate/tools.h

validate/tools2.h

validate/xerces-dom.cpp

xmlgen/xmlgen.h

xsl/Makefile

xsl/bench.java

xsl/expat-sablotron-dom.cpp

xsl/intel-dom.cpp

xsl/libxml.c

xsl/oracle-dom.c

xsl/sun_dom2

xsl/tools.h

xsl/tools2.h

xsl/xerces-dom.cpp

Show diffs side-by-side

added added

removed removed

parse/parabix.20090922/src/byteplex.h

/* byteplex.h - Parallel byte stream module.

Licensed to the public under the Open Software License 3.0.

Licensed to International Characters, Inc., under the Academic

Free License 3.0.

This module has as its goal the buffering of XML byte data and

transformation of 16-bit and 32-bit code unit data so that the

parsing engine is provided a uniform representation based on

the concept of an 8-bit pseudo-ASCII representation (x8data).

A Byteplex object provides buffers for one to six parallel data

streams for an XML input entity, depending on the size of

character code units.

1. In the case of 8-bit code units, a single byte stream

consisting of unmodified input data is maintained.

x8data = src_buffer

2. In the case of 16-bit code units (UTF-16 and UCS-2 families),

(a) the original code unit stream is maintained unmodified,

(b) the x16hi byte stream is established for the high byte

of each code unit,

(c) the x16lo byte stream is established for the low byte

of each code unit, and

(d) x8data is established as the pseudo-ASCII byte stream,

with ASCII code units having their proper 8-bit values,

and all others having bit 0 set to 1.

3. In the case of 32-bit code units (UTF-32 family),

(a) the original code unit stream is maintained unmodified,

(b) the x32hh byte stream has high bytes of each code unit

(c) the x32hl byte stream has second bytes of each code unit

(d) the x32lh byte stream has third bytes of each code unit

(e) the x32ll byte stream has low bytes of each code unit, and

(f) x8data is established as the pseudo-ASCII byte stream,

with ASCII code units having their proper 8-bit values,

and all others having bit 0 set to 1.

The pseudo-ASCII representation is defined for both ASCII-based

and EBCDIC-based character sets such that all characters in

the ASCII repertoire (i.e., having Unicode code points from 0x00

to 0x7F), are represented as themselves and no non-ASCII character

is represented as a character in the ASCII repertoire.

*/

#ifndef BYTEPLEX_H

#define BYTEPLEX_H

#include "xmldecl.h"

#include "../lib/lib_simd.h"

/* The BytePack and the BitBlock are the two fundamental

types used by the parabix program for data held in

SIMD registers, representing, respectively, the byte-oriented

and bit-oriented views of character data.*/

/* Byte-oriented view of one SIMD_type value. */

typedef SIMD_type BytePack;

/* Bit-oriented view of one SIMD_type value. */

typedef SIMD_type BitBlock;

/* Size of one SIMD_type value in bytes. */

const int PACKSIZE = sizeof(SIMD_type);

/* Size of one SIMD_type value in bits (8 bits per byte). */

const int BLOCKSIZE = sizeof(SIMD_type) * 8;

/* Define the size of buffer used for lexical analysis/parsing. */

/* Number of BitBlocks per buffer.
   NOTE(review): the rationale for the value 781 is not stated in this
   header - confirm before changing it. */

const int BUFFER_BLOCKS = 781;

/* Buffer size in positions: BUFFER_BLOCKS blocks of BLOCKSIZE positions. */

const int BUFFER_SIZE = BUFFER_BLOCKS * BLOCKSIZE;

/* When working near the end of a buffer, a bytespace test may involve

a multibyte literal. The bytespace buffer must always make available

a number of lookahead bytes at least equal to the maximum length of any

such literal. */

const int LOOKAHEAD_POSITIONS = 16;

/* Buffer size including the lookahead positions required above. */

const int BYTEPLEX_SIZE = BUFFER_SIZE + LOOKAHEAD_POSITIONS;

/* Byteplex - abstract base class of the byteplexers declared in this
   header (X8_Buffer, UTF8_Buffer, U16_Buffer, U32_Buffer and their
   subclasses).  Every operation except the destructor and the static
   factories is pure virtual, so only concrete subclasses can be
   instantiated. */

class Byteplex {

public:

virtual ~Byteplex();

/* Factory overloads returning a Byteplex pointer for entity e.
   NOTE(review): presumably the concrete subclass is selected from the
   code unit information in e, and the returned object is owned by the
   caller - confirm against byteplex.c. */

static Byteplex * ByteplexFactory(Entity_Info * e);

/* As above, additionally given the input file to read from. */

static Byteplex * ByteplexFactory(Entity_Info * e, FILE * inputfile);

/* As above, additionally given an in-memory buffer of buffer_size bytes. */

static Byteplex * ByteplexFactory(Entity_Info * e, unsigned char * buffer_bytes, int buffer_size);

/* Byteplexing step; implemented by each concrete buffer class. */

virtual void DoByteplex() = 0;

/* Pseudo-ASCII preparation step (see x8data below). */

virtual void PreparePseudoASCII_Stream() = 0;

/* NOTE(review): presumably loads lgth bytes starting at src as the
   initial buffer contents - confirm against byteplex.c. */

virtual void InitializeBuffer(unsigned char * src, int lgth) = 0;

/* NOTE(review): presumably advances the buffer by advance_amt code
   units - confirm the unit against byteplex.c. */

virtual void AdvanceInputBuffer(int advance_amt) = 0;

/* NOTE(review): judging by the names, UTF8_Length reports the UTF-8
   length of the lgth code units at name_pos and to_UTF8 writes that
   encoding to u8_ptr - confirm against byteplex.c. */

virtual int UTF8_Length(int name_pos, int lgth)=0;

virtual void to_UTF8(int name_pos, int lgth, char * u8_ptr)=0;

/* Source code unit buffer. */

BytePack * src_buffer;

/* NOTE(review): presumably the number of code units currently held in
   src_buffer - confirm. */

int units_in_buffer;

/* Pseudo-ASCII stream. */

BytePack * x8data;

protected:

/* Input file handle (cf. the FILE * factory overload). */

FILE * infile;

/* NOTE(review): presumably the number of BytePacks in use - confirm. */

int packs_in_buffer;

/* NOTE(review): semantics are not visible in this header; the names
   suggest copying lgth bytes and then reading up to bytes_to_read more
   from infile - confirm against byteplex.c. */

int CopyAndFill(unsigned char * bytes_to_copy, int lgth, int bytes_to_read);

void Set_limits(int units_in_buffer);

};

100

101

/* The X8_Buffer template class is used for either ASCII- or EBCDIC-

102

based 8-bit code units.

103

The X8_Buffer<ASCII> class includes 7-bit ASCII

104

(with high-order bit 0), the ISO-8859 character sets and UTF-8.

105

106

The family of 8-bit EBCDIC based character sets are processed using

107

the X8_Buffer<EBCDIC> class.

108

109

110

template <CodeUnit_Base C>

111

class X8_Buffer : public Byteplex {

112

public:

113

static const CodeUnit_Base Base = C;

114

static const CodeUnit_Size Size = SingleByte;

115

X8_Buffer();

116

virtual ~X8_Buffer();

117

118

void DoByteplex();

119

void PreparePseudoASCII_Stream();

120

void AdvanceInputBuffer(int advance_amt);

121

void InitializeBuffer(unsigned char * src, int lgth);

122

int UTF8_Length(int name_pos, int lgth);

123

void to_UTF8(int name_pos, int lgth, char * u8_ptr);

124

};

125

126

class UTF8_Buffer : public Byteplex {

127

public:

128

static const CodeUnit_Base Base = ASCII;

129

static const CodeUnit_Size Size = SingleByte;

130

UTF8_Buffer();

131

virtual ~UTF8_Buffer();

132

133

void DoByteplex();

134

void PreparePseudoASCII_Stream();

135

void AdvanceInputBuffer(int advance_amt);

136

void InitializeBuffer(unsigned char * src, int lgth);

137

int UTF8_Length(int name_pos, int lgth);

138

void to_UTF8(int name_pos, int lgth, char * u8_ptr);

139

};

140

141

142

/* UTF-16 and UCS-2 character set families in BE and LE byte orders.

143

The U16LE and U16BE subclasses each provide a distinct byteplexer to

144

produce 2 parallel byte streams for the high and low bytes of each

145

16-bit code unit. Once byteplexing is complete, a generic pseudoASCII

146

conversion routine can be applied at the U16_Buffer level. */

147

148

class U16_Buffer : public Byteplex {

149

public:

150

static const CodeUnit_Base Base = ASCII;

151

static const CodeUnit_Size Size = DoubleByte;

152

U16_Buffer();

153

virtual ~U16_Buffer();

154

virtual void DoByteplex() = 0;

155

void PreparePseudoASCII_Stream();

156

void AdvanceInputBuffer(int advance_amt);

157

void Validate_UTF16();

158

void Validate_UCS2();

159

void InitializeBuffer(unsigned char * src, int lgth);

160

int UTF8_Length(int name_pos, int lgth);

161

void to_UTF8(int name_pos, int lgth, char * u8_ptr);

162

protected:

163

BytePack * x16hi;

164

BytePack * x16lo;

165

};

166

167

/* U16_Buffer variant for little-endian (LE) byte order; supplies the
   DoByteplex byteplexer that U16_Buffer leaves pure virtual. */
class U16LE_Buffer : public U16_Buffer {
public:
    U16LE_Buffer();
    void DoByteplex();
};

172

173

/* U16_Buffer variant for big-endian (BE) byte order; supplies the
   DoByteplex byteplexer that U16_Buffer leaves pure virtual. */
class U16BE_Buffer : public U16_Buffer {
public:
    U16BE_Buffer();
    void DoByteplex();
};

178

179

180

/* UTF-32/UCS-4 character sets in BE, LE, 2143 and 3412 byte orders.

181

Each subclass of U32_Buffer provide a distinct byteplexer to

182

produce the 4 parallel byte streams of Unicode data. Once

183

byteplexing is complete, a generic pseudoASCII routine can

184

be applied. */

185

class U32_Buffer : public Byteplex {

186

public:

187

static const CodeUnit_Base Base = ASCII;

188

static const CodeUnit_Size Size = QuadByte;

189

U32_Buffer();

190

virtual ~U32_Buffer();

191

virtual void DoByteplex() = 0;

192

void PreparePseudoASCII_Stream();

193

void AdvanceInputBuffer(int advance_amt);

194

void Validate_UTF32();

195

void InitializeBuffer(unsigned char * src, int lgth);

196

int UTF8_Length(int name_pos, int lgth);

197

void to_UTF8(int name_pos, int lgth, char * u8_ptr);

198

protected:

199

BytePack * x32hh;

200

BytePack * x32hl;

201

BytePack * x32lh;

202

BytePack * x32ll;

203

};

204

205

/* U32_Buffer variant for little-endian (LE) byte order; supplies the
   DoByteplex byteplexer that U32_Buffer leaves pure virtual. */
class U32LE_Buffer : public U32_Buffer {
public:
    U32LE_Buffer();
    void DoByteplex();
};

210

211

/* U32_Buffer variant for big-endian (BE) byte order; supplies the
   DoByteplex byteplexer that U32_Buffer leaves pure virtual. */
class U32BE_Buffer : public U32_Buffer {
public:
    U32BE_Buffer();
    void DoByteplex();
};

216

217

/* U32_Buffer variant for the unusual 2143 byte order; supplies the
   DoByteplex byteplexer that U32_Buffer leaves pure virtual. */
class U32_2143_Buffer : public U32_Buffer {
public:
    U32_2143_Buffer();
    void DoByteplex();
};

222

223

/* U32_Buffer variant for the unusual 3412 byte order; supplies the
   DoByteplex byteplexer that U32_Buffer leaves pure virtual. */
class U32_3412_Buffer : public U32_Buffer {
public:
    U32_3412_Buffer();
    void DoByteplex();
};

228

229

230

/* copy_name - return a freshly allocated, NUL-terminated copy of the
   first lgth bytes of s.

   s     source bytes; need not be NUL-terminated, but at least lgth
         bytes must be readable.  Not dereferenced when lgth <= 0.
   lgth  number of bytes to copy.  Values <= 0 yield an empty string;
         previously a negative value was converted to a huge size_t by
         memcpy and overran the (under-sized) allocation.

   Returns an array obtained from new char[]; the caller owns it and
   must release it with delete [].
*/
inline char * copy_name (char * s, int lgth){
    const int n = lgth > 0 ? lgth : 0;
    char * d = new char[n + 1];
    /* Skip the copy for empty input so s is never touched in that case. */
    if (n > 0) {
        memcpy(d, s, n);
    }
    d[n] = '\0';
    return d;
}

236

237

#endif

238

239

Older »