Experimental XML reader class using expat.
Public Types | |
enum | xmlNodeFlags { TRX_Default = 0x00000000, TRX_IsOuter = 0x00000001, TRX_IsRoot = 0x00000002, TRX_IsBodyE = 0x00000004, TRX_IsBodyD = 0x00000008, TRX_IsTokenE = 0x00000010, TRX_IsTokenD = 0x00000020, TRX_IsTokTextE = 0x00000040, TRX_IsTokTextD = 0x00000080, TRX_IsAnalysisE = 0x00000100, TRX_IsAnalysisD = 0x00000200, TRX_IsBestTagE = 0x00000400, TRX_IsBestTagD = 0x00000800, TRX_All = 0xffffffff } |
typedef slist< int > | NodeInfoStack |
Public Member Functions | |
Constructors and Such | |
TokenReaderExpat (int fmt=tiofXML, size_t buflen=8192, const std::string &encoding="", const std::string &name="TokenReaderExpat") | |
virtual | ~TokenReaderExpat (void) |
virtual void | reset (void) |
TokenReader Overrides : Input Selection | |
virtual void | reader_name (const std::string &myname) |
virtual void | close (void) |
virtual void | from_mstream (mootio::mistream *mistreamp) |
virtual void | from_mstream (mootio::mistream &mis) |
virtual void | from_filename (const char *filename) |
virtual void | from_file (FILE *infile) |
virtual void | from_fd (int fd) |
virtual void | from_buffer (const void *buf, size_t len) |
virtual void | from_cxxstream (std::istream &is) |
TokenReader Overrides : Input | |
virtual mootTokenType | get_token (void) |
virtual mootTokenType | get_sentence (void) |
XML Utilities | |
bool | ensure_cb_fullsents (void) |
int | next_node_info (int emptyStackValue=TRX_IsOuter, int inheritanceMask=defaultNodeInheritanceMask) |
int | top_node_info (int emptyStackValue=TRX_IsOuter) |
void | save_context (mootTokenType toktype=TokTypeXMLRaw, int info=0) |
void | save_context_data (const mootio::micbuffer &buf, mootTokenType toktype=TokTypeXMLRaw, int info=0) |
void | save_context_data (const char *text, size_t len, mootTokenType toktype=TokTypeXMLRaw, int info=0) |
expat handlers | |
virtual void | XmlDeclHandler (const XML_Char *version, const XML_Char *encoding, int standalone) |
virtual void | StartElementHandler (const char *el, const char **attr) |
virtual void | EndElementHandler (const char *el) |
virtual void | CharacterDataHandler (const XML_Char *s, int len) |
virtual void | CommentHandler (const XML_Char *s) |
virtual void | DefaultHandler (const XML_Char *s, int len) |
Error reporting | |
virtual size_t | line_number (void) |
virtual size_t | line_number (size_t n) |
virtual size_t | column_number (void) |
virtual size_t | column_number (size_t n) |
virtual mootio::ByteOffset | byte_number (void) |
virtual mootio::ByteOffset | byte_number (mootio::ByteOffset n) |
virtual void | carp (char *fmt,...) |
Public Member Functions inherited from moot::TokenReader | |
TokenReader (int fmt=tiofUnknown, const std::string &name="TokenReader") | |
virtual | ~TokenReader (void) |
void | tr_clear (void) |
virtual void | from_string (const char *s) |
virtual bool | opened (void) |
mootToken * | token (void) |
mootSentence * | sentence (void) |
virtual mootio::ByteOffset | byte_number (size_t n) |
virtual void | carp (const char *fmt,...) |
Public Member Functions inherited from moot::mootExpatParser | |
mootExpatParser (size_t bufsize=8192, const std::string &encoding="") | |
virtual void | setEncoding (const std::string &encoding="") |
virtual | ~mootExpatParser (void) |
virtual void | from_mstream (mootio::mistream *mistreamp, bool autoclose=false) |
virtual void | from_mstream (mootio::mistream &mistream, bool autoclose=false) |
virtual void | from_string (const char *s) |
virtual bool | parse_check (void) |
bool | parse_filename (const std::string &filename) |
bool | parse_file (FILE *infile=stdin, const std::string &in_name="") |
bool | parse_buffer (const char *buf, size_t buflen) |
bool | parse_all (mootio::mistream *in=__null) |
bool | parse_chunk (int &nbytes, int &is_final, mootio::mistream *in=__null) |
virtual void | context_dump (FILE *tofile=__null) |
virtual std::string | context_string (void) |
virtual void | carp (const char *fmt,...) |
virtual void | xpcarp (const char *fmt,...) |
long int | line_number (void) |
int | column_number (void) |
long | byte_offset (void) |
int | byte_count (void) |
virtual void | ProcessingInstructionHandler (const XML_Char *s, const XML_Char *target, const XML_Char *data) |
virtual void | StartCdataSectionHandler (void) |
virtual void | EndCdataSectionHandler (void) |
Public Attributes | |
Search Parameters | |
bool | save_raw_xml |
whether to store raw XML along with 'normal' tokens (default=false) More... | |
std::string | body_elt |
Name of 'body' element. Default="" (tokenize everything) More... | |
std::string | eos_elt |
Name of 'eos' (end-)element (sentence boundary). Default="eos". More... | |
std::string | token_elt |
Name of 'token' element. Default="token". More... | |
std::string | text_elt |
Name of 'text' element (descendant of 'token'). Default="text". More... | |
std::string | analysis_elt |
Name of 'analysis' element (descendant of 'token'). Default="analysis". More... | |
std::string | postag_attr |
Name of 'pos tag' attribute (of 'analysis' elt) Default="pos". More... | |
std::string | besttag_elt |
Name of 'best tag' element (descendant of 'token'). Default="moot.tag". More... | |
std::string | location_elt |
Name of 'location' element (descendant of 'token'). Default="moot.loc". More... | |
std::string | offset_attr |
Name of 'location' element 'offset' attribute. Default="offset". More... | |
std::string | length_attr |
Name of 'location' element 'length' attribute. Default="length". More... | |
Internal Data | |
NodeInfoStack | stack |
Node-information stack. More... | |
int | done |
True iff the document has been parsed to end-of-file. More... | |
mootSentence | cb_nxtsent |
Sentence construction buffer for expat callbacks. More... | |
mootToken * | cb_nxttok |
Construction buffer for tokens (points into nxtsent) More... | |
mootSentence | cb_fullsents |
LONG buffer of fully parsed sentences (for expat callbacks) More... | |
mootSentence | trx_sentbuf |
current output sentence buffer (for TokenReader interface) More... | |
Public Attributes inherited from moot::TokenReader | |
int | tr_format |
std::string | tr_name |
mootio::mistream * | tr_istream |
bool | tr_istream_created |
mootToken * | tr_token |
mootSentence * | tr_sentence |
void * | tr_data |
Public Attributes inherited from moot::mootExpatParser | |
mootio::mistream * | xp_istream |
Current input stream. More... | |
bool | xp_istream_created |
whether input stream mis was created locally More... | |
char * | xml_buf |
Parse buffer for expat parser. More... | |
size_t | xml_buflen |
Allocated size of xml_buf . More... | |
std::string | xml_encoding |
Input encoding override (goofy) More... | |
XML_Parser | parser |
The underlying expat parser object. More... | |
Static Public Attributes | |
static const int | defaultNodeInheritanceMask = TRX_IsBodyD|TRX_IsTokenD|TRX_IsTokTextD|TRX_IsAnalysisD|TRX_IsBestTagD |
Static Public Attributes inherited from moot::TokenReader | |
static const size_t | TR_DEFAULT_BUFSIZE = 256 |
Additional Inherited Members | |
Static Public Member Functions inherited from moot::TokenIO | |
static int | parse_format_string (const std::string &fmtString) |
static int | guess_filename_format (const char *filename) |
static bool | is_empty_format (int fmt) |
static int | sanitize_format (int fmt, int fmt_implied=tiofNone, int fmt_default=tiofNone) |
static int | parse_format_request (const char *request, const char *filename=__null, int fmt_implied=tiofNone, int fmt_default=tiofNone) |
static std::string | format_canonical_string (int fmt) |
static class TokenReader * | new_reader (int fmt) |
static class TokenWriter * | new_writer (int fmt) |
static class TokenReader * | file_reader (const char *filename, const char *fmt_request=__null, int fmt_implied=tiofNone, int fmt_default=tiofNone) |
static class TokenWriter * | file_writer (const char *filename, const char *fmt_request=__null, int fmt_implied=tiofNone, int fmt_default=tiofNone) |
static size_t | pipe_tokens (class TokenReader *reader, class TokenWriter *writer) |
static size_t | pipe_sentences (class TokenReader *reader, class TokenWriter *writer) |
Static Public Member Functions inherited from moot::mootExpatParser | |
static void | _xp_XmlDeclHandler (mootExpatParser *mp, const XML_Char *version, const XML_Char *encoding, int standalone) |
static void | _xp_StartElementHandler (mootExpatParser *mp, const char *el, const char **attr) |
static void | _xp_EndElementHandler (mootExpatParser *mp, const char *el) |
static void | _xp_CharacterDataHandler (mootExpatParser *mp, const XML_Char *s, int len) |
static void | _xp_CommentHandler (mootExpatParser *mp, const XML_Char *s) |
static void | _xp_DefaultHandler (mootExpatParser *mp, const XML_Char *s, int len) |
typedef slist<int> moot::TokenReaderExpat::NodeInfoStack |
Node information stack: each element is a bitmask of xmlNodeFlags
Enum for parser node-information stack elements: use these constants to create bitmasks.
|
inline |
Default constructor:
fmt | bitmask of moot::TokenIOFormatE flags |
buflen | length of parse buffer for expat |
encoding | override document encoding (broken?) |
name | symbolic name for this reader used for error reporting, etc. |
References moot::tiofConserve.
|
inline virtual |
Default destructor
|
virtual |
Reset parser state
Reimplemented from moot::mootExpatParser.
|
inline virtual |
Declare subtype name to use for diagnostics.
Reimplemented from moot::TokenReader.
References moot::TokenReader::reader_name().
|
virtual |
Close currently selected input source.
Reimplemented from moot::mootExpatParser.
|
inline virtual |
Select input from a mootio::mistream pointer. This is the basic case. Descendant classes may want to override this method.
Reimplemented from moot::TokenReader.
References moot::mootExpatParser::from_mstream(), and moot::TokenReader::from_mstream().
|
inline virtual |
Select input from a mootio::mistream object, reference version. Default implementation just calls from_mstream(&mis).
Reimplemented from moot::TokenReader.
References moot::mootExpatParser::from_mstream(), and moot::TokenReader::from_mstream().
|
inline virtual |
Select input from a named file. Descendants using named file input may override this method. The filename "-" may be used to specify stdin. Default implementation calls from_mstream().
Reimplemented from moot::mootExpatParser.
References moot::TokenReader::from_filename(), and moot::mootExpatParser::from_mstream().
|
inline virtual |
Select input from a C stream. Caller is responsible for opening and closing the stream. Descendants using C stream input may override this method. Default implementation calls from_mstream().
Reimplemented from moot::mootExpatParser.
References moot::TokenReader::from_file(), and moot::mootExpatParser::from_mstream().
|
inline virtual |
Select input from a file descriptor. Caller is responsible for opening and closing the stream. Descendants using file descriptor input may override this method. No default implementation.
Reimplemented from moot::mootExpatParser.
References moot::TokenReader::from_fd(), and moot::mootExpatParser::from_mstream().
|
inline virtual |
Select input from a C memory-buffer. Caller is responsible for allocation and de-allocation. Descendants using C memory-buffer input may override this method. Default implementation calls from_mstream().
Reimplemented from moot::mootExpatParser.
References moot::TokenReader::from_buffer(), and moot::mootExpatParser::from_mstream().
|
inline virtual |
Select input from a C++ stream. Caller is responsible for allocation and de-allocation. Descendants using C++ stream input may override this method. Default implementation calls from_mstream().
Reimplemented from moot::mootExpatParser.
References moot::TokenReader::from_cxxstream(), and moot::mootExpatParser::from_mstream().
|
virtual |
Get the next token from the buffer. On completion, current token (if any) is in *tr_token.
Reimplemented from moot::TokenReader.
|
virtual |
Read in next sentence. On completion, current sentence (if any) is in *tr_sentence.
Reimplemented from moot::TokenReader.
bool moot::TokenReaderExpat::ensure_cb_fullsents (void)
Ensure that there is some data in the callback sentence buffer, possibly parsing another chunk of the document. If more data is read, tr_token is reset to NULL.
Returns false iff no more data is available in cb_fullsents.
|
inline |
Predict node information for the next node by inheritance-masking
|
inline |
Get node information for the parent node (top of the stack)
|
inline |
Save current parser context as a mootToken to the callback sentence buffer.
References moot::TokTypeXMLRaw.
|
inline |
Save a mootToken to the callback sentence buffer, micbuffer version
References mootio::micbuffer::cb_offset, mootio::micbuffer::cb_rdata, mootio::micbuffer::cb_used, and moot::TokTypeXMLRaw.
void moot::TokenReaderExpat::save_context_data (const char *text, size_t len, mootTokenType toktype=TokTypeXMLRaw, int info=0)
Save a mootToken to the callback sentence buffer, string version
|
virtual |
Handle XML declarations
Reimplemented from moot::mootExpatParser.
|
virtual |
Handle start elements
Reimplemented from moot::mootExpatParser.
|
virtual |
Handle end elements
Reimplemented from moot::mootExpatParser.
|
virtual |
Handle character data
Reimplemented from moot::mootExpatParser.
|
virtual |
Handle comments
Reimplemented from moot::mootExpatParser.
|
virtual |
Handle any other document-internal data (no entity expansion!)
Reimplemented from moot::mootExpatParser.
|
inline virtual |
Get current line number.
Reimplemented from moot::TokenReader.
|
inline virtual |
Set current line number – not implemented.
Reimplemented from moot::TokenReader.
References line_number().
Referenced by line_number().
|
inline virtual |
Get current column number.
Reimplemented from moot::TokenReader.
|
inline virtual |
Set current column number.
Reimplemented from moot::TokenReader.
References column_number().
Referenced by column_number().
|
inline virtual |
Get current byte number.
Reimplemented from moot::TokenReader.
|
inline virtual |
|
virtual |
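Complain: report a diagnostic message. Takes a format string followed by variadic arguments (presumably printf-style).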
|
static |
Default node-inheritance flags
bool moot::TokenReaderExpat::save_raw_xml |
std::string moot::TokenReaderExpat::body_elt |
std::string moot::TokenReaderExpat::eos_elt |
std::string moot::TokenReaderExpat::token_elt |
std::string moot::TokenReaderExpat::text_elt |
std::string moot::TokenReaderExpat::analysis_elt |
std::string moot::TokenReaderExpat::postag_attr |
std::string moot::TokenReaderExpat::besttag_elt |
std::string moot::TokenReaderExpat::location_elt |
std::string moot::TokenReaderExpat::offset_attr |
std::string moot::TokenReaderExpat::length_attr |
NodeInfoStack moot::TokenReaderExpat::stack |
int moot::TokenReaderExpat::done |
mootSentence moot::TokenReaderExpat::cb_nxtsent |
mootToken* moot::TokenReaderExpat::cb_nxttok |
mootSentence moot::TokenReaderExpat::cb_fullsents |
mootSentence moot::TokenReaderExpat::trx_sentbuf |