Public Types | Static Public Attributes | List of all members
moot::TokenReaderExpat Class Reference

Experimental XML reader class using expat.

Inheritance diagram for moot::TokenReaderExpat:
Inheritance graph
[legend]
Collaboration diagram for moot::TokenReaderExpat:
Collaboration graph
[legend]

Public Types

enum  xmlNodeFlags {
  TRX_Default = 0x00000000, TRX_IsOuter = 0x00000001, TRX_IsRoot = 0x00000002, TRX_IsBodyE = 0x00000004,
  TRX_IsBodyD = 0x00000008, TRX_IsTokenE = 0x00000010, TRX_IsTokenD = 0x00000020, TRX_IsTokTextE = 0x00000040,
  TRX_IsTokTextD = 0x00000080, TRX_IsAnalysisE = 0x00000100, TRX_IsAnalysisD = 0x00000200, TRX_IsBestTagE = 0x00000400,
  TRX_IsBestTagD = 0x00000800, TRX_All = 0xffffffff
}
 
typedef slist< int > NodeInfoStack
 

Public Member Functions

Constructors and Such
 TokenReaderExpat (int fmt=tiofXML, size_t buflen=8192, const std::string &encoding="", const std::string &name="TokenReaderExpat")
 
virtual ~TokenReaderExpat (void)
 
virtual void reset (void)
 
TokenReader Overrides : Input Selection
virtual void reader_name (const std::string &myname)
 
virtual void close (void)
 
virtual void from_mstream (mootio::mistream *mistreamp)
 
virtual void from_mstream (mootio::mistream &mis)
 
virtual void from_filename (const char *filename)
 
virtual void from_file (FILE *infile)
 
virtual void from_fd (int fd)
 
virtual void from_buffer (const void *buf, size_t len)
 
virtual void from_cxxstream (std::istream &is)
 
TokenReader Overrides : Input
virtual mootTokenType get_token (void)
 
virtual mootTokenType get_sentence (void)
 
XML Utilities
bool ensure_cb_fullsents (void)
 
int next_node_info (int emptyStackValue=TRX_IsOuter, int inheritanceMask=defaultNodeInheritanceMask)
 
int top_node_info (int emptyStackValue=TRX_IsOuter)
 
void save_context (mootTokenType toktype=TokTypeXMLRaw, int info=0)
 
void save_context_data (const mootio::micbuffer &buf, mootTokenType toktype=TokTypeXMLRaw, int info=0)
 
void save_context_data (const char *text, size_t len, mootTokenType toktype=TokTypeXMLRaw, int info=0)
 
expat handlers
virtual void XmlDeclHandler (const XML_Char *version, const XML_Char *encoding, int standalone)
 
virtual void StartElementHandler (const char *el, const char **attr)
 
virtual void EndElementHandler (const char *el)
 
virtual void CharacterDataHandler (const XML_Char *s, int len)
 
virtual void CommentHandler (const XML_Char *s)
 
virtual void DefaultHandler (const XML_Char *s, int len)
 
Error reporting
virtual size_t line_number (void)
 
virtual size_t line_number (size_t n)
 
virtual size_t column_number (void)
 
virtual size_t column_number (size_t n)
 
virtual mootio::ByteOffset byte_number (void)
 
virtual mootio::ByteOffset byte_number (mootio::ByteOffset n)
 
virtual void carp (char *fmt,...)
 
- Public Member Functions inherited from moot::TokenReader
 TokenReader (int fmt=tiofUnknown, const std::string &name="TokenReader")
 
virtual ~TokenReader (void)
 
void tr_clear (void)
 
virtual void from_string (const char *s)
 
virtual bool opened (void)
 
mootToken * token (void)
 
mootSentence * sentence (void)
 
virtual mootio::ByteOffset byte_number (size_t n)
 
virtual void carp (const char *fmt,...)
 
- Public Member Functions inherited from moot::mootExpatParser
 mootExpatParser (size_t bufsize=8192, const std::string &encoding="")
 
virtual void setEncoding (const std::string &encoding="")
 
virtual ~mootExpatParser (void)
 
virtual void from_mstream (mootio::mistream *mistreamp, bool autoclose=false)
 
virtual void from_mstream (mootio::mistream &mistream, bool autoclose=false)
 
virtual void from_string (const char *s)
 
virtual bool parse_check (void)
 
bool parse_filename (const std::string &filename)
 
bool parse_file (FILE *infile=stdin, const std::string &in_name="")
 
bool parse_buffer (const char *buf, size_t buflen)
 
bool parse_all (mootio::mistream *in=__null)
 
bool parse_chunk (int &nbytes, int &is_final, mootio::mistream *in=__null)
 
virtual void context_dump (FILE *tofile=__null)
 
virtual std::string context_string (void)
 
virtual void carp (const char *fmt,...)
 
virtual void xpcarp (const char *fmt,...)
 
long int line_number (void)
 
int column_number (void)
 
long byte_offset (void)
 
int byte_count (void)
 
virtual void ProcessingInstructionHandler (const XML_Char *s, const XML_Char *target, const XML_Char *data)
 
virtual void StartCdataSectionHandler (void)
 
virtual void EndCdataSectionHandler (void)
 

Public Attributes

Search Parameters
bool save_raw_xml
 whether to store raw XML along with 'normal' tokens (default=false) More...
 
std::string body_elt
 Name of 'body' element. Default="" (tokenize everything) More...
 
std::string eos_elt
 Name of 'eos' (end-)element (sentence boundary). Default="eos". More...
 
std::string token_elt
 Name of 'token' element. Default="token". More...
 
std::string text_elt
 Name of 'text' element (descendant of 'token'). Default="text". More...
 
std::string analysis_elt
 Name of 'analysis' element (descendant of 'token'). Default="analysis". More...
 
std::string postag_attr
 Name of 'pos tag' attribute (of 'analysis' elt) Default="pos". More...
 
std::string besttag_elt
 Name of 'best tag' element (descendant of 'token'). Default="moot.tag". More...
 
std::string location_elt
 Name of 'location' element (descendant of 'token'). Default="moot.loc". More...
 
std::string offset_attr
 Name of 'location' element 'offset' attribute. Default="offset". More...
 
std::string length_attr
 Name of 'location' element 'length' attribute. Default="length". More...
 
Internal Data
NodeInfoStack stack
 Node-information stack. More...
 
int done
 true iff we've parsed doc to eof More...
 
mootSentence cb_nxtsent
 Sentence construction buffer for expat callbacks. More...
 
mootToken * cb_nxttok
 Construction buffer for tokens (points into nxtsent) More...
 
mootSentence cb_fullsents
 LONG buffer of fully parsed sentences (for expat callbacks) More...
 
mootSentence trx_sentbuf
 current output sentence buffer (for TokenReader interface) More...
 
- Public Attributes inherited from moot::TokenReader
int tr_format
 
std::string tr_name
 
mootio::mistream * tr_istream
 
bool tr_istream_created
 
mootToken * tr_token
 
mootSentence * tr_sentence
 
void * tr_data
 
- Public Attributes inherited from moot::mootExpatParser
mootio::mistream * xp_istream
 Current input stream. More...
 
bool xp_istream_created
 whether input stream mis was created locally More...
 
char * xml_buf
 Parse buffer for expat parser. More...
 
size_t xml_buflen
 Allocated size of xml_buf. More...
 
std::string xml_encoding
 Input encoding override (goofy) More...
 
XML_Parser parser
 The underlying expat parser object. More...
 

Static Public Attributes

static const int defaultNodeInheritanceMask = TRX_IsBodyD|TRX_IsTokenD|TRX_IsTokTextD|TRX_IsAnalysisD|TRX_IsBestTagD
 
- Static Public Attributes inherited from moot::TokenReader
static const size_t TR_DEFAULT_BUFSIZE = 256
 

Additional Inherited Members

- Static Public Member Functions inherited from moot::TokenIO
static int parse_format_string (const std::string &fmtString)
 
static int guess_filename_format (const char *filename)
 
static bool is_empty_format (int fmt)
 
static int sanitize_format (int fmt, int fmt_implied=tiofNone, int fmt_default=tiofNone)
 
static int parse_format_request (const char *request, const char *filename=__null, int fmt_implied=tiofNone, int fmt_default=tiofNone)
 
static std::string format_canonical_string (int fmt)
 
static class TokenReader * new_reader (int fmt)
 
static class TokenWriter * new_writer (int fmt)
 
static class TokenReader * file_reader (const char *filename, const char *fmt_request=__null, int fmt_implied=tiofNone, int fmt_default=tiofNone)
 
static class TokenWriter * file_writer (const char *filename, const char *fmt_request=__null, int fmt_implied=tiofNone, int fmt_default=tiofNone)
 
static size_t pipe_tokens (class TokenReader *reader, class TokenWriter *writer)
 
static size_t pipe_sentences (class TokenReader *reader, class TokenWriter *writer)
 
- Static Public Member Functions inherited from moot::mootExpatParser
static void _xp_XmlDeclHandler (mootExpatParser *mp, const XML_Char *version, const XML_Char *encoding, int standalone)
 
static void _xp_StartElementHandler (mootExpatParser *mp, const char *el, const char **attr)
 
static void _xp_EndElementHandler (mootExpatParser *mp, const char *el)
 
static void _xp_CharacterDataHandler (mootExpatParser *mp, const XML_Char *s, int len)
 
static void _xp_CommentHandler (mootExpatParser *mp, const XML_Char *s)
 
static void _xp_DefaultHandler (mootExpatParser *mp, const XML_Char *s, int len)
 

Member Typedef Documentation

◆ NodeInfoStack

Node information stack: each element is a bitmask of xmlNodeFlags

Member Enumeration Documentation

◆ xmlNodeFlags

Enum for parser node-information stack elements: use these constants to create bitmasks.

Enumerator
TRX_Default 

nothing special about this node

TRX_IsOuter 

extra-document data

TRX_IsRoot 

document root node

TRX_IsBodyE 

body node

TRX_IsBodyD 

(indirect) daughter of a body node

TRX_IsTokenE 

token node

TRX_IsTokenD 

(indirect) daughter of a token node

TRX_IsTokTextE 

token-text node

TRX_IsTokTextD 

(indirect) daughter of a token-text node

TRX_IsAnalysisE 

token-analysis node

TRX_IsAnalysisD 

(indirect) daughter of an analysis node

TRX_IsBestTagE 

'best tag' node

TRX_IsBestTagD 

(indirect) daughter of a 'best tag' node

TRX_All 

all possible flags (useful for masking)

Constructor & Destructor Documentation

◆ TokenReaderExpat()

moot::TokenReaderExpat::TokenReaderExpat ( int  fmt = tiofXML,
size_t  buflen = 8192,
const std::string &  encoding = "",
const std::string &  name = "TokenReaderExpat" 
)
inline

Default constructor:

Parameters
fmt: bitmask of moot::TokenIOFormatE flags
buflen: length of parse buffer for expat
encoding: override document encoding (broken?)
name: symbolic name for this reader, used for error reporting, etc.

References moot::tiofConserve.

◆ ~TokenReaderExpat()

virtual moot::TokenReaderExpat::~TokenReaderExpat ( void  )
inline virtual

Default destructor

Member Function Documentation

◆ reset()

virtual void moot::TokenReaderExpat::reset ( void  )
virtual

Reset parser state

Reimplemented from moot::mootExpatParser.

◆ reader_name()

virtual void moot::TokenReaderExpat::reader_name ( const std::string &  myname)
inline virtual

Declare subtype name to use for diagnostics.

Reimplemented from moot::TokenReader.

References moot::TokenReader::reader_name().

◆ close()

virtual void moot::TokenReaderExpat::close ( void  )
virtual

Close currently selected input source.

Reimplemented from moot::mootExpatParser.

◆ from_mstream() [1/2]

virtual void moot::TokenReaderExpat::from_mstream ( mootio::mistream *  mistreamp)
inline virtual

Select input from a mootio::mistream pointer. This is the basic case. Descendant classes may want to override this method.

Reimplemented from moot::TokenReader.

References moot::mootExpatParser::from_mstream(), and moot::TokenReader::from_mstream().

◆ from_mstream() [2/2]

virtual void moot::TokenReaderExpat::from_mstream ( mootio::mistream &  mis)
inline virtual

Select input from a mootio::mistream object, reference version. Default implementation just calls from_mstream(&mis).

Reimplemented from moot::TokenReader.

References moot::mootExpatParser::from_mstream(), and moot::TokenReader::from_mstream().

◆ from_filename()

virtual void moot::TokenReaderExpat::from_filename ( const char *  filename)
inline virtual

Select input from a named file. Descendants using named file input may override this method. The filename "-" may be used to specify stdin. Default implementation calls from_mstream().

Reimplemented from moot::mootExpatParser.

References moot::TokenReader::from_filename(), and moot::mootExpatParser::from_mstream().

◆ from_file()

virtual void moot::TokenReaderExpat::from_file ( FILE *  file)
inline virtual

Select input from a C stream. Caller is responsible for opening and closing the stream. Descendants using C stream input may override this method. Default implementation calls from_mstream().

Reimplemented from moot::mootExpatParser.

References moot::TokenReader::from_file(), and moot::mootExpatParser::from_mstream().

◆ from_fd()

virtual void moot::TokenReaderExpat::from_fd ( int  fd)
inline virtual

Select input from a file descriptor. Caller is responsible for opening and closing the stream. Descendants using file descriptor input may override this method. No default implementation.

Reimplemented from moot::mootExpatParser.

References moot::TokenReader::from_fd(), and moot::mootExpatParser::from_mstream().

◆ from_buffer()

virtual void moot::TokenReaderExpat::from_buffer ( const void *  buf,
size_t  len 
)
inline virtual

Select input from a C memory-buffer. Caller is responsible for allocation and de-allocation. Descendants using C memory-buffer input may override this method. Default implementation calls from_mstream().

Reimplemented from moot::mootExpatParser.

References moot::TokenReader::from_buffer(), and moot::mootExpatParser::from_mstream().

◆ from_cxxstream()

virtual void moot::TokenReaderExpat::from_cxxstream ( std::istream &  is)
inline virtual

Select input from a C++ stream. Caller is responsible for allocation and de-allocation. Descendants using C++ stream input may override this method. Default implementation calls from_mstream().

Reimplemented from moot::mootExpatParser.

References moot::TokenReader::from_cxxstream(), and moot::mootExpatParser::from_mstream().

◆ get_token()

virtual mootTokenType moot::TokenReaderExpat::get_token ( void  )
virtual

Get the next token from the buffer. On completion, current token (if any) is in *tr_token.

Warning
subsequent calls to get_token(), get_sentence() etc may invalidate the pointer

Reimplemented from moot::TokenReader.

◆ get_sentence()

virtual mootTokenType moot::TokenReaderExpat::get_sentence ( void  )
virtual

Read in next sentence. On completion, current sentence (if any) is in *tr_sentence.

Warning
subsequent calls to get_token(), get_sentence() etc may invalidate the pointer

Reimplemented from moot::TokenReader.

◆ ensure_cb_fullsents()

bool moot::TokenReaderExpat::ensure_cb_fullsents ( void  )

Ensure that there is some data in the callback sentence buffer, possibly parsing another chunk of the document. If more data is read, tr_token is reset to NULL.

Returns false iff no more data is available in cb_fullsents.

◆ next_node_info()

int moot::TokenReaderExpat::next_node_info ( int  emptyStackValue = TRX_IsOuter,
int  inheritanceMask = defaultNodeInheritanceMask 
)
inline

Predict node information for the next node by inheritance-masking

◆ top_node_info()

int moot::TokenReaderExpat::top_node_info ( int  emptyStackValue = TRX_IsOuter)
inline

Get node information for the parent node (top of the stack)

◆ save_context()

void moot::TokenReaderExpat::save_context ( mootTokenType  toktype = TokTypeXMLRaw,
int  info = 0 
)
inline

Save current parser context as a mootToken to the callback sentence buffer.

References moot::TokTypeXMLRaw.

◆ save_context_data() [1/2]

void moot::TokenReaderExpat::save_context_data ( const mootio::micbuffer &  buf,
mootTokenType  toktype = TokTypeXMLRaw,
int  info = 0 
)
inline

Save a mootToken to the callback sentence buffer, micbuffer version

References mootio::micbuffer::cb_offset, mootio::micbuffer::cb_rdata, mootio::micbuffer::cb_used, and moot::TokTypeXMLRaw.

◆ save_context_data() [2/2]

void moot::TokenReaderExpat::save_context_data ( const char *  text,
size_t  len,
mootTokenType  toktype = TokTypeXMLRaw,
int  info = 0 
)

Save a mootToken to the callback sentence buffer, string version

◆ XmlDeclHandler()

virtual void moot::TokenReaderExpat::XmlDeclHandler ( const XML_Char *  version,
const XML_Char *  encoding,
int  standalone 
)
virtual

Handle XML declarations

Reimplemented from moot::mootExpatParser.

◆ StartElementHandler()

virtual void moot::TokenReaderExpat::StartElementHandler ( const char *  el,
const char **  attr 
)
virtual

Handle start elements

Reimplemented from moot::mootExpatParser.

◆ EndElementHandler()

virtual void moot::TokenReaderExpat::EndElementHandler ( const char *  el)
virtual

Handle end elements

Reimplemented from moot::mootExpatParser.

◆ CharacterDataHandler()

virtual void moot::TokenReaderExpat::CharacterDataHandler ( const XML_Char *  s,
int  len 
)
virtual

Handle character data

Reimplemented from moot::mootExpatParser.

◆ CommentHandler()

virtual void moot::TokenReaderExpat::CommentHandler ( const XML_Char *  s)
virtual

Handle comments

Reimplemented from moot::mootExpatParser.

◆ DefaultHandler()

virtual void moot::TokenReaderExpat::DefaultHandler ( const XML_Char *  s,
int  len 
)
virtual

Handle any other document-internal data (no entity expansion!)

Reimplemented from moot::mootExpatParser.

◆ line_number() [1/2]

virtual size_t moot::TokenReaderExpat::line_number ( void  )
inline virtual

Get current line number.

Reimplemented from moot::TokenReader.

◆ line_number() [2/2]

virtual size_t moot::TokenReaderExpat::line_number ( size_t  n)
inline virtual

Set current line number – not implemented.

Reimplemented from moot::TokenReader.

References line_number().

Referenced by line_number().

◆ column_number() [1/2]

virtual size_t moot::TokenReaderExpat::column_number ( void  )
inline virtual

Get current column number.

Reimplemented from moot::TokenReader.

◆ column_number() [2/2]

virtual size_t moot::TokenReaderExpat::column_number ( size_t  n)
inline virtual

Set current column number.

Reimplemented from moot::TokenReader.

References column_number().

Referenced by column_number().

◆ byte_number() [1/2]

virtual mootio::ByteOffset moot::TokenReaderExpat::byte_number ( void  )
inline virtual

Get current byte number.

Reimplemented from moot::TokenReader.

◆ byte_number() [2/2]

virtual mootio::ByteOffset moot::TokenReaderExpat::byte_number ( mootio::ByteOffset  n)
inline virtual

Set current byte number.

References byte_number().

Referenced by byte_number().

◆ carp()

virtual void moot::TokenReaderExpat::carp ( char *  fmt,
  ... 
)
virtual

complain

Member Data Documentation

◆ defaultNodeInheritanceMask

const int moot::TokenReaderExpat::defaultNodeInheritanceMask = TRX_IsBodyD|TRX_IsTokenD|TRX_IsTokTextD|TRX_IsAnalysisD|TRX_IsBestTagD
static

Default node-inheritance flags

◆ save_raw_xml

bool moot::TokenReaderExpat::save_raw_xml

◆ body_elt

std::string moot::TokenReaderExpat::body_elt

◆ eos_elt

std::string moot::TokenReaderExpat::eos_elt

◆ token_elt

std::string moot::TokenReaderExpat::token_elt

◆ text_elt

std::string moot::TokenReaderExpat::text_elt

◆ analysis_elt

std::string moot::TokenReaderExpat::analysis_elt

◆ postag_attr

std::string moot::TokenReaderExpat::postag_attr

◆ besttag_elt

std::string moot::TokenReaderExpat::besttag_elt

◆ location_elt

std::string moot::TokenReaderExpat::location_elt

◆ offset_attr

std::string moot::TokenReaderExpat::offset_attr

◆ length_attr

std::string moot::TokenReaderExpat::length_attr

◆ stack

NodeInfoStack moot::TokenReaderExpat::stack

◆ done

int moot::TokenReaderExpat::done

◆ cb_nxtsent

mootSentence moot::TokenReaderExpat::cb_nxtsent

◆ cb_nxttok

mootToken* moot::TokenReaderExpat::cb_nxttok

◆ cb_fullsents

mootSentence moot::TokenReaderExpat::cb_fullsents

◆ trx_sentbuf

mootSentence moot::TokenReaderExpat::trx_sentbuf

The documentation for this class was generated from the following file: