mootTokenExpatIO.h
Go to the documentation of this file.
1 /* -*- Mode: C++ -*- */
2 
3 /*
4  libmoot : moocow's part-of-speech tagging library
5  Copyright (C) 2003-2013 by Bryan Jurish <moocow@cpan.org>
6 
7  This library is free software; you can redistribute it and/or
8  modify it under the terms of the GNU Lesser General Public
9  License as published by the Free Software Foundation; either
10  version 3 of the License, or (at your option) any later version.
11 
12  This library is distributed in the hope that it will be useful,
13  but WITHOUT ANY WARRANTY; without even the implied warranty of
14  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15  Lesser General Public License for more details.
16 
17  You should have received a copy of the GNU Lesser General Public
18  License along with this library; if not, write to the Free Software
19  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
20 */
21 
22 /*--------------------------------------------------------------------------
23  * File: mootTokenExpatIO.h
24  * Author: Bryan Jurish <moocow@cpan.org>
25  * Description:
26  * + moocow's PoS tagger : token I/O : XML: Expat
27  *--------------------------------------------------------------------------*/
28 
34 #ifndef _MOOT_TOKEN_EXPAT_IO_H
35 #define _MOOT_TOKEN_EXPAT_IO_H
36 
37 #include <mootConfig.h>
38 
39 #ifdef MOOT_EXPAT_ENABLED
40 
41 #include <mootTokenIO.h>
42 #include <mootExpatParser.h>
43 #include <mootRecode.h>
44 
45 #ifdef __GNUC__
46 # include <ext/slist>
47 #else
48 # include <slist>
49 #endif
50 
51 /*#define MOOT_DEBUG_EXPAT*/
52 
53 moot_BEGIN_NAMESPACE
54 
55 using namespace std;
56 
57 /*======================================================================
58  * mootTokenExpatIO
59  *======================================================================*/
60 
61 /*--------------------------------------------------------------------------
62  * TokenReaderExpat
63  */
65 class TokenReaderExpat : public TokenReader, public mootExpatParser {
66 public:
67  /*----------------------------------------------------
68  * TokenReaderExpat: Types
69  */
// Bit flags describing the role of an XML node. Each named flag (other than
// TRX_Default and TRX_All) occupies a distinct bit, so several flags can be
// OR-ed into one int, as is done for defaultNodeInheritanceMask.
// NOTE(review): the E/D suffix pairs presumably distinguish "is this element"
// from "is a descendant of this element" -- confirm against the handlers.
74  typedef enum {
75  TRX_Default = 0x00000000, // no flags set
76  TRX_IsOuter = 0x00000001, // default result of the *_node_info() helpers on an empty stack
77  TRX_IsRoot = 0x00000002,
78  TRX_IsBodyE = 0x00000004,
79  TRX_IsBodyD = 0x00000008,
80  TRX_IsTokenE = 0x00000010,
81  TRX_IsTokenD = 0x00000020,
82  TRX_IsTokTextE = 0x00000040,
83  TRX_IsTokTextD = 0x00000080,
84  TRX_IsAnalysisE = 0x00000100,
85  TRX_IsAnalysisD = 0x00000200,
86  TRX_IsBestTagE = 0x00000400,
87  TRX_IsBestTagD = 0x00000800,
88  TRX_All = 0xffffffff // every bit set
89  } xmlNodeFlags;
90 
// Default mask used by next_node_info(): the union of all TRX_*D flags, so
// only those bits of the top stack entry survive the masking.
92  const static int defaultNodeInheritanceMask
93  = TRX_IsBodyD|TRX_IsTokenD|TRX_IsTokTextD|TRX_IsAnalysisD|TRX_IsBestTagD;
94 
96  //typedef list<int> NodeInfoStack;
97  typedef slist<int> NodeInfoStack;
98 
99 public:
100  /*----------------------------------------------------
101  * TokenReaderExpat: Data
102  */
103  //----------------------------
104  /* I/O Behavior */
105 
106  //----------------------------
109  bool save_raw_xml;
111  std::string body_elt;
112  std::string eos_elt;
113  std::string token_elt;
114  std::string text_elt;
115  std::string analysis_elt;
116  std::string postag_attr;
117  std::string besttag_elt;
118  std::string location_elt;
119  std::string offset_attr;
120  std::string length_attr;
121 
122 
123  //----------------------------
126  //-- position tracking
127  NodeInfoStack stack;
128  int done;
130  //-- construction buffers
131  mootSentence cb_nxtsent;
132  mootToken *cb_nxttok;
134  //-- output buffers
135  mootSentence cb_fullsents;
136  mootSentence trx_sentbuf;
137 
138 
139  public:
140  /*----------------------------------------------------
141  * TokenReaderExpat: Constructor
142  */
145 
// Construct a reader.
//  fmt      : format flags, forwarded to TokenReader
//  buflen   : buffer length, forwarded to mootExpatParser
//  encoding : encoding name, forwarded to mootExpatParser
//  name     : reader name, forwarded to TokenReader
// All element/attribute names receive the defaults listed below; 'done'
// starts at 1 and is cleared by from_mstream().
// The pointer member cb_nxttok is explicitly initialised to NULL so that it
// never holds an indeterminate value before the first reset()/parse.
152  TokenReaderExpat(int fmt =tiofXML,
153  size_t buflen =MOOT_DEFAULT_EXPAT_BUFLEN,
154  //size_t buflen =128, //-- DEBUG
155  const std::string &encoding ="",
156  const std::string &name ="TokenReaderExpat")
157  : TokenReader(fmt,name),
158  mootExpatParser(buflen,encoding),
159  save_raw_xml(false),
160  body_elt(""),
161  eos_elt("eos"),
162  token_elt("token"),
163  text_elt("text"),
164  analysis_elt("analysis"),
165  postag_attr("pos"),
166  besttag_elt("moot.tag"),
167  location_elt("moot.loc"),
168  offset_attr("offset"),
169  length_attr("length"),
170  done(1), cb_nxttok(NULL)
171  {
172  //-- TokenReader pointers
173  tr_sentence = &trx_sentbuf;
174  tr_token = NULL;
175 
// raw XML is kept only when the tiofConserve bit is set in the format flags
176  save_raw_xml = tr_format & tiofConserve;
177  };
178 
179  /*----------------------------------------------------
180  * TokenReaderExpat: Destructor
181  */
// Virtual destructor with an empty body.
183  virtual ~TokenReaderExpat(void) {};
184 
185  /*----------------------------------------------------
186  * TokenReaderExpat: Reset
187  */
189  virtual void reset(void);
191 
192  /*----------------------------------------------------*/
195 
// Set the reader name by delegating to TokenReader::reader_name().
// The expat parser's source name is deliberately left untouched (the call
// is commented out below).
199  virtual void reader_name(const std::string &myname)
200  {
201  TokenReader::reader_name(myname);
202  //mootExpatParser::setSrcName(myname);
203  };
204 
206  virtual void close(void);
// Select an input stream (pointer overload): let TokenReader record the
// stream, hand the resulting tr_istream to the expat parser, then clear
// 'done'.
// NOTE(review): the meaning of the 'false' argument to
// mootExpatParser::from_mstream() is not visible in this file -- confirm.
208  virtual void from_mstream(mootio::mistream *mistreamp) {
209  TokenReader::from_mstream(mistreamp);
210  mootExpatParser::from_mstream(tr_istream,false);
211  done = 0;
212  };
// Select an input stream (reference overload): same three steps as above.
213  virtual void from_mstream(mootio::mistream &mis) {
214  TokenReader::from_mstream(mis);
215  mootExpatParser::from_mstream(tr_istream,false);
216  done = 0;
217  };
// Input selection from other source kinds. Each overload first delegates to
// the matching TokenReader method and then passes tr_istream to the expat
// parser.
// NOTE(review): unlike from_mstream(), none of these clears 'done' directly;
// presumably the TokenReader implementations route through the virtual
// from_mstream() -- verify against mootTokenIO.h.
218  virtual void from_filename(const char *filename) {
219  TokenReader::from_filename(filename);
220  mootExpatParser::from_mstream(tr_istream,false);
221  };
222  virtual void from_file(FILE *infile) {
223  TokenReader::from_file(infile);
224  mootExpatParser::from_mstream(tr_istream,false);
225  };
226  virtual void from_fd(int fd) {
227  TokenReader::from_fd(fd);
228  mootExpatParser::from_mstream(tr_istream,false);
229  };
230  virtual void from_buffer(const void *buf, size_t len) {
231  TokenReader::from_buffer(buf,len);
232  mootExpatParser::from_mstream(tr_istream,false);
233  };
234  virtual void from_cxxstream(std::istream &is) {
235  TokenReader::from_cxxstream(is);
236  mootExpatParser::from_mstream(tr_istream,false);
237  };
239 
240  /*----------------------------------------------------*/
242 
243 
249  virtual mootTokenType get_token(void);
250 
256  virtual mootTokenType get_sentence(void);
258 
259  /*----------------------------------------------------*/
262 
263  /*----------------------------------------------------
264  * TokenReaderExpat: XML Utilities
265  */
273  bool ensure_cb_fullsents(void);
274 
// Compute node info derived from the top of the stack.
//  emptyStackValue : returned unchanged when the stack is empty
//  inheritanceMask : bits of the top stack entry that are kept
// Returns emptyStackValue for an empty stack, otherwise
// (stack.front() & inheritanceMask). The stack itself is not modified.
276  inline int next_node_info(int emptyStackValue=TRX_IsOuter,
277  int inheritanceMask=defaultNodeInheritanceMask)
278  {
279  return (stack.empty()
280  ? emptyStackValue
281  : (stack.front() & inheritanceMask));
282  };
283 
// Return the top stack entry unmasked, or emptyStackValue when the stack is
// empty. The stack itself is not modified.
285  inline int top_node_info(int emptyStackValue=TRX_IsOuter)
286  {
287  return stack.empty() ? emptyStackValue : stack.front();
288  };
289 
290 
291 #ifdef MOOT_DEBUG_EXPAT
292 
295  void save_context(mootTokenType toktype=TokTypeXMLRaw, int info=0);
296 
300  void save_context_data(const mootio::micbuffer &buf, mootTokenType toktype=TokTypeXMLRaw, int info=0);
301 
302 #else
303 
// Save the parser's current input context (non-debug inline variant).
//  toktype : token type passed on to save_context_data()
//  info    : node info; 0 means "use top_node_info()"
// Does nothing for TokTypeXMLRaw when save_raw_xml is false.
307  inline void save_context(mootTokenType toktype=TokTypeXMLRaw, int info=0)
308  {
309  if (!save_raw_xml && toktype == TokTypeXMLRaw) return;
310  if (!info) info = top_node_info();
// ContextBuffer is built from the expat parser handle
311  ContextBuffer ctb(parser);
312  save_context_data(ctb, toktype, info);
313  };
314 
// Save the unread part of a buffer (non-debug inline variant): forwards the
// bytes from buf.cb_rdata + buf.cb_offset up to buf.cb_used to the
// (text, len, toktype, info) overload.
//  buf     : source buffer
//  toktype : token type passed through
//  info    : node info passed through
// The 'toktype' parameter (source line 319) was missing from this listing
// although the body uses it; it is restored here.
// NOTE(review): its default mirrors the MOOT_DEBUG_EXPAT declaration of this
// overload -- confirm against the original header.
318  inline void save_context_data(const mootio::micbuffer &buf,
319  mootTokenType toktype=TokTypeXMLRaw,
320  int info=0)
321  {
322  save_context_data(buf.cb_rdata + buf.cb_offset,
323  buf.cb_used - buf.cb_offset,
324  toktype, info);
325  };
326 #endif /* MOOT_DEBUG_EXPAT */
327 
// Save 'len' bytes starting at 'text' with the given token type and node
// info (defined out of line).
// The 'toktype' parameter (source line 332) was missing from this listing;
// the inline micbuffer overload calls this function with four arguments
// (text, len, toktype, info), so it is restored here.
// NOTE(review): the default value is assumed to match the micbuffer
// overload -- confirm against the original header.
331  void save_context_data(const char *text, size_t len,
332  mootTokenType toktype=TokTypeXMLRaw,
333  int info=0);
335 
336  /*----------------------------------------------------
337  * TokenReaderExpat: Expat Handlers
338  */
341  virtual void XmlDeclHandler(const XML_Char *version,
342  const XML_Char *encoding,
343  int standalone);
344  virtual void StartElementHandler(const char *el, const char **attr);
345  virtual void EndElementHandler(const char *el);
346  virtual void CharacterDataHandler(const XML_Char *s, int len);
347  virtual void CommentHandler(const XML_Char *s);
348  virtual void DefaultHandler(const XML_Char *s, int len);
350 
351  /*----------------------------------------------------*/
// Current line number as reported by XML_GetCurrentLineNumber(), or 0 when
// no parser handle exists.
355  virtual size_t line_number(void) {
356  return parser ? static_cast<size_t>(XML_GetCurrentLineNumber(parser)) : 0;
357  };
358 
// Setter-style overload: the argument is ignored and the current line
// number is returned.
360  virtual size_t line_number(size_t n) { return line_number(); };
361 
// Current column number as reported by XML_GetCurrentColumnNumber(), or 0
// when no parser handle exists.
// (Previously this called XML_GetCurrentLineNumber() and therefore returned
// the line number instead of the column.)
363  virtual size_t column_number(void) {
364  return parser ? static_cast<size_t>(XML_GetCurrentColumnNumber(parser)) : 0;
365  };
366 
// Setter-style overload: the argument is ignored and the current column
// number is returned.
368  virtual size_t column_number(size_t n) { return column_number(); };
369 
// Current byte index as reported by XML_GetCurrentByteIndex(), or 0 when no
// parser handle exists.
371  virtual mootio::ByteOffset byte_number(void) {
372  return parser ? static_cast<mootio::ByteOffset>(XML_GetCurrentByteIndex(parser)) : 0;
373  };
374 
// Setter-style overload: the argument is ignored and the current byte index
// is returned.
376  virtual mootio::ByteOffset byte_number(mootio::ByteOffset n) { return byte_number(); };
377 
379  virtual void carp(char *fmt, ...);
381 };
382 
383 moot_END_NAMESPACE
384 
385 /*#endif // moot_EXPAT_ENABLED*/
386 
387 
388 moot_BEGIN_NAMESPACE
389 
390 /*======================================================================
391  * WRITER
392  *======================================================================*/
393 
394 /*--------------------------------------------------------------------------
395  * TokenWriterExpat
396  */
401 class TokenWriterExpat : public TokenWriter {
402 public:
403  /*----------------------------------------------------
404  * TokenWriterExpat: Data
405  */
406  //----------------------------
407  /* I/O Behavior */
408 
409  //----------------------------
412 
434  bool use_raw_xml;
435 
436  std::string root_elt;
437  std::string eos_elt;
438  std::string token_elt;
439  std::string text_elt;
440  std::string analysis_elt;
441  std::string postag_attr;
442  std::string besttag_elt;
443  std::string location_elt;
444  std::string offset_attr;
445  std::string length_attr;
446 
447 
448  //----------------------------
452  std::string twx_encoding;
453 
455  mootXMLRecoder twx_recoder;
456 
458  int lastc;
460 
461  public:
462  /*----------------------------------------------------
463  * TokenWriterExpat: Constructor
464  */
467 
475  TokenWriterExpat(int fmt =tiofXML
476  , bool got_raw_xml =false
477  , const std::string &encoding =""
478  , const std::string &name ="TokenWriterExpat"
479  );
480 
481  /*----------------------------------------------------
482  * TokenWriterExpat: encoding
483  */
// Store the encoding name in twx_encoding and reconfigure twx_recoder with
// the pair ("UTF-8", target), where target is "XML-standalone" when the
// encoding is empty and the encoding itself otherwise.
// NOTE(review): the exact meaning of the two scan_request() arguments
// (source vs. destination charset) is not visible here -- confirm in
// mootRecode.h.
485  inline void setEncoding(const std::string &encoding="")
486  {
487  twx_encoding = encoding;
488  twx_recoder.scan_request("UTF-8", (twx_encoding.empty()
489  ? "XML-standalone"
490  : twx_encoding));
491  };
492 
493 
494  /*----------------------------------------------------
495  * TokenWriterExpat: Destructor
496  */
// Destructor: calls close(). Inside a destructor the call resolves to this
// class's close(), not to an override in a more-derived class.
498  virtual ~TokenWriterExpat(void)
499  {
500  close();
501  };
502 
503  /*----------------------------------------
504  * Writer: Expat: Methods: Output Selection
505  */
509  virtual void to_mstream(mootio::mostream *os);
512  virtual void close(void);
514 
515  /*----------------------------------------
516  * Writer: Expat: Methods: Output
517  */
// Public output methods. Each one forwards its arguments to the
// correspondingly named underscore-prefixed helper, using the writer's
// current output stream tw_ostream.
521  virtual void put_token(const mootToken &token) {
522  _put_token(token,tw_ostream);
523  };
524 
526  virtual void put_tokens(const mootSentence &tokens) {
527  _put_tokens(tokens,tw_ostream);
528  };
529 
531  virtual void put_sentence(const mootSentence &sentence) {
532  _put_sentence(sentence,tw_ostream);
533  };
534 
536  virtual void put_comment_block_begin(void) {
537  _put_comment_block_begin(tw_ostream);
538  };
539 
541  virtual void put_comment_block_end(void) {
542  _put_comment_block_end(tw_ostream);
543  };
544 
546  virtual void put_raw_buffer(const char *buf, size_t len) {
547  _put_raw_buffer(buf,len,tw_ostream);
548  };
550 
551  /*----------------------------------------
552  * Writer: Expat: Methods: Utilities
553  */
557  void _put_token_raw(const mootToken &token, mootio::mostream *os);
558 
560  void _put_token_gen(const mootToken &token, mootio::mostream *os);
561 
// Write one token to 'os': via _put_token_raw() when use_raw_xml is set,
// otherwise via _put_token_gen().
// NOTE(review): unlike _put_tokens()/_put_sentence(), no check of 'os' or
// tw_format is made here; whether the callees validate 'os' is not visible
// in this file.
563  inline void _put_token(const mootToken &token, mootio::mostream *os)
564  {
565  if (use_raw_xml) _put_token_raw(token,os);
566  else _put_token_gen(token,os);
567  };
568 
// Write every token of 'tokens' to 'os', in order, using the raw or the
// generating writer depending on use_raw_xml.
// Returns without writing when 'os' is NULL, when the tiofNone bit is set in
// tw_format, or when os->valid() is false.
570  inline void _put_tokens(const mootSentence &tokens, mootio::mostream *os)
571  {
572  if (!os || (tw_format&tiofNone) || !os->valid()) return;
573  if (use_raw_xml) {
574  for (mootSentence::const_iterator si=tokens.begin(); si!=tokens.end(); si++) _put_token_raw(*si, os);
575  } else {
576  for (mootSentence::const_iterator si=tokens.begin(); si!=tokens.end(); si++) _put_token_gen(*si, os);
577  }
578  };
579 
// Write a whole sentence to 'os'.
// Returns without writing when 'os' is NULL, when the tiofNone bit is set in
// tw_format, or when os->valid() is false.
// In raw mode the tokens are written as-is; in generating mode an extra
// TokTypeEOS token is written after the last token of the sentence.
581  inline void _put_sentence(const mootSentence &sentence, mootio::mostream *os)
582  {
583  if (!os || (tw_format&tiofNone) || !os->valid()) return;
584  if (use_raw_xml) {
585  for (mootSentence::const_iterator si=sentence.begin(); si!=sentence.end(); si++) _put_token_raw(*si, os);
586  } else {
587  for (mootSentence::const_iterator si=sentence.begin(); si!=sentence.end(); si++) _put_token_gen(*si, os);
// explicit end-of-sentence marker (generating mode only)
588  _put_token_gen(mootToken(TokTypeEOS), os);
589  }
590  };
591 
593  void _put_comment_block_begin(mootio::mostream *os);
594 
596  void _put_comment_block_end(mootio::mostream *os);
597 
599  void _put_raw_buffer(const char *buf, size_t len, mootio::mostream *os);
601 };
602 
603 moot_END_NAMESPACE
604 
605 #endif // MOOT_EXPAT_ENABLED
606 
607 #endif // _MOOT_TOKEN_EXPAT_IO_H
Experimental XML writer class for use with expat-parsed XML or vanilla input.
Definition: mootTokenExpatIO.h:385
Conserve raw XML.
Definition: mootTokenIO.h:55
interface to librecode (optional)
size_t cb_offset
current read offset position in buffer
Definition: mootBufferIO.h:64
safely includes autoheader preprocessor macros
Definition: mootToken.h:76
slist< int > NodeInfoStack
Definition: mootTokenExpatIO.h:94
Abstract class for token input.
Definition: mootTokenIO.h:208
XML format.
Definition: mootTokenIO.h:54
Abstract base class for output stream wrappers.
Definition: mootIO.h:194
High-level token information object.
Definition: mootToken.h:96
void scan_request(const std::string &reqstr)
Definition: mootRecode.h:371
Utility class for expat input contexts.
Definition: mootExpatParser.h:69
no format
Definition: mootTokenIO.h:49
Experimental XML reader class using expat.
Definition: mootTokenExpatIO.h:62
C++ Wrapper for expat XML parsers.
Definition: mootExpatParser.h:56
size_t cb_used
used length of buffer (in bytes)
Definition: mootBufferIO.h:65
Special 2-phase recoder object for XML text.
Definition: mootRecode.h:314
moot::OffsetT ByteOffset
typedef for (byte) offsets (may be unsigned)
Definition: mootIO.h:55
list< mootToken > mootSentence
Definition: mootToken.h:630
mootTokenTypeE
Definition: mootToken.h:71
Abstract class for token output.
Definition: mootTokenIO.h:700
Abstract and native classes for I/O of moot::mootToken objects.
Definition: mootToken.h:78
virtual bool valid(void)
Definition: mootIO.h:99
C++ wrapper class for generic expat XML parsers (optional)
const char * cb_rdata
underlying character data buffer
Definition: mootBufferIO.h:63
Streambuf-like class for input from C char* buffers.
Definition: mootBufferIO.h:60
Abstract base class for input stream wrappers.
Definition: mootIO.h:129
xmlNodeFlags
Definition: mootTokenExpatIO.h:71