00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022
00023
00024
00025
00026
00027
00028
00029 #ifndef _MOOT_TOKEN_EXPAT_IO_H
00030 #define _MOOT_TOKEN_EXPAT_IO_H
00031
00032 #include <mootConfig.h>
00033
00034 #ifdef MOOT_EXPAT_ENABLED
00035
00036 #include <assert.h>
00037
00038 #include <mootTypes.h>
00039 #include <mootToken.h>
00040 #include <mootTokenIO.h>
00041 #include <mootExpatParser.h>
00042 #include <mootRecode.h>
00043
00044 #ifdef __GNUC__
00045 # include <ext/slist>
00046 #else
00047 # include <slist>
00048 #endif
00049
00050 #include <list>
00051
00052 moot_BEGIN_NAMESPACE
00053
00054 using namespace std;
00055
00056
00057
00058
00059
00060
00061
00062
00064 class TokenReaderExpat : public TokenReader, public mootExpatParser {
00065 public:
00066
00067
00068
00073 typedef enum {
00074 TRX_Default = 0x00000000,
00075 TRX_IsOuter = 0x00000001,
00076 TRX_IsRoot = 0x00000002,
00077 TRX_IsBodyE = 0x00000004,
00078 TRX_IsBodyD = 0x00000008,
00079 TRX_IsTokenE = 0x00000010,
00080 TRX_IsTokenD = 0x00000020,
00081 TRX_IsTokTextE = 0x00000040,
00082 TRX_IsTokTextD = 0x00000080,
00083 TRX_IsAnalysisE = 0x00000100,
00084 TRX_IsAnalysisD = 0x00000200,
00085 TRX_IsBestTagE = 0x00000400,
00086 TRX_IsBestTagD = 0x00000800,
00087 TRX_All = 0xffffffff
00088 } xmlNodeFlags;
00089
00091 const static int defaultNodeInheritanceMask
00092 = TRX_IsBodyD|TRX_IsTokenD|TRX_IsTokTextD|TRX_IsAnalysisD|TRX_IsBestTagD;
00093
00095
00096 typedef slist<int> NodeInfoStack;
00097
00098 public:
00099
00100
00101
00102
00103
00104
00105
00108 bool save_raw_xml;
00109
00110 std::string body_elt;
00111 std::string eos_elt;
00112 std::string token_elt;
00113 std::string text_elt;
00114 std::string analysis_elt;
00115 std::string postag_attr;
00116 std::string besttag_elt;
00117
00118
00119
00122
00123 NodeInfoStack stack;
00124 int done;
00125
00126
00127 mootSentence cb_nxtsent;
00128 mootToken *cb_nxttok;
00129
00130
00131
00132 mootSentence cb_fullsents;
00133 mootSentence trx_sentbuf;
00134
00135
00136 public:
00137
00138
00139
00142
00147 TokenReaderExpat(int fmt =tiofXML,
00148 size_t buflen =MOOT_DEFAULT_EXPAT_BUFLEN,
00149
00150 const std::string &encoding ="",
00151 const std::string &name ="TokenReaderExpat")
00152 : TokenReader(fmt,name),
00153 mootExpatParser(buflen,encoding),
00154 save_raw_xml(false),
00155 body_elt(""),
00156 eos_elt("eos"),
00157 token_elt("token"),
00158 text_elt("text"),
00159 analysis_elt("analysis"),
00160 postag_attr("pos"),
00161 besttag_elt("moot.tag"),
00162 done(1)
00163 {
00164
00165 tr_sentence = &trx_sentbuf;
00166 tr_token = NULL;
00167
00168 save_raw_xml = tr_format & tiofConserve;
00169 };
00170
00171
00172
00173
00175 virtual ~TokenReaderExpat(void) {};
00176
00177
00178
00179
00181 virtual void reset(void);
00183
00184
00187
00191 virtual void reader_name(const std::string &myname)
00192 {
00193 TokenReader::reader_name(myname);
00194
00195 };
00196
00198 virtual void close(void);
00199
00200 virtual void from_mstream(mootio::mistream *mistreamp) {
00201 TokenReader::from_mstream(mistreamp);
00202 mootExpatParser::from_mstream(tr_istream);
00203 done = 0;
00204 };
00205 virtual void from_mstream(mootio::mistream &mis) {
00206 TokenReader::from_mstream(mis);
00207 mootExpatParser::from_mstream(tr_istream);
00208 done = 0;
00209 };
00210 virtual void from_filename(const char *filename) {
00211 TokenReader::from_filename(filename);
00212 mootExpatParser::from_mstream(tr_istream);
00213 };
00214 virtual void from_file(FILE *infile) {
00215 TokenReader::from_file(infile);
00216 mootExpatParser::from_mstream(tr_istream);
00217 };
00218 virtual void from_fd(int fd) {
00219 TokenReader::from_fd(fd);
00220 mootExpatParser::from_mstream(tr_istream);
00221 };
00222 virtual void from_buffer(const void *buf, size_t len) {
00223 TokenReader::from_buffer(buf,len);
00224 mootExpatParser::from_mstream(tr_istream);
00225 };
00226 virtual void from_cxxstream(std::istream &is) {
00227 TokenReader::from_cxxstream(is);
00228 mootExpatParser::from_mstream(tr_istream);
00229 };
00231
00232
00234
00235
00240 virtual mootTokenType get_token(void);
00241
00246 virtual mootTokenType get_sentence(void);
00248
00249
00252
00253
00254
00255
00263 bool ensure_cb_fullsents(void);
00264
00266 inline int next_node_info(int emptyStackValue=TRX_IsOuter,
00267 int inheritanceMask=defaultNodeInheritanceMask)
00268 {
00269 return (stack.empty()
00270 ? emptyStackValue
00271 : (stack.front() & inheritanceMask));
00272 };
00273
00275 inline int top_node_info(int emptyStackValue=TRX_IsOuter)
00276 {
00277 return stack.empty() ? emptyStackValue : stack.front();
00278 };
00279
00283 inline void save_context(mootTokenType toktype=TokTypeXMLRaw, int info=0)
00284 {
00285 if (!save_raw_xml && toktype == TokTypeXMLRaw) return;
00286 if (!info) info = top_node_info();
00287 ContextBuffer ctb(parser);
00288 save_context_data(ctb, toktype, info);
00289 };
00290
00292 void save_context_data(const mootio::micbuffer &buf,
00293 mootTokenType toktype=TokTypeXMLRaw,
00294 int info=0)
00295 {
00296 save_context_data(buf.cb_rdata + buf.cb_offset,
00297 buf.cb_used - buf.cb_offset,
00298 toktype, info);
00299 };
00300
00302 void save_context_data(const char *text, size_t len,
00303 mootTokenType toktype=TokTypeXMLRaw,
00304 int info=0);
00306
00307
00308
00309
00312 virtual void XmlDeclHandler(const XML_Char *version,
00313 const XML_Char *encoding,
00314 int standalone);
00315 virtual void StartElementHandler(const char *el, const char **attr);
00316 virtual void EndElementHandler(const char *el);
00317 virtual void CharacterDataHandler(const XML_Char *s, int len);
00318 virtual void CommentHandler(const XML_Char *s);
00319 virtual void DefaultHandler(const XML_Char *s, int len);
00321
00322
00326 virtual size_t line_number(void) {
00327 return parser ? ((size_t)XML_GetCurrentLineNumber(parser)) : 0;
00328 };
00329
00331 virtual size_t line_number(size_t n) { return line_number(); };
00332
00334 virtual size_t column_number(void) {
00335 return parser ? ((size_t)XML_GetCurrentLineNumber(parser)) : 0;
00336 };
00337
00339 virtual size_t column_number(size_t n) { return column_number(); };
00340
00342 virtual size_t byte_number(void) {
00343 return parser ? ((size_t)XML_GetCurrentByteIndex(parser)) : 0;
00344 };
00345
00347 virtual size_t byte_number(size_t n) { return byte_number(); };
00348
00350 virtual void carp(char *fmt, ...);
00352 };
00353
00354 moot_END_NAMESPACE
00355
00356 #endif // moot_EXPAT_ENABLED
00357
00358
00359 moot_BEGIN_NAMESPACE
00360
00361
00362
00363
00364
00365
00366
00367
00372 class TokenWriterExpat : public TokenWriter {
00373 public:
00374
00375
00376
00377
00378
00379
00380
00383
00405 bool use_raw_xml;
00406
00407 std::string root_elt;
00408 std::string eos_elt;
00409 std::string token_elt;
00410 std::string text_elt;
00411 std::string analysis_elt;
00412 std::string postag_attr;
00413
00414 std::string besttag_elt;
00415
00416
00417
00421 std::string twx_encoding;
00422
00424 mootXMLRecoder twx_recoder;
00425
00427 int lastc;
00429
00430 public:
00431
00432
00433
00436
00445 TokenWriterExpat(int fmt =tiofXML
00446 , bool got_raw_xml =false
00447 , const std::string &encoding =""
00448 );
00449
00450
00451
00452
00454 inline void setEncoding(const std::string &encoding="")
00455 {
00456 twx_encoding = encoding;
00457 twx_recoder.scan_request("UTF-8", (twx_encoding.empty()
00458 ? "XML-standalone"
00459 : twx_encoding));
00460 };
00461
00462
00463
00464
00465
00467 virtual ~TokenWriterExpat(void)
00468 {
00469 close();
00470 };
00471
00472
00473
00474
00478 virtual void to_mstream(mootio::mostream *os);
00479
00481 virtual void close(void);
00483
00484
00485
00486
00490 virtual void put_token(const mootToken &token) {
00491 _put_token(token,tw_ostream);
00492 };
00493
00495 virtual void put_sentence(const mootSentence &sentence) {
00496 _put_sentence(sentence,tw_ostream);
00497 };
00498
00500 virtual void put_comment_block_begin(void) {
00501 _put_comment_block_begin(tw_ostream);
00502 };
00503
00505 virtual void put_comment_block_end(void) {
00506 _put_comment_block_end(tw_ostream);
00507 };
00508
00510 virtual void put_raw_buffer(const char *buf, size_t len) {
00511 _put_raw_buffer(buf,len,tw_ostream);
00512 };
00514
00515
00516
00517
00521 void _put_token_raw(const mootToken &token, mootio::mostream *os);
00522
00524 void _put_token_gen(const mootToken &token, mootio::mostream *os);
00525
00527 inline void _put_token(const mootToken &token, mootio::mostream *os)
00528 {
00529 if (use_raw_xml) _put_token_raw(token,os);
00530 else _put_token_gen(token,os);
00531 };
00532
00534 inline void _put_sentence(const mootSentence &sentence, mootio::mostream *os)
00535 {
00536 if (!os || (tw_format&tiofNone) || !os->valid()) return;
00537 mootSentence::const_iterator si;
00538 if (use_raw_xml) {
00539 for (si = sentence.begin(); si != sentence.end(); si++) _put_token_raw(*si, os);
00540 } else {
00541 for (si = sentence.begin(); si != sentence.end(); si++) _put_token_gen(*si, os);
00542 _put_token_gen(mootToken(TokTypeEOS), os);
00543 }
00544 };
00545
00547 void _put_comment_block_begin(mootio::mostream *os);
00548
00550 void _put_comment_block_end(mootio::mostream *os);
00551
00553 void _put_raw_buffer(const char *buf, size_t len, mootio::mostream *os);
00555 };
00556
00557 moot_END_NAMESPACE
00558
00559 #endif // MOOT_EXPAT_TOKEN_IO_H