00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022
00023
00024
00025
00026
00027
00028
00029 #ifndef _MOOT_TOKEN_EXPAT_IO_H
00030 #define _MOOT_TOKEN_EXPAT_IO_H
00031
00032 #include <mootConfig.h>
00033
00034 #ifdef MOOT_EXPAT_ENABLED
00035
00036 #include <assert.h>
00037
00038 #include <mootTypes.h>
00039 #include <mootToken.h>
00040 #include <mootTokenIO.h>
00041 #include <mootExpatParser.h>
00042 #include <mootRecode.h>
00043
00044 #ifdef __GNUC__
00045 # include <ext/slist>
00046 #else
00047 # include <slist>
00048 #endif
00049
00050 #include <list>
00051
00052 moot_BEGIN_NAMESPACE
00053
00054 using namespace std;
00055
00056
00057
00058
00059
00060
00061
00062
00064 class TokenReaderExpat : public TokenReader, public mootExpatParser {
00065 public:
00066
00067
00068
00073 typedef enum {
00074 TRX_Default = 0x00000000,
00075 TRX_IsOuter = 0x00000001,
00076 TRX_IsRoot = 0x00000002,
00077 TRX_IsBodyE = 0x00000004,
00078 TRX_IsBodyD = 0x00000008,
00079 TRX_IsTokenE = 0x00000010,
00080 TRX_IsTokenD = 0x00000020,
00081 TRX_IsTokTextE = 0x00000040,
00082 TRX_IsTokTextD = 0x00000080,
00083 TRX_IsAnalysisE = 0x00000100,
00084 TRX_IsAnalysisD = 0x00000200,
00085 TRX_IsBestTagE = 0x00000400,
00086 TRX_IsBestTagD = 0x00000800,
00087 TRX_All = 0xffffffff
00088 } xmlNodeFlags;
00089
00091 const static int defaultNodeInheritanceMask
00092 = TRX_IsBodyD|TRX_IsTokenD|TRX_IsTokTextD|TRX_IsAnalysisD|TRX_IsBestTagD;
00093
00095
00096 typedef slist<int> NodeInfoStack;
00097
00098 public:
00099
00100
00101
00102
00103
00104
00105
00108 bool save_raw_xml;
00109
00110 std::string body_elt;
00111 std::string eos_elt;
00112 std::string token_elt;
00113 std::string text_elt;
00114 std::string analysis_elt;
00115 std::string postag_attr;
00116 std::string besttag_elt;
00117
00118
00119
00122
00123 NodeInfoStack stack;
00124 int done;
00125
00126
00127 mootSentence cb_nxtsent;
00128 mootToken *cb_nxttok;
00129
00130
00131 mootSentence cb_fullsents;
00132 mootSentence trx_sentbuf;
00133
00134
00135 public:
00136
00137
00138
00141
00146 TokenReaderExpat(int fmt =tiofXML,
00147 size_t buflen =MOOT_DEFAULT_EXPAT_BUFLEN,
00148
00149 const std::string &encoding ="",
00150 const std::string &name ="TokenReaderExpat")
00151 : TokenReader(fmt,name),
00152 mootExpatParser(buflen,encoding),
00153 save_raw_xml(false),
00154 body_elt(""),
00155 eos_elt("eos"),
00156 token_elt("token"),
00157 text_elt("text"),
00158 analysis_elt("analysis"),
00159 postag_attr("pos"),
00160 besttag_elt("moot.tag"),
00161 done(1)
00162 {
00163
00164 tr_sentence = &trx_sentbuf;
00165 tr_token = NULL;
00166
00167 save_raw_xml = tr_format & tiofConserve;
00168 };
00169
00170
00171
00172
00174 virtual ~TokenReaderExpat(void) {};
00175
00176
00177
00178
00180 virtual void reset(void);
00182
00183
00186
00190 virtual void reader_name(const std::string &myname)
00191 {
00192 TokenReader::reader_name(myname);
00193
00194 };
00195
00197 virtual void close(void);
00198
00199 virtual void from_mstream(mootio::mistream *mistreamp) {
00200 TokenReader::from_mstream(mistreamp);
00201 mootExpatParser::from_mstream(tr_istream);
00202 done = 0;
00203 };
00204 virtual void from_mstream(mootio::mistream &mis) {
00205 TokenReader::from_mstream(mis);
00206 mootExpatParser::from_mstream(tr_istream);
00207 done = 0;
00208 };
00209 virtual void from_filename(const char *filename) {
00210 TokenReader::from_filename(filename);
00211 mootExpatParser::from_mstream(tr_istream);
00212 };
00213 virtual void from_file(FILE *infile) {
00214 TokenReader::from_file(infile);
00215 mootExpatParser::from_mstream(tr_istream);
00216 };
00217 virtual void from_fd(int fd) {
00218 TokenReader::from_fd(fd);
00219 mootExpatParser::from_mstream(tr_istream);
00220 };
00221 virtual void from_buffer(const void *buf, size_t len) {
00222 TokenReader::from_buffer(buf,len);
00223 mootExpatParser::from_mstream(tr_istream);
00224 };
00225 virtual void from_cxxstream(std::istream &is) {
00226 TokenReader::from_cxxstream(is);
00227 mootExpatParser::from_mstream(tr_istream);
00228 };
00230
00231
00233
00234
00239 virtual mootTokenType get_token(void);
00240
00245 virtual mootTokenType get_sentence(void);
00247
00248
00251
00252
00253
00254
00262 bool ensure_cb_fullsents(void);
00263
00265 inline int next_node_info(int emptyStackValue=TRX_IsOuter,
00266 int inheritanceMask=defaultNodeInheritanceMask)
00267 {
00268 return (stack.empty()
00269 ? emptyStackValue
00270 : (stack.front() & inheritanceMask));
00271 };
00272
00274 inline int top_node_info(int emptyStackValue=TRX_IsOuter)
00275 {
00276 return stack.empty() ? emptyStackValue : stack.front();
00277 };
00278
00282 inline void save_context(mootTokenType toktype=TokTypeXMLRaw, int info=0)
00283 {
00284 if (!save_raw_xml && toktype == TokTypeXMLRaw) return;
00285 if (!info) info = top_node_info();
00286 ContextBuffer ctb(parser);
00287 save_context_data(ctb, toktype, info);
00288 };
00289
00291 void save_context_data(const mootio::micbuffer &buf,
00292 mootTokenType toktype=TokTypeXMLRaw,
00293 int info=0)
00294 {
00295 save_context_data(buf.cb_rdata + buf.cb_offset,
00296 buf.cb_used - buf.cb_offset,
00297 toktype, info);
00298 };
00299
00301 void save_context_data(const char *text, size_t len,
00302 mootTokenType toktype=TokTypeXMLRaw,
00303 int info=0);
00305
00306
00307
00308
00311 virtual void XmlDeclHandler(const XML_Char *version,
00312 const XML_Char *encoding,
00313 int standalone);
00314 virtual void StartElementHandler(const char *el, const char **attr);
00315 virtual void EndElementHandler(const char *el);
00316 virtual void CharacterDataHandler(const XML_Char *s, int len);
00317 virtual void CommentHandler(const XML_Char *s);
00318 virtual void DefaultHandler(const XML_Char *s, int len);
00320
00321
00325 virtual size_t line_number(void) {
00326 return parser ? ((size_t)XML_GetCurrentLineNumber(parser)) : 0;
00327 };
00328
00330 virtual size_t line_number(size_t n) { return line_number(); };
00331
00333 virtual size_t column_number(void) {
00334 return parser ? ((size_t)XML_GetCurrentLineNumber(parser)) : 0;
00335 };
00336
00338 virtual size_t column_number(size_t n) { return column_number(); };
00339
00341 virtual size_t byte_number(void) {
00342 return parser ? ((size_t)XML_GetCurrentByteIndex(parser)) : 0;
00343 };
00344
00346 virtual size_t byte_number(size_t n) { return byte_number(); };
00347
00349 virtual void carp(char *fmt, ...);
00351 };
00352
00353 moot_END_NAMESPACE
00354
00355 #endif // MOOT_EXPAT_ENABLED
00356
00357
00358 moot_BEGIN_NAMESPACE
00359
00360
00361
00362
00363
00364
00365
00366
00371 class TokenWriterExpat : public TokenWriter {
00372 public:
00373
00374
00375
00376
00377
00378
00379
00382
00404 bool use_raw_xml;
00405
00406 std::string root_elt;
00407 std::string eos_elt;
00408 std::string token_elt;
00409 std::string text_elt;
00410 std::string analysis_elt;
00411 std::string postag_attr;
00412
00413 std::string besttag_elt;
00414
00415
00416
00420 std::string twx_encoding;
00421
00423 mootXMLRecoder twx_recoder;
00424
00426 int lastc;
00428
00429 public:
00430
00431
00432
00435
00444 TokenWriterExpat(int fmt =tiofXML
00445 , bool got_raw_xml =false
00446 , const std::string &encoding =""
00447 );
00448
00449
00450
00451
00453 inline void setEncoding(const std::string &encoding="")
00454 {
00455 twx_encoding = encoding;
00456 twx_recoder.scan_request("UTF-8", (twx_encoding.empty()
00457 ? "XML-standalone"
00458 : twx_encoding));
00459 };
00460
00461
00462
00463
00464
00466 virtual ~TokenWriterExpat(void)
00467 {
00468 close();
00469 };
00470
00471
00472
00473
00477 virtual void to_mstream(mootio::mostream *os);
00478
00480 virtual void close(void);
00482
00483
00484
00485
00489 virtual void put_token(const mootToken &token) {
00490 _put_token(token,tw_ostream);
00491 };
00492
00494 virtual void put_sentence(const mootSentence &sentence) {
00495 _put_sentence(sentence,tw_ostream);
00496 };
00497
00499 virtual void put_comment_block_begin(void) {
00500 _put_comment_block_begin(tw_ostream);
00501 };
00502
00504 virtual void put_comment_block_end(void) {
00505 _put_comment_block_end(tw_ostream);
00506 };
00507
00509 virtual void put_raw_buffer(const char *buf, size_t len) {
00510 _put_raw_buffer(buf,len,tw_ostream);
00511 };
00513
00514
00515
00516
00520 void _put_token_raw(const mootToken &token, mootio::mostream *os);
00521
00523 void _put_token_gen(const mootToken &token, mootio::mostream *os);
00524
00526 inline void _put_token(const mootToken &token, mootio::mostream *os)
00527 {
00528 if (use_raw_xml) _put_token_raw(token,os);
00529 else _put_token_gen(token,os);
00530 };
00531
00533 inline void _put_sentence(const mootSentence &sentence, mootio::mostream *os)
00534 {
00535 if (!os || (tw_format&tiofNone) || !os->valid()) return;
00536 mootSentence::const_iterator si;
00537 if (use_raw_xml) {
00538 for (si = sentence.begin(); si != sentence.end(); si++) _put_token_raw(*si, os);
00539 } else {
00540 for (si = sentence.begin(); si != sentence.end(); si++) _put_token_gen(*si, os);
00541 _put_token_gen(mootToken(TokTypeEOS), os);
00542 }
00543 };
00544
00546 void _put_comment_block_begin(mootio::mostream *os);
00547
00549 void _put_comment_block_end(mootio::mostream *os);
00550
00552 void _put_raw_buffer(const char *buf, size_t len, mootio::mostream *os);
00554 };
00555
00556 moot_END_NAMESPACE
00557
00558 #endif // _MOOT_TOKEN_EXPAT_IO_H