Main Page   Namespace List   Class Hierarchy   Alphabetical List   Compound List   File List   Namespace Members   Compound Members   File Members  

mootTokenExpatIO.h

Go to the documentation of this file.
00001 /* -*- Mode: C++ -*- */
00002 
00003 /*
00004    libmoot : moocow's part-of-speech tagging library
00005    Copyright (C) 2003-2004 by Bryan Jurish <moocow@ling.uni-potsdam.de>
00006 
00007    This program is free software; you can redistribute it and/or modify
00008    it under the terms of the GNU General Public License as published by
00009    the Free Software Foundation; either version 2 of the License, or
00010    (at your option) any later version.
00011 
00012    This program is distributed in the hope that it will be useful,
00013    but WITHOUT ANY WARRANTY; without even the implied warranty of
00014    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
00015    GNU General Public License for more details.
00016 
00017    You should have received a copy of the GNU General Public License
00018    along with this program; if not, write to the Free Software
00019    Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307 USA
00020 */
00021 
00022 /*--------------------------------------------------------------------------
00023  * File: mootTokenExpatIO.h
00024  * Author: Bryan Jurish <moocow@ling.uni-potsdam.de>
00025  * Description:
00026  *   + moocow's PoS tagger : token I/O : XML: Expat
00027  *--------------------------------------------------------------------------*/
00028 
00029 #ifndef _MOOT_TOKEN_EXPAT_IO_H
00030 #define _MOOT_TOKEN_EXPAT_IO_H
00031 
00032 #include <mootConfig.h>
00033 
00034 #ifdef MOOT_EXPAT_ENABLED
00035 
00036 #include <assert.h>
00037 
00038 #include <mootTypes.h>
00039 #include <mootToken.h>
00040 #include <mootTokenIO.h>
00041 #include <mootExpatParser.h>
00042 #include <mootRecode.h>
00043 
00044 #ifdef __GNUC__
00045 # include <ext/slist>
00046 #else
00047 # include <slist>
00048 #endif
00049 
00050 #include <list>
00051 
00052 moot_BEGIN_NAMESPACE
00053 
00054 using namespace std;
00055 
00056 /*======================================================================
00057  * mootTokenExpatIO
00058  *======================================================================*/
00059 
00060 /*--------------------------------------------------------------------------
00061  * TokenReaderExpat
00062  */
00063 //-- TokenReader that obtains its input through expat: it is both a TokenReader and a mootExpatParser.
00064 class TokenReaderExpat : public TokenReader, public mootExpatParser {
00065 public:
00066   /*----------------------------------------------------
00067    * TokenReaderExpat: Types
00068    */
00069   //-- Bit flags describing an XML node; NodeInfoStack holds one such flag
00070   //-- word per entry.  Each element kind has an "E" bit and a "D" bit; only
00071   //-- the "D" bits are part of defaultNodeInheritanceMask.
00072   //-- NOTE(review): "E"/"D" presumably mean "is the element"/"is a descendant" -- confirm.
00073   typedef enum {
00074     TRX_Default     = 0x00000000,
00075     TRX_IsOuter     = 0x00000001,  //-- value reported by top_node_info() for an empty stack
00076     TRX_IsRoot      = 0x00000002,
00077     TRX_IsBodyE     = 0x00000004,
00078     TRX_IsBodyD     = 0x00000008,
00079     TRX_IsTokenE    = 0x00000010,
00080     TRX_IsTokenD    = 0x00000020,
00081     TRX_IsTokTextE  = 0x00000040,
00082     TRX_IsTokTextD  = 0x00000080,
00083     TRX_IsAnalysisE = 0x00000100,
00084     TRX_IsAnalysisD = 0x00000200,
00085     TRX_IsBestTagE  = 0x00000400,
00086     TRX_IsBestTagD  = 0x00000800,
00087     TRX_All         = 0xffffffff   //-- every bit set
00088   } xmlNodeFlags;
00089 
00090   //-- Mask which next_node_info() applies to the top-of-stack flags by default: all "D" bits.
00091   const static int defaultNodeInheritanceMask
00092    = TRX_IsBodyD|TRX_IsTokenD|TRX_IsTokTextD|TRX_IsAnalysisD|TRX_IsBestTagD;
00093 
00094   //-- Stack of node flag words; front() is the top (see top_node_info()).
00095   //typedef list<int> NodeInfoStack;
00096   typedef slist<int> NodeInfoStack;
00097 
00098 public:
00099   /*----------------------------------------------------
00100    * TokenReaderExpat: Data
00101    */
00102   //----------------------------
00103   /* I/O Behavior */
00104 
00105   //----------------------------
00106   //-- save_raw_xml: if false, save_context() ignores TokTypeXMLRaw requests.
00107   //-- The constructor derives it from (tr_format & tiofConserve).
00108   bool        save_raw_xml;
00109 
00110   std::string body_elt;      //-- XML names used by the reader; constructor default: ""
00111   std::string eos_elt;       //-- constructor default: "eos"
00112   std::string token_elt;     //-- constructor default: "token"
00113   std::string text_elt;      //-- constructor default: "text"
00114   std::string analysis_elt;  //-- constructor default: "analysis"
00115   std::string postag_attr;   //-- constructor default: "pos"
00116   std::string besttag_elt;   //-- constructor default: "moot.tag"
00117 
00118 
00119   //----------------------------
00120   //-- Parser state.  How it is updated is not visible in this header:
00121   //-- the handlers and get_token()/get_sentence() are defined out of line.
00122   //-- position tracking
00123   NodeInfoStack         stack;      //-- node flag words; read via top_node_info()
00124   int                   done;       //-- 1 after construction; from_mstream() sets it to 0
00125 
00126   //-- construction buffers
00127   mootSentence          cb_nxtsent;
00128   mootToken            *cb_nxttok;  //-- NULL after construction
00129   //mootToken::Analysis  *cb_nxtanl;  ///< Construction buffer for analyses (points into nxttok)
00130 
00131   //-- output buffers
00132   mootSentence         cb_fullsents;
00133   mootSentence         trx_sentbuf;  //-- tr_sentence points here (set by the constructor)
00134 
00135 
00136  public:
00137   /*----------------------------------------------------
00138    * TokenReaderExpat: Constructor
00139    */
00142 
00143   //-- Constructor: no input source is attached yet (done==1).
00144   //--  + fmt, name       : passed to TokenReader(); (fmt & tiofConserve) enables save_raw_xml
00145   //--  + buflen, encoding: passed to mootExpatParser()
00146   //--  + the element/attribute names receive the defaults from the initializer list
00147   TokenReaderExpat(int                fmt      =tiofXML,
00148                    size_t             buflen   =MOOT_DEFAULT_EXPAT_BUFLEN,
00149                    //size_t           buflen   =128, //-- DEBUG
00150                    const std::string &encoding ="",
00151                    const std::string &name     ="TokenReaderExpat")
00152     : TokenReader(fmt,name),
00153       mootExpatParser(buflen,encoding),
00154       save_raw_xml(false),
00155       body_elt(""),
00156       eos_elt("eos"),
00157       token_elt("token"),
00158       text_elt("text"),
00159       analysis_elt("analysis"),
00160       postag_attr("pos"),
00161       besttag_elt("moot.tag"),
00162       done(1)
00163   {
00164     //-- TokenReader pointers
00165     tr_sentence = &trx_sentbuf;
00166     tr_token    = NULL;
00167     cb_nxttok   = NULL;  //-- no token under construction; never leave the pointer indeterminate
00168     save_raw_xml = (tr_format & tiofConserve) != 0;
00169   };
00170 
00171   /*----------------------------------------------------
00172    * TokenReaderExpat: Destructor
00173    */
00174   //-- Nothing to release here: the body is empty.
00175   virtual ~TokenReaderExpat(void) {};
00176 
00177   /*----------------------------------------------------
00178    * TokenReaderExpat: Reset
00179    */
00180   //-- Defined out of line.
00181   virtual void reset(void);
00183 
00184   /*----------------------------------------------------*/
00187 
00190   //-- Set the reader name; forwards to TokenReader::reader_name() only.
00191   virtual void reader_name(const std::string &myname)
00192   {
00193     TokenReader::reader_name(myname);
00194     //mootExpatParser::setSrcName(myname);
00195   };
00196 
00197   //-- Defined out of line.
00198   virtual void close(void);
00199   //-- Input selection: each from_*() lets TokenReader set up tr_istream, then hands it to the parser.
00200   virtual void from_mstream(mootio::mistream *mistreamp) {
00201     TokenReader::from_mstream(mistreamp);
00202     mootExpatParser::from_mstream(tr_istream);
00203     done = 0;
00204   };
00205   virtual void from_mstream(mootio::mistream &mis) {
00206     TokenReader::from_mstream(mis);
00207     mootExpatParser::from_mstream(tr_istream);
00208     done = 0;
00209   };
00210   virtual void from_filename(const char *filename) {
00211     TokenReader::from_filename(filename);
00212     mootExpatParser::from_mstream(tr_istream);
00213   };
00214   virtual void from_file(FILE *infile) {
00215     TokenReader::from_file(infile);
00216     mootExpatParser::from_mstream(tr_istream);
00217   };
00218   virtual void from_fd(int fd) {
00219     TokenReader::from_fd(fd);
00220     mootExpatParser::from_mstream(tr_istream);
00221   };
00222   virtual void from_buffer(const void *buf, size_t len) {
00223     TokenReader::from_buffer(buf,len);
00224     mootExpatParser::from_mstream(tr_istream);
00225   };
00226   virtual void from_cxxstream(std::istream &is) {
00227     TokenReader::from_cxxstream(is);
00228     mootExpatParser::from_mstream(tr_istream);
00229   };
00230   //-- NOTE(review): only the from_mstream() overloads clear 'done' themselves; presumably the
00231   //-- other TokenReader::from_*() methods end up calling from_mstream() -- confirm.
00232   /*----------------------------------------------------*/
00234 
00235 
00239   //-- Defined out of line; reports a mootTokenType.
00240   virtual mootTokenType get_token(void);
00241 
00245   //-- Defined out of line; reports a mootTokenType.
00246   virtual mootTokenType get_sentence(void);
00248 
00249   /*----------------------------------------------------*/
00252 
00253   /*----------------------------------------------------
00254    * TokenReaderExpat: XML Utilities
00255    */
00262   //-- Defined out of line.
00263   bool ensure_cb_fullsents(void);
00264 
00265   //-- Top-of-stack flags restricted to inheritanceMask, or emptyStackValue if the stack is empty.
00266   inline int next_node_info(int emptyStackValue=TRX_IsOuter,
00267                             int inheritanceMask=defaultNodeInheritanceMask)
00268   {
00269     return (stack.empty()
00270             ? emptyStackValue
00271             : (stack.front() & inheritanceMask));
00272   };
00273 
00274   //-- Unmasked top-of-stack flags, or emptyStackValue if the stack is empty.
00275   inline int top_node_info(int emptyStackValue=TRX_IsOuter)
00276   {
00277     return stack.empty() ? emptyStackValue : stack.front();
00278   };
00279 
00280   //-- Wrap the parser in a ContextBuffer and pass it to save_context_data().
00281   //--  + does nothing for toktype==TokTypeXMLRaw unless save_raw_xml is set
00282   //--  + info==0 means "use top_node_info()"
00283   inline void save_context(mootTokenType toktype=TokTypeXMLRaw, int info=0)
00284   {
00285     if (!save_raw_xml && toktype == TokTypeXMLRaw) return;
00286     if (!info) info = top_node_info();
00287     ContextBuffer ctb(parser);
00288     save_context_data(ctb, toktype, info);
00289   };
00290 
00291   //-- Buffer overload: forwards the bytes of buf from cb_offset up to cb_used.
00292   void save_context_data(const mootio::micbuffer &buf,
00293                          mootTokenType toktype=TokTypeXMLRaw,
00294                          int info=0)
00295   {
00296     save_context_data(buf.cb_rdata + buf.cb_offset,
00297                       buf.cb_used  - buf.cb_offset,
00298                       toktype, info);
00299   };
00300 
00301   //-- (text,len) overload: defined out of line.
00302   void save_context_data(const char *text, size_t len,
00303                          mootTokenType toktype=TokTypeXMLRaw,
00304                          int info=0);
00306 
00307   /*----------------------------------------------------
00308    * TokenReaderExpat: Expat Handlers
00309    */
00311   //-- All handlers are defined out of line (presumably overriding mootExpatParser's -- confirm).
00312   virtual void XmlDeclHandler(const XML_Char  *version,
00313                               const XML_Char  *encoding,
00314                               int             standalone);
00315   virtual void StartElementHandler(const char *el, const char **attr);
00316   virtual void EndElementHandler(const char *el);
00317   virtual void CharacterDataHandler(const XML_Char *s, int len);
00318   virtual void CommentHandler(const XML_Char *s);
00319   virtual void DefaultHandler(const XML_Char *s, int len);
00321 
00322   /*----------------------------------------------------*/
00325   //-- Current line as reported by expat; 0 if there is no parser.
00326   virtual size_t line_number(void) {
00327       return parser ? ((size_t)XML_GetCurrentLineNumber(parser)) : 0;
00328   };
00329 
00330   //-- "Setter" form: the argument is ignored, the current line is returned.
00331   virtual size_t line_number(size_t n) { return line_number(); };
00332 
00333   //-- Current column as reported by expat; 0 if there is no parser.
00334   virtual size_t column_number(void) {
00335       return parser ? ((size_t)XML_GetCurrentColumnNumber(parser)) : 0;
00336   };
00337 
00338   //-- "Setter" form: the argument is ignored, the current column is returned.
00339   virtual size_t column_number(size_t n) { return column_number(); };
00340 
00341   //-- Current byte index as reported by expat; 0 if there is no parser.
00342   virtual size_t byte_number(void) {
00343       return parser ? ((size_t)XML_GetCurrentByteIndex(parser)) : 0;
00344   };
00345 
00346   //-- "Setter" form: the argument is ignored, the current byte index is returned.
00347   virtual size_t byte_number(size_t n) { return byte_number(); };
00348 
00349   //-- Takes a format string plus variable arguments; defined out of line.
00350   virtual void carp(char *fmt, ...);
00352 };
00353 
00354 moot_END_NAMESPACE
00355 
00356 #endif // MOOT_EXPAT_ENABLED
00357 
00358 
00359 moot_BEGIN_NAMESPACE
00360 
00361 /*======================================================================
00362  * WRITER
00363  *======================================================================*/
00364 
00365 /*--------------------------------------------------------------------------
00366  * TokenWriterExpat
00367  */
00371 //-- TokenWriter for the XML format (default format flag: tiofXML).
00372 class TokenWriterExpat : public TokenWriter {
00373 public:
00374   /*----------------------------------------------------
00375    * TokenWriterExpat: Data
00376    */
00377   //----------------------------
00378   /* I/O Behavior */
00379 
00380   //----------------------------
00383 
00404   //-- Selects the output routine: _put_token_raw() if true, _put_token_gen() otherwise.
00405   bool        use_raw_xml;
00406 
00407   //-- Element / attribute names.  Their values are assigned outside this
00408   //-- header (the constructor is defined out of line).
00409   std::string root_elt;
00410   std::string eos_elt;
00411   std::string token_elt;
00412   std::string text_elt;
00413   std::string analysis_elt;
00414   std::string postag_attr;
00415 
00416   std::string besttag_elt;
00417 
00418 
00419   //----------------------------
00420   //-- Output encoding name as last given to setEncoding().
00421   std::string    twx_encoding;
00422 
00423   //-- Recoder configured by setEncoding().
00424   mootXMLRecoder twx_recoder;
00425 
00426   //-- NOTE(review): not used in this header; presumably the last character written -- confirm.
00427   int lastc;
00429 
00430  public:
00431   /*----------------------------------------------------
00432    * TokenWriterExpat: Constructor
00433    */
00436 
00444   //-- Defined out of line.
00445   TokenWriterExpat(int                   fmt         = tiofXML,
00446                    bool                  got_raw_xml = false,
00447                    const std::string    &encoding    = "");
00449 
00450   /*----------------------------------------------------
00451    * TokenWriterExpat: encoding
00452    */
00453   //-- Remember the encoding and request a recoding from "UTF-8" to it;
00454   //-- an empty encoding selects the target "XML-standalone".
00455   void setEncoding(const std::string &encoding="")
00456   {
00457     twx_encoding = encoding;
00458     const std::string target =
00459       twx_encoding.empty() ? std::string("XML-standalone") : twx_encoding;
00460     twx_recoder.scan_request("UTF-8", target);
00461   }
00462 
00463   /*----------------------------------------------------
00464    * TokenWriterExpat: Destructor
00465    */
00466   //-- Calls close() before the object goes away.
00467   virtual ~TokenWriterExpat() { close(); }
00471 
00472   /*----------------------------------------
00473    * Writer: Expat: Methods: Output Selection
00474    */
00477   //-- Defined out of line.
00478   virtual void to_mstream(mootio::mostream *os);
00479 
00480   //-- Defined out of line.
00481   virtual void close();
00483 
00484   /*----------------------------------------
00485    * Writer: Expat: Methods: Output
00486    */
00489   //-- The put_*() methods forward to their _put_*() counterparts, using tw_ostream as target.
00490   virtual void put_token(const mootToken &token)
00491   { _put_token(token, tw_ostream); }
00493 
00495   virtual void put_sentence(const mootSentence &sentence)
00496   { _put_sentence(sentence, tw_ostream); }
00498 
00500   virtual void put_comment_block_begin()
00501   { _put_comment_block_begin(tw_ostream); }
00503 
00505   virtual void put_comment_block_end()
00506   { _put_comment_block_end(tw_ostream); }
00508 
00510   virtual void put_raw_buffer(const char *buf, size_t len)
00511   { _put_raw_buffer(buf, len, tw_ostream); }
00514 
00515   /*----------------------------------------
00516    * Writer: Expat: Methods: Utilities
00517    */
00520   //-- Defined out of line.
00521   void _put_token_raw(const mootToken &token, mootio::mostream *os);
00522 
00523   //-- Defined out of line.
00524   void _put_token_gen(const mootToken &token, mootio::mostream *os);
00525 
00526   //-- Write one token to os with the routine selected by use_raw_xml.
00527   void _put_token(const mootToken &token, mootio::mostream *os)
00528   {
00529     if (use_raw_xml) { _put_token_raw(token, os); return; }
00530     _put_token_gen(token, os);
00531   }
00532 
00533   //-- Write every token of a sentence to os.
00534   //--  + nothing is written if os is NULL, tiofNone is set in tw_format, or os is not valid
00535   //--  + in generated (non-raw) mode a TokTypeEOS token is written after the last token
00536   void _put_sentence(const mootSentence &sentence, mootio::mostream *os)
00537   {
00538     if (!os) return;
00539     if (tw_format & tiofNone) return;
00540     if (!os->valid()) return;
00541 
00542     if (use_raw_xml) {
00543       for (mootSentence::const_iterator tok = sentence.begin(); tok != sentence.end(); ++tok)
00544         _put_token_raw(*tok, os);
00545       return;
00546     }
00547     for (mootSentence::const_iterator tok = sentence.begin(); tok != sentence.end(); ++tok)
00548       _put_token_gen(*tok, os);
00549     _put_token_gen(mootToken(TokTypeEOS), os);
00550   }
00551 
00552   //-- The remaining helpers are defined out of line.
00553   void _put_comment_block_begin(mootio::mostream *os);
00554   void _put_comment_block_end(mootio::mostream *os);
00555   void _put_raw_buffer(const char *buf, size_t len, mootio::mostream *os);
00556 };
00556 
00557 moot_END_NAMESPACE
00558 
00559 #endif // _MOOT_TOKEN_EXPAT_IO_H

Generated on Wed Jul 28 15:48:03 2004 for libmoot by doxygen1.2.15