Main Page | Directories | Namespace List | Class Hierarchy | Alphabetical List | Class List | File List | Namespace Members | Class Members | File Members

mootTokenExpatIO.h

Go to the documentation of this file.
00001 /* -*- Mode: C++ -*- */
00002 
00003 /*
00004    libmoot : moocow's part-of-speech tagging library
00005    Copyright (C) 2003-2004 by Bryan Jurish <moocow@ling.uni-potsdam.de>
00006 
00007    This library is free software; you can redistribute it and/or
00008    modify it under the terms of the GNU Lesser General Public
00009    License as published by the Free Software Foundation; either
00010    version 2.1 of the License, or (at your option) any later version.
00011    
00012    This library is distributed in the hope that it will be useful,
00013    but WITHOUT ANY WARRANTY; without even the implied warranty of
00014    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
00015    Lesser General Public License for more details.
00016    
00017    You should have received a copy of the GNU Lesser General Public
00018    License along with this library; if not, write to the Free Software
00019    Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
00020 */
00021 
00022 /*--------------------------------------------------------------------------
00023  * File: mootTokenExpatIO.h
00024  * Author: Bryan Jurish <moocow@ling.uni-potsdam.de>
00025  * Description:
00026  *   + moocow's PoS tagger : token I/O : XML: Expat
00027  *--------------------------------------------------------------------------*/
00028 
00029 #ifndef _MOOT_TOKEN_EXPAT_IO_H
00030 #define _MOOT_TOKEN_EXPAT_IO_H
00031 
00032 #include <mootConfig.h>
00033 
00034 #ifdef MOOT_EXPAT_ENABLED
00035 
00036 #include <assert.h>
00037 
00038 #include <mootTypes.h>
00039 #include <mootToken.h>
00040 #include <mootTokenIO.h>
00041 #include <mootExpatParser.h>
00042 #include <mootRecode.h>
00043 
00044 #ifdef __GNUC__
00045 # include <ext/slist>
00046 #else
00047 # include <slist>
00048 #endif
00049 
00050 #include <list>
00051 
00052 moot_BEGIN_NAMESPACE
00053 
00054 using namespace std;
00055 
00056 /*======================================================================
00057  * mootTokenExpatIO
00058  *======================================================================*/
00059 
00060 /*--------------------------------------------------------------------------
00061  * TokenReaderExpat
00062  */
00064 class TokenReaderExpat : public TokenReader, public mootExpatParser {
00065 public:
00066   /*----------------------------------------------------
00067    * TokenReaderExpat: Types
00068    */
00073   typedef enum {
00074     TRX_Default     = 0x00000000,  
00075     TRX_IsOuter     = 0x00000001,  
00076     TRX_IsRoot      = 0x00000002,  
00077     TRX_IsBodyE     = 0x00000004,  
00078     TRX_IsBodyD     = 0x00000008,  
00079     TRX_IsTokenE    = 0x00000010,  
00080     TRX_IsTokenD    = 0x00000020,  
00081     TRX_IsTokTextE  = 0x00000040,  
00082     TRX_IsTokTextD  = 0x00000080,  
00083     TRX_IsAnalysisE = 0x00000100,  
00084     TRX_IsAnalysisD = 0x00000200,  
00085     TRX_IsBestTagE  = 0x00000400,  
00086     TRX_IsBestTagD  = 0x00000800,  
00087     TRX_All         = 0xffffffff   
00088   } xmlNodeFlags;
00089 
00091   const static int defaultNodeInheritanceMask
00092    = TRX_IsBodyD|TRX_IsTokenD|TRX_IsTokTextD|TRX_IsAnalysisD|TRX_IsBestTagD;
00093 
00095   //typedef list<int> NodeInfoStack;
00096   typedef slist<int> NodeInfoStack;
00097 
00098 public:
00099   /*----------------------------------------------------
00100    * TokenReaderExpat: Data
00101    */
00102   //----------------------------
00103   /* I/O Behavior */
00104 
00105   //----------------------------
00108   bool        save_raw_xml;  
00109 
00110   std::string body_elt;      
00111   std::string eos_elt;       
00112   std::string token_elt;     
00113   std::string text_elt;      
00114   std::string analysis_elt;  
00115   std::string postag_attr;   
00116   std::string besttag_elt;   
00117 
00118 
00119   //----------------------------
00122   //-- position tracking
00123   NodeInfoStack         stack;      
00124   int                   done;       
00125 
00126   //-- construction buffers
00127   mootSentence          cb_nxtsent; 
00128   mootToken            *cb_nxttok;  
00129 
00130   //-- output buffers
00131   mootSentence         cb_fullsents; 
00132   mootSentence         trx_sentbuf;  
00133 
00134  
00135  public:
00136   /*----------------------------------------------------
00137    * TokenReaderExpat: Constructor
00138    */
00141 
00146   TokenReaderExpat(int                fmt      =tiofXML,
00147                    size_t             buflen   =MOOT_DEFAULT_EXPAT_BUFLEN,
00148                    //size_t           buflen   =128, //-- DEBUG
00149                    const std::string &encoding ="",
00150                    const std::string &name     ="TokenReaderExpat")
00151     : TokenReader(fmt,name),
00152       mootExpatParser(buflen,encoding),
00153       save_raw_xml(false),
00154       body_elt(""),
00155       eos_elt("eos"),
00156       token_elt("token"),
00157       text_elt("text"),
00158       analysis_elt("analysis"),
00159       postag_attr("pos"),
00160       besttag_elt("moot.tag"),
00161       done(1)
00162   {
00163     //-- TokenReader pointers
00164     tr_sentence = &trx_sentbuf;
00165     tr_token    = NULL;
00166 
00167     save_raw_xml = tr_format & tiofConserve;
00168   };
00169 
00170   /*----------------------------------------------------
00171    * TokenReaderExpat: Destructor
00172    */
00174   virtual ~TokenReaderExpat(void) {};
00175 
00176   /*----------------------------------------------------
00177    * TokenReaderExpat: Reset
00178    */
00180   virtual void reset(void);
00182 
00183   /*----------------------------------------------------*/
00186 
00190   virtual void reader_name(const std::string &myname)
00191   {
00192     TokenReader::reader_name(myname);
00193     //mootExpatParser::setSrcName(myname);
00194   };
00195 
00197   virtual void close(void);
00198 
00199   virtual void from_mstream(mootio::mistream *mistreamp) {
00200     TokenReader::from_mstream(mistreamp);
00201     mootExpatParser::from_mstream(tr_istream);
00202     done = 0;
00203   };
00204   virtual void from_mstream(mootio::mistream &mis) {
00205     TokenReader::from_mstream(mis);
00206     mootExpatParser::from_mstream(tr_istream);
00207     done = 0;
00208   };
00209   virtual void from_filename(const char *filename) {
00210     TokenReader::from_filename(filename);
00211     mootExpatParser::from_mstream(tr_istream);
00212   };
00213   virtual void from_file(FILE *infile) {
00214     TokenReader::from_file(infile);
00215     mootExpatParser::from_mstream(tr_istream);
00216   };
00217   virtual void from_fd(int fd) {
00218     TokenReader::from_fd(fd);
00219     mootExpatParser::from_mstream(tr_istream);
00220   };
00221   virtual void from_buffer(const void *buf, size_t len) {
00222     TokenReader::from_buffer(buf,len);
00223     mootExpatParser::from_mstream(tr_istream);
00224   };
00225   virtual void from_cxxstream(std::istream &is) {
00226     TokenReader::from_cxxstream(is);
00227     mootExpatParser::from_mstream(tr_istream);
00228   };
00230 
00231   /*----------------------------------------------------*/
00233 
00234 
00239   virtual mootTokenType get_token(void);
00240 
00245   virtual mootTokenType get_sentence(void);
00247 
00248   /*----------------------------------------------------*/
00251 
00252   /*----------------------------------------------------
00253    * TokenReaderExpat: XML Utilities
00254    */
00262   bool ensure_cb_fullsents(void);
00263 
00265   inline int next_node_info(int emptyStackValue=TRX_IsOuter,
00266                             int inheritanceMask=defaultNodeInheritanceMask)
00267   {
00268     return (stack.empty()
00269             ? emptyStackValue
00270             : (stack.front() & inheritanceMask));
00271   };
00272 
00274   inline int top_node_info(int emptyStackValue=TRX_IsOuter)
00275   {
00276     return stack.empty() ? emptyStackValue : stack.front();
00277   };
00278 
00282   inline void save_context(mootTokenType toktype=TokTypeXMLRaw, int info=0)
00283   {
00284     if (!save_raw_xml && toktype == TokTypeXMLRaw) return;
00285     if (!info) info = top_node_info();
00286     ContextBuffer ctb(parser);
00287     save_context_data(ctb, toktype, info);
00288   };
00289 
00291   void save_context_data(const mootio::micbuffer &buf,
00292                          mootTokenType toktype=TokTypeXMLRaw,
00293                          int info=0)
00294   {
00295     save_context_data(buf.cb_rdata + buf.cb_offset,
00296                       buf.cb_used  - buf.cb_offset,
00297                       toktype, info);
00298   };
00299 
00301   void save_context_data(const char *text, size_t len,
00302                          mootTokenType toktype=TokTypeXMLRaw,
00303                          int info=0);
00305   
00306   /*----------------------------------------------------
00307    * TokenReaderExpat: Expat Handlers
00308    */
00311   virtual void XmlDeclHandler(const XML_Char  *version,
00312                               const XML_Char  *encoding,
00313                               int             standalone);
00314   virtual void StartElementHandler(const char *el, const char **attr);
00315   virtual void EndElementHandler(const char *el);
00316   virtual void CharacterDataHandler(const XML_Char *s, int len);
00317   virtual void CommentHandler(const XML_Char *s);
00318   virtual void DefaultHandler(const XML_Char *s, int len);
00320 
00321   /*----------------------------------------------------*/
00325   virtual size_t line_number(void) {
00326       return parser ? ((size_t)XML_GetCurrentLineNumber(parser)) : 0;
00327   };
00328 
00330   virtual size_t line_number(size_t n) { return line_number(); };
00331 
00333   virtual size_t column_number(void) {
00334       return parser ? ((size_t)XML_GetCurrentLineNumber(parser)) : 0;
00335   };
00336 
00338   virtual size_t column_number(size_t n) { return column_number(); };
00339 
00341   virtual size_t byte_number(void) {
00342       return parser ? ((size_t)XML_GetCurrentByteIndex(parser)) : 0;
00343   };
00344 
00346   virtual size_t byte_number(size_t n) { return byte_number(); };
00347 
00349   virtual void carp(char *fmt, ...);
00351 };
00352 
00353 moot_END_NAMESPACE
00354 
00355 #endif // MOOT_EXPAT_ENABLED
00356 
00357 
00358 moot_BEGIN_NAMESPACE
00359 
00360 /*======================================================================
00361  * WRITER
00362  *======================================================================*/
00363 
00364 /*--------------------------------------------------------------------------
00365  * TokenWriterExpat
00366  */
00370 /// Token writer for the XML format (default format flag: tiofXML).
00371 class TokenWriterExpat : public TokenWriter {
00372 public:
00373   //======================================================
00374   // TokenWriterExpat: Data
00375   //======================================================
00376   //----------------------------
00377   /* I/O Behavior */
00378 
00379   //----------------------------
00382 
00403   /// Output mode switch: true selects _put_token_raw(), false selects _put_token_gen().
00404   bool        use_raw_xml;
00405 
00406   std::string root_elt;      ///< root element name
00407   std::string eos_elt;       ///< EOS element name
00408   std::string token_elt;     ///< token element name
00409   std::string text_elt;      ///< token-text element name
00410   std::string analysis_elt;  ///< analysis element name
00411   std::string postag_attr;   ///< PoS-tag attribute name
00412 
00413   std::string besttag_elt;   ///< best-tag element name
00414 
00415 
00416   //----------------------------
00419   /// Encoding name most recently passed to setEncoding().
00420   std::string    twx_encoding;
00421 
00422   /// Recoder whose request is (re)scanned by setEncoding().
00423   mootXMLRecoder twx_recoder;
00424 
00425   /// NOTE(review): presumably the last character written -- confirm against the implementation.
00426   int lastc;
00428 
00429  public:
00430   //======================================================
00431   // TokenWriterExpat: Constructor
00432   //======================================================
00435 
00443   /// Construct a writer from format flags, a raw-XML hint and an encoding name; defined out of line.
00444   TokenWriterExpat(int                fmt         = tiofXML,
00445                    bool               got_raw_xml = false,
00446                    const std::string &encoding    = "");
00447 
00448 
00449   //======================================================
00450   // TokenWriterExpat: encoding
00451   //======================================================
00452   /// Remember the encoding and rescan the recoder request for ("UTF-8", target).
00453   inline void setEncoding(const std::string &encoding="")
00454   {
00455     twx_encoding = encoding;
00456     //-- an empty encoding name selects the "XML-standalone" target
00457     const std::string target = twx_encoding.empty() ? std::string("XML-standalone") : twx_encoding;
00458     twx_recoder.scan_request("UTF-8", target);
00459   };
00460 
00461 
00462   //======================================================
00463   // TokenWriterExpat: Destructor
00464   //======================================================
00465   /// Destructor: calls close().
00466   virtual ~TokenWriterExpat(void)
00467   {
00468     close();
00469   };
00470 
00471   //========================================
00472   // Writer: Expat: Methods: Output Selection
00473   //========================================
00476   /// Select the output stream; implemented out of line.
00477   virtual void to_mstream(mootio::mostream *os);
00478 
00479   /// Close; implemented out of line.
00480   virtual void close(void);
00482 
00483   //========================================
00484   // Writer: Expat: Methods: Output
00485   //========================================
00488   /// Write one token to tw_ostream.
00489   virtual void put_token(const mootToken &token) {
00490     _put_token(token, tw_ostream);
00491   };
00492 
00493   /// Write one sentence to tw_ostream.
00494   virtual void put_sentence(const mootSentence &sentence) {
00495     _put_sentence(sentence, tw_ostream);
00496   };
00497 
00498   /// Begin a comment block on tw_ostream.
00499   virtual void put_comment_block_begin(void) {
00500     _put_comment_block_begin(tw_ostream);
00501   };
00502 
00503   /// End a comment block on tw_ostream.
00504   virtual void put_comment_block_end(void) {
00505     _put_comment_block_end(tw_ostream);
00506   };
00507 
00508   /// Pass len bytes at buf to _put_raw_buffer() with tw_ostream.
00509   virtual void put_raw_buffer(const char *buf, size_t len) {
00510     _put_raw_buffer(buf, len, tw_ostream);
00511   };
00513 
00514   //========================================
00515   // Writer: Expat: Methods: Utilities
00516   //========================================
00519   /// Token output used when use_raw_xml is true; implemented out of line.
00520   void _put_token_raw(const mootToken &token, mootio::mostream *os);
00521 
00522   /// Token output used when use_raw_xml is false; implemented out of line.
00523   void _put_token_gen(const mootToken &token, mootio::mostream *os);
00524 
00525   /// Dispatch one token to the raw or generated writer according to use_raw_xml.
00526   inline void _put_token(const mootToken &token, mootio::mostream *os)
00527   {
00528     if (!use_raw_xml) { _put_token_gen(token, os); return; }
00529     _put_token_raw(token, os);
00530   };
00531 
00532   /// Write every token of a sentence; generated mode also appends a TokTypeEOS token.
00533   inline void _put_sentence(const mootSentence &sentence, mootio::mostream *os)
00534   {
00535     if (!(os && !(tw_format&tiofNone) && os->valid())) return;  //-- nothing to write to
00536     typedef mootSentence::const_iterator TokIter;
00537     if (use_raw_xml) {
00538       for (TokIter tok = sentence.begin(); tok != sentence.end(); ++tok) _put_token_raw(*tok, os);
00539       return;  //-- raw mode: no EOS token is appended
00540     }
00541     for (TokIter tok = sentence.begin(); tok != sentence.end(); ++tok) _put_token_gen(*tok, os);
00542     _put_token_gen(mootToken(TokTypeEOS), os);
00543   };
00544 
00545   /// Begin a comment block on os; implemented out of line.
00546   void _put_comment_block_begin(mootio::mostream *os);
00547 
00548   /// End a comment block on os; implemented out of line.
00549   void _put_comment_block_end(mootio::mostream *os);
00550 
00551   /// Raw-buffer output of len bytes at buf to os; implemented out of line.
00552   void _put_raw_buffer(const char *buf, size_t len, mootio::mostream *os);
00554 };
00555 
00556 moot_END_NAMESPACE
00557 
00558 #endif // _MOOT_TOKEN_EXPAT_IO_H

Generated on Mon Jun 27 13:05:25 2005 for libmoot by  doxygen 1.3.8-20040913