Main Page | Directories | Namespace List | Class Hierarchy | Alphabetical List | Class List | File List | Namespace Members | Class Members | File Members

mootTokenIO.h

Go to the documentation of this file.
00001 /* -*- Mode: C++ -*- */
00002 
00003 /*
00004    libmoot : moocow's part-of-speech tagging library
00005    Copyright (C) 2003-2004 by Bryan Jurish <moocow@ling.uni-potsdam.de>
00006 
00007    This library is free software; you can redistribute it and/or
00008    modify it under the terms of the GNU Lesser General Public
00009    License as published by the Free Software Foundation; either
00010    version 2.1 of the License, or (at your option) any later version.
00011    
00012    This library is distributed in the hope that it will be useful,
00013    but WITHOUT ANY WARRANTY; without even the implied warranty of
00014    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
00015    Lesser General Public License for more details.
00016    
00017    You should have received a copy of the GNU Lesser General Public
00018    License along with this library; if not, write to the Free Software
00019    Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
00020 */
00021 
00022 /*--------------------------------------------------------------------------
00023  * File: mootTokenIO.h
00024  * Author: Bryan Jurish <moocow@ling.uni-potsdam.de>
00025  * Description:
00026  *   + moocow's PoS tagger : token I/O
00027  *--------------------------------------------------------------------------*/
00028 
00029 #ifndef _moot_TOKEN_IO_H
00030 #define _moot_TOKEN_IO_H
00031 
00032 #include <mootToken.h>
00033 #include <mootTokenLexer.h>
00034 
00035 #include <mootIO.h>
00036 #include <mootCIO.h>
00037 #include <mootCxxIO.h>
00038 #include <mootBufferIO.h>
00039 
00040 #include <stdio.h>
00041 #include <stdarg.h>
00042 #include <stdexcept>
00043 
00044 /*moot_BEGIN_NAMESPACE*/
00045 namespace moot {
00046 
00047 /*==========================================================================
00048  * TokenIO
00049  *==========================================================================*/
/**
 * Bit flags describing a token I/O format. Each flag occupies its own bit,
 * so flags are combined with bitwise OR and tested with bitwise AND.
 *
 * Notes marked "presumably" are inferred from the flag name alone and
 * should be confirmed against the implementation.
 */
enum TokenIOFormatE {
  tiofNone      = 0,        ///< no flag set
  tiofUnknown   = 1 << 0,   ///< default format of a plain TokenReader
  tiofNull      = 1 << 1,   ///< when set, TokenWriter::to_mstream() skips its invalid-stream warning
  tiofUser      = 1 << 2,   ///< presumably: user-defined format
  tiofNative    = 1 << 3,   ///< set by the native reader/writer classes on their own format
  tiofXML       = 1 << 4,   ///< presumably: XML format
  tiofConserve  = 1 << 5,   ///< presumably: conserve extra input content -- TODO confirm
  tiofPretty    = 1 << 6,   ///< presumably: pretty-printed output -- TODO confirm
  tiofText      = 1 << 7,   ///< presumably: token text is present
  tiofAnalyzed  = 1 << 8,   ///< presumably: token analyses are present
  tiofTagged    = 1 << 9,   ///< toggled by TokenReaderNative::input_is_tagged()
  tiofPruned    = 1 << 10   ///< presumably: analyses are pruned -- TODO confirm
};

/** Convenience alias for the flag enumeration. */
typedef TokenIOFormatE TokenIOFormat;

/** Preset: tiofText only. */
static const int tiofRare = tiofText;

/** Preset: tiofText plus tiofAnalyzed. */
static const int tiofMediumRare = tiofRare | tiofAnalyzed;

/** Preset: tiofText plus tiofTagged. */
static const int tiofMedium = tiofRare | tiofTagged;

/** Preset: tiofText, tiofAnalyzed and tiofTagged together. */
static const int tiofWellDone = tiofMediumRare | tiofMedium;
00078 
00080 /*
00081 enum TokenIOModeE {
00082   tiomNone,     ///< no I/O
00083   tiomUnknown,  ///< unknown mode (dangerous)
00084   tiomFilename, ///< named file I/O
00085   tiomFile,     ///< FILE* I/O
00086   tiomFd,       ///< file descriptor I/O
00087   tiomCBuffer,  ///< C memory-buffer I/O
00088   tiomCString,  ///< NUL-terminated string I/O
00089   tiomString,   ///< STL string I/O
00090   tiomUser,     ///< some other user-defined I/O mode
00091   tiomNModes    ///< number of modes; not really an I/O mode itself
00092 };
00093 typedef TokenIOModeE TokenIOMode;
00094 */
00095 
00096 
00098 class TokenIO {
00099 public:
00100   /*--------------------------------------------------------------------*/
00102 
00103 
00111   static int parse_format_string(const std::string &fmtString);
00112 
00120   static int guess_filename_format(const char *filename);
00121 
00126   static bool is_empty_format(int fmt);
00127 
00131   static int sanitize_format(int fmt,
00132                              int fmt_implied=tiofNone,
00133                              int fmt_default=tiofNone);
00134 
00143   static int parse_format_request(const char *request,
00144                                   const char *filename=NULL,
00145                                   int fmt_implied=tiofNone,
00146                                   int fmt_default=tiofNone);
00147 
00149   static std::string format_canonical_string(int fmt);
00151 
00152   /*--------------------------------------------------------------------*/
00154 
00155 
00161   static class TokenReader *new_reader(int fmt);
00162 
00169   static class TokenWriter *new_writer(int fmt);
00171 };
00172 
00173 /*==========================================================================
00174  * TokenReader
00175  *==========================================================================*/
00176 
00178 class TokenReader : public TokenIO {
00179 public:
00181   static const size_t TR_DEFAULT_BUFSIZE = 256;
00182 
00183 public:
00185   int tr_format;
00186 
00188   std::string tr_name;
00189 
00191   mootio::mistream *tr_istream;
00192 
00194   bool tr_istream_created;
00195 
00204   mootToken *tr_token;
00205 
00214   mootSentence *tr_sentence;
00215 
00216 public:
00217   /*------------------------------------------------------------
00218    * TokenReader: Constructors
00219    */
00228   TokenReader(int                fmt  =tiofUnknown,
00229               const std::string &name ="TokenReader")
00230     : tr_format(fmt),
00231       tr_name(name),
00232       tr_istream(NULL),
00233       tr_istream_created(false),
00234       tr_token(NULL),
00235       tr_sentence(NULL)
00236   {};
00237 
00239   virtual ~TokenReader(void)
00240   {
00241     TokenReader::close();
00242   };
00243 
00247   inline void tr_clear(void)
00248   {
00249     if (tr_token) tr_token->clear();
00250     if (tr_sentence) tr_sentence->clear();
00251   };
00253 
00254 
00255   /*------------------------------------------------------------
00256    * TokenReader: Input Selection
00257    */
00260 
00266   virtual void from_mstream(mootio::mistream *mistreamp) {
00267     close();
00268     tr_istream = mistreamp;
00269     byte_number(1);
00270     line_number(1);
00271     column_number(0);
00272     tr_istream_created = false;
00273   };
00274 
00279   virtual void from_mstream(mootio::mistream &mis) {
00280     from_mstream(&mis);
00281   };
00282 
00289   virtual void from_filename(const char *filename)
00290   {
00291     from_mstream(new mootio::mifstream(filename,"rb"));
00292     tr_istream_created = true;
00293     if (!tr_istream->valid()) {
00294       carp("open failed for \"%s\": %s", filename, strerror(errno));
00295       close();
00296     }
00297   };
00298 
00305   virtual void from_file(FILE *file)
00306   {
00307     from_mstream(new mootio::micstream(file));
00308     tr_istream_created = true;
00309   };
00310 
00317   virtual void from_fd(int fd)
00318   {
00319     close();
00320     throw domain_error("from_fd(): not implemented");
00321   };
00322 
00329   virtual void from_buffer(const void *buf, size_t len)
00330   {
00331     from_mstream(new mootio::micbuffer(buf,len));
00332     tr_istream_created = true;
00333   };
00334 
00341   virtual void from_string(const char *s) {
00342     from_buffer(s,strlen(s));
00343   };
00344 
00351   virtual void from_cxxstream(std::istream &is)
00352   {
00353     from_mstream(new mootio::micxxstream(is));
00354     tr_istream_created = true;
00355   };
00356 
00365   virtual void close(void) {
00366     if (tr_istream_created) {
00367       tr_istream->close();
00368       if (tr_istream) delete tr_istream;
00369     }
00370     tr_istream_created = false;
00371     tr_istream = NULL;
00372   };
00374 
00375   /*------------------------------------------------------------
00376    * TokenReader: Token-Level Access
00377    */
00380 
00388   inline mootToken *token(void) { return tr_token; };
00389 
00397   inline mootSentence *sentence(void) { return tr_sentence; };
00398 
00404   virtual mootTokenType get_token(void) {
00405     throw domain_error("TokenReader: get_token() not implemented");
00406   };
00407 
00413   virtual mootTokenType get_sentence(void);
00415 
00416   /*------------------------------------------------------------
00417    * TokenReader: Diagnostics
00418    */
00423   virtual void reader_name(const std::string &myname) { tr_name = myname; };
00424 
00426   virtual size_t line_number(void) { return 0; };
00427 
00429   virtual size_t line_number(size_t n) { return n; };
00430 
00432   virtual size_t column_number(void) { return 0; };
00433 
00435   virtual size_t column_number(size_t n) { return n; };
00436 
00438   virtual size_t byte_number(void) { return 0; };
00439 
00441   virtual size_t byte_number(size_t n) { return n; };
00442 
00444   virtual void carp(const char *fmt, ...);
00446 };
00447 
00448 
00449 /*------------------------------------------------------------
00450  * TokenReaderNative
00451  */
00455 class TokenReaderNative : public TokenReader {
00456 public:
00457   /*----------------------------------------
00458    * Reader: Native: Data
00459    */
00461   mootTokenLexer lexer;
00462 
00464   mootSentence   trn_sentence;
00465 
00466 public:
00467   /*----------------------------------------
00468    * Reading: Native: Methods: Constructors
00469    */
00476   TokenReaderNative(int                fmt  =tiofWellDone,
00477                     const std::string &name ="TokenReaderNative")
00478     : TokenReader(fmt,name)
00479   {
00480     tr_format |= tiofNative;
00481     input_is_tagged(tr_format&tiofTagged);
00482 
00483     tr_sentence = &trn_sentence;
00484     tr_token    = &lexer.mtoken_default;
00485 
00486     lexer.to_file(stderr);
00487   };
00488 
00490   virtual ~TokenReaderNative(void)
00491   {
00492     close();
00493   };
00495 
00496   /*----------------------------------------
00497    * Reader: Native: Methods: Input Selection
00498    */
00500 
00501 
00502   virtual void from_mstream(mootio::mistream *mis);
00504 
00505 
00506   /*----------------------------------------
00507    * Reader: Native: Methods: Input
00508    */
00511   virtual mootTokenType get_token(void);
00512   virtual mootTokenType get_sentence(void);
00514 
00515 
00516   /*----------------------------------------
00517    * Reader: Native: Methods: Diagnostics
00518    */
00521 
00523   virtual size_t line_number(void) { return lexer.theLine; };
00524 
00526   virtual size_t line_number(size_t n) { return lexer.theLine = n; };
00527 
00529   virtual size_t column_number(void) { return lexer.theColumn; };
00530 
00532   virtual size_t column_number(size_t n) { return lexer.theColumn = n; };
00534 
00535 
00536   /*----------------------------------------
00537    * Reader: Native: Methods: New methods
00538    */
00546   inline bool input_is_tagged(void)
00547   {
00548     return lexer.first_analysis_is_best;
00549   };
00550 
00556   inline bool input_is_tagged(bool is_tagged)
00557   {
00558     if (is_tagged) {
00559       tr_format |= tiofTagged;
00560       lexer.first_analysis_is_best = true;
00561       lexer.ignore_first_analysis = true;
00562     } else {
00563       tr_format &= ~tiofTagged;
00564       lexer.first_analysis_is_best = false;
00565       lexer.ignore_first_analysis = false;
00566     }
00567     return is_tagged;
00568   };
00570 };
00571 
00572 
00573 /*==========================================================================
00574  * TokenWriter
00575  *==========================================================================*/
00576 
00577 /*------------------------------------------------------------
00578  * TokenWriter
00579  */
00581 class TokenWriter : public TokenIO {
00582 public:
00584   int tw_format;
00585 
00587   std::string tw_name;
00588 
00590   mootio::mostream *tw_ostream;
00591 
00593   bool tw_ostream_created;
00594 
00596   bool tw_is_comment_block;
00597 
00598 public:
00599   /*----------------------------------------
00600    * Writer: Methods
00601    */
00609   TokenWriter(int fmt=tiofWellDone,
00610               const std::string &name="TokenWriter")
00611     : tw_format(fmt),
00612       tw_name(name),
00613       tw_ostream(NULL),
00614       tw_ostream_created(false)
00615   {};
00616 
00618   virtual ~TokenWriter(void)
00619   {
00620     //close();
00621   };
00623 
00624   /*------------------------------------------------------------
00625    * Writer: Methods: Output Selection
00626    */
00629 
00635   virtual void to_mstream(mootio::mostream *mostreamp) {
00636     close();
00637     tw_ostream = mostreamp;
00638     if (!(tw_format&tiofNull) && (!tw_ostream || !tw_ostream->valid())) {
00639       carp("Warning: selecting output to invalid stream");
00640     }
00641     tw_ostream_created = false;
00642   };
00643 
00648   virtual void to_mstream(mootio::mostream &mos) {
00649     to_mstream(&mos);
00650   };
00651 
00657   virtual void to_filename(const char *filename)
00658   {
00659     to_mstream(new mootio::mofstream(filename,"wb"));
00660     tw_ostream_created = true;
00661     if (!tw_ostream->valid()) {
00662       carp("open failed for \"%s\": %s", filename, strerror(errno));
00663       close();
00664     }
00665   };
00666 
00673   virtual void to_file(FILE *file)
00674   {
00675     to_mstream(new mootio::mocstream(file));
00676     tw_ostream_created = true;
00677   };
00678 
00685   virtual void to_fd(int fd)
00686   {
00687     close();
00688     throw domain_error("to_fd(): not implemented.");
00689   };
00690 
00697   virtual void to_cxxstream(std::ostream &os)
00698   {
00699     to_mstream(new mootio::mocxxstream(os));
00700     tw_ostream_created = true;
00701   };
00702 
00711   virtual void close(void) {
00712     if (tw_is_comment_block) put_comment_block_end();
00713     if (tw_ostream && tw_ostream_created) {
00714       tw_ostream->close();
00715       delete tw_ostream;
00716     }
00717     tw_ostream_created = false; 
00718     tw_ostream = NULL;
00719   };
00721 
00722 
00723   /*----------------------------------------*/
00730   virtual void put_token(const mootToken &token) {
00731     throw domain_error("TokenWriter: put_token() not implemented");
00732   };
00733 
00739   virtual void put_sentence(const mootSentence &sentence)
00740   {
00741     for (mootSentence::const_iterator si = sentence.begin(); si != sentence.end(); si++)
00742       put_token(*si);
00743   };
00745 
00746   /*----------------------------------------*/
00754   virtual void put_comment_block_begin(void) {
00755     tw_is_comment_block = true;
00756   };
00757 
00763   virtual void put_comment_block_end(void) {
00764     tw_is_comment_block = false;
00765   };
00766 
00771   virtual void put_comment_buffer(const char *buf, size_t len) {
00772     put_comment_block_begin();
00773     put_raw_buffer(buf,len);
00774     put_comment_block_end();
00775   };
00776 
00781   virtual void put_comment(const char *s) {
00782     put_comment_buffer(s,strlen(s));
00783   };
00784 
00789   virtual void put_comment_buffer(const std::string &s) {
00790     put_comment_buffer(s.data(),s.size());
00791   };
00792 
00797   virtual void printf_comment(const char *fmt, ...);
00799 
00800   /*----------------------------------------*/
00807   virtual void put_raw_buffer(const char *buf, size_t len)
00808   {};
00813   virtual void put_raw(const char *s) {
00814     put_raw_buffer(s,strlen(s));
00815   };
00820   virtual void put_raw(const std::string &s) {
00821     put_raw_buffer(s.data(),s.size());
00822   };
00823 
00828   virtual void printf_raw(const char *fmt, ...);
00830 
00831   /*----------------------------------------*/
00836   virtual void writer_name(const std::string &myname) { tw_name = myname; };
00837 
00839   virtual void carp(const char *fmt, ...);
00841 };
00842 
00843 /*------------------------------------------------------------
00844  * TokenWriterNative
00845  */
00849 class TokenWriterNative : public TokenWriter {
00850 public:
00851   /*----------------------------------------
00852    * Writer: Native: Data
00853    */
00855   mootio::mocbuffer twn_tmpbuf;
00856 
00857 public:
00858   /*----------------------------------------
00859    * Writer: Native: Methods: construction
00860    */
00864   TokenWriterNative(int fmt=tiofWellDone,
00865                     const std::string name="TokenWriterNative")
00866     : TokenWriter(fmt,name)
00867   {
00868     if (! tw_format&tiofNative ) tw_format |= tiofNative;
00869   };
00870 
00872   virtual ~TokenWriterNative(void)
00873   {
00874     //TokenWriterNative::close();
00875   };
00877 
00878   /*----------------------------------------
00879    * Writer: Native: Methods: Output Selection
00880    */
00882   // @ {
00883 
00884   /*
00885    * Finish output to currently selected sink & perform any required
00886    * cleanup operations.
00887    * Used by named-file interface.
00888    */
00889   //virtual void close(void);
00890   // @ }
00891 
00892   /*----------------------------------------
00893    * Writer: Native: Methods: Output
00894    */
00897   virtual void put_token(const mootToken &token) {
00898     _put_token(token,tw_ostream);
00899   };
00900   virtual void put_sentence(const mootSentence &sentence) {
00901     _put_sentence(sentence,tw_ostream);
00902   };
00903 
00904   virtual void put_raw_buffer(const char *buf, size_t len) {
00905     _put_raw_buffer(buf,len,tw_ostream);
00906   };
00908 
00909   /*----------------------------------------
00910    * Writer: Native: Methods: Utilities
00911    */
00915   void _put_token(const mootToken &token, mootio::mostream *os);
00916 
00918   void _put_sentence(const mootSentence &sentence, mootio::mostream *os);
00919 
00921   void _put_comment(const char *buf, size_t len, mootio::mostream *os);
00922 
00924   void _put_raw_buffer(const char *buf, size_t len, mootio::mostream *os);
00925 
00929   inline std::string token2string(const mootToken &token)
00930   {
00931     mostream *tw_ostream_old = tw_ostream;
00932     twn_tmpbuf.clear();
00933     tw_ostream = &twn_tmpbuf;
00934     _put_token(token,tw_ostream);
00935     std::string t2s(twn_tmpbuf.data(), twn_tmpbuf.size());
00936     tw_ostream = tw_ostream_old;
00937     return t2s;
00938   };
00939 
00943   inline std::string sentence2string(const mootSentence &sentence)
00944   {
00945     twn_tmpbuf.clear();
00946     _put_sentence(sentence,&twn_tmpbuf);
00947     return std::string(twn_tmpbuf.data(), twn_tmpbuf.size());
00948   };
00950 };
00951 
00952 }; /*moot_END_NAMESPACE*/
00953 
00954 #endif /* _moot_TOKEN_IO_H */

Generated on Mon Jun 27 13:05:25 2005 for libmoot by  doxygen 1.3.8-20040913