Main Page   Namespace List   Class Hierarchy   Alphabetical List   Compound List   File List   Namespace Members   Compound Members   File Members  

mootTokenIO.h

Go to the documentation of this file.
00001 /* -*- Mode: C++ -*- */
00002 
00003 /*
00004    libmoot : moocow's part-of-speech tagging library
00005    Copyright (C) 2003-2004 by Bryan Jurish <moocow@ling.uni-potsdam.de>
00006 
00007    This program is free software; you can redistribute it and/or modify
00008    it under the terms of the GNU General Public License as published by
00009    the Free Software Foundation; either version 2 of the License, or
00010    (at your option) any later version.
00011 
00012    This program is distributed in the hope that it will be useful,
00013    but WITHOUT ANY WARRANTY; without even the implied warranty of
00014    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
00015    GNU General Public License for more details.
00016 
00017    You should have received a copy of the GNU General Public License
00018    along with this program; if not, write to the Free Software
00019    Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307 USA
00020 */
00021 
00022 /*--------------------------------------------------------------------------
00023  * File: mootTokenIO.h
00024  * Author: Bryan Jurish <moocow@ling.uni-potsdam.de>
00025  * Description:
00026  *   + moocow's PoS tagger : token I/O
00027  *--------------------------------------------------------------------------*/
00028 
00029 #ifndef _moot_TOKEN_IO_H
00030 #define _moot_TOKEN_IO_H
00031 
00032 #include <mootToken.h>
00033 #include <mootTokenLexer.h>
00034 
00035 #include <mootIO.h>
00036 #include <mootCIO.h>
00037 #include <mootCxxIO.h>
00038 #include <mootBufferIO.h>
00039 
00040 #include <stdio.h>
00041 #include <stdarg.h>
00042 #include <stdexcept>
00043 
00044 /*moot_BEGIN_NAMESPACE*/
00045 namespace moot {
00046 
00047 /*==========================================================================
00048  * TokenIO
00049  *==========================================================================*/
/**
 * Bit flags describing a token I/O format.
 *
 * Every enumerator except tiofNone occupies its own bit, so flags may be
 * combined with bitwise OR and carried around in a plain \c int.
 *
 * NOTE(review): the per-flag remarks below are inferred from the enumerator
 * names only -- confirm against the library documentation.
 */
enum TokenIOFormatE {
  tiofNone     = 0,        /* no bits set */
  tiofUnknown  = 1 << 0,   /* presumably: format not (yet) determined */
  tiofNull     = 1 << 1,   /* presumably: null I/O */
  tiofUser     = 1 << 2,   /* presumably: user-defined format */
  tiofNative   = 1 << 3,   /* presumably: native text format */
  tiofXML      = 1 << 4,   /* presumably: XML format */
  tiofConserve = 1 << 5,   /* presumably: conserve raw input */
  tiofPretty   = 1 << 6,   /* presumably: pretty-printed output */
  tiofText     = 1 << 7,   /* presumably: token text is present */
  tiofAnalyzed = 1 << 8,   /* presumably: analyses are present */
  tiofTagged   = 1 << 9,   /* presumably: a tag is present */
  tiofPruned   = 1 << 10   /* presumably: analyses have been pruned */
};

/** Convenience alias for TokenIOFormatE. */
typedef TokenIOFormatE TokenIOFormat;
00066 
00068 static const int tiofRare = tiofText;
00069 
00071 static const int tiofMediumRare = tiofText|tiofAnalyzed;
00072 
00074 static const int tiofMedium = tiofText|tiofTagged;
00075 
00077 static const int tiofWellDone = tiofText|tiofAnalyzed|tiofTagged;
00078 
00080 /*
00081 enum TokenIOModeE {
00082   tiomNone,     
00083   tiomUnknown,  
00084   tiomFilename, 
00085   tiomFile,     
00086   tiomFd,       
00087   tiomCBuffer,  
00088   tiomCString,  
00089   tiomString,   
00090   tiomUser,     
00091   tiomNModes    
00092 };
00093 typedef TokenIOModeE TokenIOMode;
00094 */
00095 
00096 
00098 class TokenIO {
00099 public:
00100   /*--------------------------------------------------------------------*/
00102 
00103 
00111   static int parse_format_string(const std::string &fmtString);
00112 
00120   static int guess_filename_format(const char *filename);
00121 
00126   static bool is_empty_format(int fmt);
00127 
00131   static int sanitize_format(int fmt,
00132                              int fmt_implied=tiofNone,
00133                              int fmt_default=tiofNone);
00134 
00143   static int parse_format_request(const char *request,
00144                                   const char *filename=NULL,
00145                                   int fmt_implied=tiofNone,
00146                                   int fmt_default=tiofNone);
00147 
00149   static std::string format_canonical_string(int fmt);
00151 
00152   /*--------------------------------------------------------------------*/
00154 
00155 
00161   static class TokenReader *new_reader(int fmt);
00162 
00169   static class TokenWriter *new_writer(int fmt);
00171 };
00172 
00173 /*==========================================================================
00174  * TokenReader
00175  *==========================================================================*/
00176 
00178 class TokenReader : public TokenIO {
00179 public:
00181   static const size_t TR_DEFAULT_BUFSIZE = 256;
00182 
00183 public:
00185   int tr_format;
00186 
00188   std::string tr_name;
00189 
00191   mootio::mistream *tr_istream;
00192 
00194   bool tr_istream_created;
00195 
00204   mootToken *tr_token;
00205 
00214   mootSentence *tr_sentence;
00215 
00216 public:
00217   /*------------------------------------------------------------
00218    * TokenReader: Constructors
00219    */
00228   TokenReader(int                fmt  =tiofUnknown,
00229               const std::string &name ="TokenReader")
00230     : tr_format(fmt),
00231       tr_name(name),
00232       tr_istream(NULL),
00233       tr_istream_created(false),
00234       tr_token(NULL),
00235       tr_sentence(NULL)
00236   {};
00237 
00239   virtual ~TokenReader(void)
00240   {
00241     TokenReader::close();
00242   };
00243 
00247   inline void tr_clear(void)
00248   {
00249     if (tr_token) tr_token->clear();
00250     if (tr_sentence) tr_sentence->clear();
00251   };
00253 
00254 
00255   /*------------------------------------------------------------
00256    * TokenReader: Input Selection
00257    */
00260 
00266   virtual void from_mstream(mootio::mistream *mistreamp) {
00267     close();
00268     tr_istream = mistreamp;
00269     byte_number(1);
00270     line_number(1);
00271     column_number(0);
00272     tr_istream_created = false;
00273   };
00274 
00279   virtual void from_mstream(mootio::mistream &mis) {
00280     from_mstream(&mis);
00281   };
00282 
00289   virtual void from_filename(const char *filename)
00290   {
00291     from_mstream(new mootio::mifstream(filename,"rb"));
00292     tr_istream_created = true;
00293     if (!tr_istream->valid()) {
00294       carp("open failed for \"%s\": %s", filename, strerror(errno));
00295       close();
00296     }
00297   };
00298 
00305   virtual void from_file(FILE *file)
00306   {
00307     from_mstream(new mootio::micstream(file));
00308     tr_istream_created = true;
00309   };
00310 
00317   virtual void from_fd(int fd)
00318   {
00319     close();
00320     throw domain_error("from_fd(): not implemented");
00321   };
00322 
00329   virtual void from_buffer(const void *buf, size_t len)
00330   {
00331     from_mstream(new mootio::micbuffer(buf,len));
00332     tr_istream_created = true;
00333   };
00334 
00341   virtual void from_string(const char *s) {
00342     from_buffer(s,strlen(s));
00343   };
00344 
00351   virtual void from_cxxstream(std::istream &is)
00352   {
00353     from_mstream(new mootio::micxxstream(is));
00354     tr_istream_created = true;
00355   };
00356 
00365   virtual void close(void) {
00366     if (tr_istream_created) {
00367       tr_istream->close();
00368       if (tr_istream) delete tr_istream;
00369     }
00370     tr_istream_created = false;
00371     tr_istream = NULL;
00372   };
00374 
00375   /*------------------------------------------------------------
00376    * TokenReader: Token-Level Access
00377    */
00380 
00388   inline mootToken *token(void) { return tr_token; };
00389 
00397   inline mootSentence *sentence(void) { return tr_sentence; };
00398 
00404   virtual mootTokenType get_token(void) {
00405     throw domain_error("TokenReader: get_token() not implemented");
00406   };
00407 
00413   virtual mootTokenType get_sentence(void);
00415 
00416   /*------------------------------------------------------------
00417    * TokenReader: Diagnostics
00418    */
00423   virtual void reader_name(const std::string &myname) { tr_name = myname; };
00424 
00426   virtual size_t line_number(void) { return 0; };
00427 
00429   virtual size_t line_number(size_t n) { return n; };
00430 
00432   virtual size_t column_number(void) { return 0; };
00433 
00435   virtual size_t column_number(size_t n) { return n; };
00436 
00438   virtual size_t byte_number(void) { return 0; };
00439 
00441   virtual size_t byte_number(size_t n) { return n; };
00442 
00444   virtual void carp(const char *fmt, ...);
00446 };
00447 
00448 
00449 /*------------------------------------------------------------
00450  * TokenReaderNative
00451  */
00455 class TokenReaderNative : public TokenReader {
00456 public:
00457   /*----------------------------------------
00458    * Reader: Native: Data
00459    */
00461   mootTokenLexer lexer;
00462 
00464   mootSentence   trn_sentence;
00465 
00466 public:
00467   /*----------------------------------------
00468    * Reading: Native: Methods: Constructors
00469    */
00476   TokenReaderNative(int                fmt  =tiofWellDone,
00477                     const std::string &name ="TokenReaderNative")
00478     : TokenReader(fmt,name)
00479   {
00480     tr_format |= tiofNative;
00481     input_is_tagged(tr_format&tiofTagged);
00482 
00483     tr_sentence = &trn_sentence;
00484     tr_token    = &lexer.mtoken_default;
00485 
00486     lexer.to_file(stderr);
00487   };
00488 
00490   virtual ~TokenReaderNative(void)
00491   {
00492     close();
00493   };
00495 
00496   /*----------------------------------------
00497    * Reader: Native: Methods: Input Selection
00498    */
00500 
00501 
00502   virtual void from_mstream(mootio::mistream *mis);
00504 
00505 
00506   /*----------------------------------------
00507    * Reader: Native: Methods: Input
00508    */
00511   virtual mootTokenType get_token(void);
00512   virtual mootTokenType get_sentence(void);
00514 
00515 
00516   /*----------------------------------------
00517    * Reader: Native: Methods: Diagnostics
00518    */
00521 
00523   virtual size_t line_number(void) { return lexer.theLine; };
00524 
00526   virtual size_t line_number(size_t n) { return lexer.theLine = n; };
00527 
00529   virtual size_t column_number(void) { return lexer.theColumn; };
00530 
00532   virtual size_t column_number(size_t n) { return lexer.theColumn = n; };
00534 
00535 
00536   /*----------------------------------------
00537    * Reader: Native: Methods: New methods
00538    */
00546   inline bool input_is_tagged(void)
00547   {
00548     return lexer.first_analysis_is_best;
00549   };
00550 
00556   inline bool input_is_tagged(bool is_tagged)
00557   {
00558     if (is_tagged) {
00559       tr_format |= tiofTagged;
00560       lexer.first_analysis_is_best = true;
00561       lexer.ignore_first_analysis = true;
00562     } else {
00563       tr_format &= ~tiofTagged;
00564       lexer.first_analysis_is_best = false;
00565       lexer.ignore_first_analysis = false;
00566     }
00567     return is_tagged;
00568   };
00570 };
00571 
00572 
00573 /*==========================================================================
00574  * TokenWriter
00575  *==========================================================================*/
00576 
00577 /*------------------------------------------------------------
00578  * TokenWriter
00579  */
00581 class TokenWriter : public TokenIO {
00582 public:
00584   int tw_format;
00585 
00587   std::string tw_name;
00588 
00590   mootio::mostream *tw_ostream;
00591 
00593   bool tw_ostream_created;
00594 
00596   bool tw_is_comment_block;
00597 
00598 public:
00599   /*----------------------------------------
00600    * Writer: Methods
00601    */
00609   TokenWriter(int fmt=tiofWellDone,
00610               const std::string &name="TokenWriter")
00611     : tw_format(fmt),
00612       tw_name(name),
00613       tw_ostream(NULL),
00614       tw_ostream_created(false)
00615   {};
00616 
00618   virtual ~TokenWriter(void)
00619   {
00620     //close();
00621   };
00623 
00624   /*------------------------------------------------------------
00625    * Writer: Methods: Output Selection
00626    */
00629 
00635   virtual void to_mstream(mootio::mostream *mostreamp) {
00636     close();
00637     tw_ostream = mostreamp;
00638     if (!(tw_format&tiofNull) && (!tw_ostream || !tw_ostream->valid())) {
00639       carp("Warning: selecting output to invalid stream");
00640     }
00641     tw_ostream_created = false;
00642   };
00643 
00648   virtual void to_mstream(mootio::mostream &mos) {
00649     to_mstream(&mos);
00650   };
00651 
00657   virtual void to_filename(const char *filename)
00658   {
00659     to_mstream(new mootio::mofstream(filename,"wb"));
00660     tw_ostream_created = true;
00661     if (!tw_ostream->valid()) {
00662       carp("open failed for \"%s\": %s", filename, strerror(errno));
00663       close();
00664     }
00665   };
00666 
00673   virtual void to_file(FILE *file)
00674   {
00675     to_mstream(new mootio::mocstream(file));
00676     tw_ostream_created = true;
00677   };
00678 
00685   virtual void to_fd(int fd)
00686   {
00687     close();
00688     throw domain_error("to_fd(): not implemented.");
00689   };
00690 
00697   virtual void to_cxxstream(std::ostream &os)
00698   {
00699     to_mstream(new mootio::mocxxstream(os));
00700     tw_ostream_created = true;
00701   };
00702 
00711   virtual void close(void) {
00712     if (tw_is_comment_block) put_comment_block_end();
00713     if (tw_ostream && tw_ostream_created) {
00714       tw_ostream->close();
00715       delete tw_ostream;
00716     }
00717     tw_ostream_created = false; 
00718     tw_ostream = NULL;
00719   };
00721 
00722 
00723   /*----------------------------------------*/
00730   virtual void put_token(const mootToken &token) {
00731     throw domain_error("TokenWriter: put_token() not implemented");
00732   };
00733 
00739   virtual void put_sentence(const mootSentence &sentence)
00740   {
00741     for (mootSentence::const_iterator si = sentence.begin(); si != sentence.end(); si++)
00742       put_token(*si);
00743   };
00745 
00746   /*----------------------------------------*/
00754   virtual void put_comment_block_begin(void) {
00755     tw_is_comment_block = true;
00756   };
00757 
00763   virtual void put_comment_block_end(void) {
00764     tw_is_comment_block = false;
00765   };
00766 
00771   virtual void put_comment_buffer(const char *buf, size_t len) {
00772     put_comment_block_begin();
00773     put_raw_buffer(buf,len);
00774     put_comment_block_end();
00775   };
00776 
00781   virtual void put_comment(const char *s) {
00782     put_comment_buffer(s,strlen(s));
00783   };
00784 
00789   virtual void put_comment_buffer(const std::string &s) {
00790     put_comment_buffer(s.data(),s.size());
00791   };
00792 
00797   virtual void printf_comment(const char *fmt, ...);
00799 
00800   /*----------------------------------------*/
00807   virtual void put_raw_buffer(const char *buf, size_t len)
00808   {};
00813   virtual void put_raw(const char *s) {
00814     put_raw_buffer(s,strlen(s));
00815   };
00820   virtual void put_raw(const std::string &s) {
00821     put_raw_buffer(s.data(),s.size());
00822   };
00823 
00828   virtual void printf_raw(const char *fmt, ...);
00830 
00831   /*----------------------------------------*/
00836   virtual void writer_name(const std::string &myname) { tw_name = myname; };
00837 
00839   virtual void carp(const char *fmt, ...);
00841 };
00842 
00843 /*------------------------------------------------------------
00844  * TokenWriterNative
00845  */
00849 class TokenWriterNative : public TokenWriter {
00850 public:
00851   /*----------------------------------------
00852    * Writer: Native: Data
00853    */
00855   mootio::mocbuffer twn_tmpbuf;
00856 
00857 public:
00858   /*----------------------------------------
00859    * Writer: Native: Methods: construction
00860    */
00864   TokenWriterNative(int fmt=tiofWellDone,
00865                     const std::string name="TokenWriterNative")
00866     : TokenWriter(fmt,name)
00867   {
00868     if (! tw_format&tiofNative ) tw_format |= tiofNative;
00869   };
00870 
00872   virtual ~TokenWriterNative(void)
00873   {
00874     //TokenWriterNative::close();
00875   };
00877 
00878   /*----------------------------------------
00879    * Writer: Native: Methods: Output Selection
00880    */
00882   // @ {
00883 
00884   /*
00885    * Finish output to currently selected sink & perform any required
00886    * cleanup operations.
00887    * Used by named-file interface.
00888    */
00889   //virtual void close(void);
00890   // @ }
00891 
00892   /*----------------------------------------
00893    * Writer: Native: Methods: Output
00894    */
00897   virtual void put_token(const mootToken &token) {
00898     _put_token(token,tw_ostream);
00899   };
00900   virtual void put_sentence(const mootSentence &sentence) {
00901     _put_sentence(sentence,tw_ostream);
00902   };
00903 
00904   virtual void put_raw_buffer(const char *buf, size_t len) {
00905     _put_raw_buffer(buf,len,tw_ostream);
00906   };
00908 
00909   /*----------------------------------------
00910    * Writer: Native: Methods: Utilities
00911    */
00915   void _put_token(const mootToken &token, mootio::mostream *os);
00916 
00918   void _put_sentence(const mootSentence &sentence, mootio::mostream *os);
00919 
00921   void _put_comment(const char *buf, size_t len, mootio::mostream *os);
00922 
00924   void _put_raw_buffer(const char *buf, size_t len, mootio::mostream *os);
00925 
00929   inline std::string token2string(const mootToken &token)
00930   {
00931     twn_tmpbuf.clear();
00932     _put_token(token,&twn_tmpbuf);
00933     return std::string(twn_tmpbuf.data(), twn_tmpbuf.size());
00934   };
00935 
00939   inline std::string sentence2string(const mootSentence &sentence)
00940   {
00941     twn_tmpbuf.clear();
00942     _put_sentence(sentence,&twn_tmpbuf);
00943     return std::string(twn_tmpbuf.data(), twn_tmpbuf.size());
00944   };
00946 };
00947 
00948 }; /*moot_END_NAMESPACE*/
00949 
00950 #endif /* _moot_TOKEN_IO_H */

Generated on Wed Jul 28 15:48:03 2004 for libmoot by doxygen 1.2.15