Main Page   Namespace List   Class Hierarchy   Alphabetical List   Compound List   File List   Namespace Members   Compound Members   File Members  

mootToken.h

Go to the documentation of this file.
00001 /* -*- Mode: C++ -*- */
00002 
00003 /*
00004    libmoot : moocow's part-of-speech tagging library
00005    Copyright (C) 2003-2004 by Bryan Jurish <moocow@ling.uni-potsdam.de>
00006 
00007    This program is free software; you can redistribute it and/or modify
00008    it under the terms of the GNU General Public License as published by
00009    the Free Software Foundation; either version 2 of the License, or
00010    (at your option) any later version.
00011 
00012    This program is distributed in the hope that it will be useful,
00013    but WITHOUT ANY WARRANTY; without even the implied warranty of
00014    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
00015    GNU General Public License for more details.
00016 
00017    You should have received a copy of the GNU General Public License
00018    along with this program; if not, write to the Free Software
00019    Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307 USA
00020 */
00021 
00022 /*--------------------------------------------------------------------------
00023  * File: mootToken.h
00024  * Author: Bryan Jurish <moocow@ling.uni-potsdam.de>
00025  * Description:
00026  *   + moocow's PoS tagger : token information
00027  *--------------------------------------------------------------------------*/
00028 
00029 #ifndef _moot_TOKEN_H
00030 #define _moot_TOKEN_H
00031 
00032 #include <ctype.h>
00033 
00034 #include <list>
00035 #include <vector>
00036 #include <set>
00037 #include <string>
00038 
00043 #define MOOT_TNT_COMPAT 1
00044 //#undef MOOT_TNT_COMPAT
00045 
namespace moot {
  // NOTE(review): a using-directive in a header leaks std names to every
  // includer; it is kept because existing callers may rely on it.
  using namespace std;

/*----------------------------------------------------------------------
 * Basic Types
 *----------------------------------------------------------------------*/

/** String type used for tags. */
typedef string mootTagString;

/** String type used for token text. */
typedef string mootTokString;

/** Set of tag strings, as consumed by tokImport() and filled by tokExport(). */
typedef set<mootTagString> mootTagSet;


/*----------------------------------------------------------------------
 * Basic Token Types
 *----------------------------------------------------------------------*/

/**
 * Enumeration of token types.
 * \c NTokTypes must remain the last enumerator: it is the number of real
 * types and sizes the \c mootTokenTypeNames array.
 */
enum mootTokenTypeE {
  /* Output token-types */
  TokTypeUnknown,
  TokTypeVanilla,   /* default type used by the mootToken constructors and clear() */
  TokTypeLibXML,
  TokTypeXMLRaw,
  TokTypeComment,
  TokTypeEOS,
  TokTypeEOF,
  TokTypeUser,
  NTokTypes         /* number of token types (not itself a type) */
};
typedef mootTokenTypeE mootTokenType;

/** Table with one name per token type; defined in another translation unit. */
extern const char* mootTokenTypeNames[NTokTypes];

/*--------------------------------------------------------------------------
 * mootToken
 *--------------------------------------------------------------------------*/

/**
 * A single token: its type, its text, a "best" tag and a list of analyses.
 */
class mootToken {
public:
  /*---------------------------------------------------------------------
   * Embedded Types
   */

  /** Numeric cost type (not used by any member declared in this class). */
  typedef float Cost;

  /** One analysis of a token: a tag plus a free-form details string. */
  class Analysis {
  public:
    /** Tag of this analysis. */
    mootTagString tag;

    /** Details string of this analysis. */
    mootTagString details;

    /*--------------------------------------------------
     * Constructor / Destructor
     */

    /** Default constructor: empty tag, empty details. */
    Analysis(void)
    {}

    /** Construct from a tag; details stay empty. */
    Analysis(const mootTagString &my_tag)
      : tag(my_tag)
    {}

    /** Construct from a tag and a details string. */
    Analysis(const mootTagString &my_tag,
             const mootTagString &my_details)
      : tag(my_tag)
      , details(my_details)
    {}

    /** Reset both tag and details to the empty string. */
    inline void clear(void) {
      tag.clear();
      details.clear();
    }

    /** True iff both tag and details are empty. */
    inline bool empty(void) const {
      return tag.empty() && details.empty();
    }

    /** Lexicographic ordering: by tag first, then by details. */
    friend bool operator<(const Analysis &x, const Analysis &y)
    {
      const int tcomp = x.tag.compare(y.tag);
      if (tcomp != 0) return tcomp < 0;
      return x.details < y.details;
    }

    /** Equality: tag and details must both match. */
    friend bool operator==(const Analysis &x, const Analysis &y)
    {
      return x.tag == y.tag && x.details == y.details;
    }

  }; //-- /mootToken::Analysis


  /**
   * Container of analyses.  A list keeps insertion order and allows
   * duplicate entries.
   */
  typedef list<Analysis> Analyses;

public:
  /*---------------------------------------------------------------------*
   * Data Members
   */

  /** Type of this token. */
  mootTokenType   tok_type;

  /** Text of this token. */
  mootTokString   tok_text;

  /** Best tag of this token; empty if none has been set. */
  mootTagString   tok_besttag;

  /** Analyses of this token, in insertion order. */
  Analyses       tok_analyses;

public:
  /*------------------------------------------------------------
   * Constructors / Destructors
   */

  /** Construct an empty token of the given type. */
  mootToken(mootTokenType type=TokTypeVanilla)
    : tok_type(type)
  {}

  /** Construct a token with the given text and type. */
  mootToken(const mootTokString &text, mootTokenType type=TokTypeVanilla)
    : tok_type(type),
      tok_text(text)
  {}

  /** Construct a vanilla token with the given text and analyses. */
  mootToken(const mootTokString &text,
            const Analyses &analyses)
    : tok_type(TokTypeVanilla),
      tok_text(text),
      tok_analyses(analyses)
  {}

  /** Construct a vanilla token with the given text, analyses and best tag. */
  mootToken(const mootTokString &text,
            const Analyses &analyses,
            const mootTagString &besttag)
    : tok_type(TokTypeVanilla),
      tok_text(text),
      tok_besttag(besttag),
      tok_analyses(analyses)
  {}

  /** Destructor. */
  ~mootToken(void) {}

  /*------------------------------------------------------------
   * Operators
   */

  /** Equality: type, text, best tag and analyses must all match. */
  friend bool operator==(const mootToken &x, const mootToken &y)
  {
    return
      x.tok_type == y.tok_type
      && x.tok_text == y.tok_text
      && x.tok_besttag == y.tok_besttag
      && x.tok_analyses == y.tok_analyses;
  }

  /**
   * Strict weak ordering: lexicographic over (text, best tag, analyses).
   * Each later field is consulted only when all earlier fields compare
   * equal; OR-ing the three comparisons would let both a<b and b<a hold.
   * As before, tok_type does not take part in the ordering.
   */
  friend bool operator <(const mootToken &x, const mootToken &y)
  {
    int cmp = x.tok_text.compare(y.tok_text);
    if (cmp != 0) return cmp < 0;
    cmp = x.tok_besttag.compare(y.tok_besttag);
    if (cmp != 0) return cmp < 0;
    return x.tok_analyses < y.tok_analyses;
  }


  /*------------------------------------------------------------
   * Manipulators: General
   */

  /** Reset to an empty vanilla token. */
  inline void clear(void) {
    tok_type = TokTypeVanilla;
    tok_text.clear();
    tok_besttag.clear();
    tok_analyses.clear();
  }

  /*------------------------------------------------------------
   * Manipulators: specific
   */

  /** Get the token text. */
  inline const mootTokString &text(void) const {
    return tok_text;
  }
  /** Set the token text; returns the stored text. */
  inline mootTokString &text(const mootTokString &text) {
    tok_text = text;
    return tok_text;
  }
  /** Set the token text from the first \a len characters of \a s. */
  inline mootTokString &text(const char *s, size_t len) {
    tok_text.assign(s,len);
    return tok_text;
  }
  /** Append \a text to the token text; returns the stored text. */
  inline mootTokString &textAppend(const mootTokString &text) {
    tok_text.append(text);
    return tok_text;
  }
  /** Append the first \a len characters of \a s to the token text. */
  inline mootTokString &textAppend(const char *s, size_t len) {
    tok_text.append(s, len);
    return tok_text;
  }

  /** Get the best tag. */
  inline const mootTagString &besttag(void) const {
    return tok_besttag;
  }
  /** Set the best tag; returns the stored tag. */
  inline mootTagString &besttag(const mootTagString &besttag) {
    tok_besttag = besttag;
    return tok_besttag;
  }
  /** Append \a text to the best tag; returns the stored tag. */
  inline mootTagString &besttagAppend(const mootTagString &text) {
    tok_besttag.append(text);
    return tok_besttag;
  }
  /** Append the first \a len characters of \a s to the best tag. */
  inline mootTagString &besttagAppend(const char *s, size_t len) {
    tok_besttag.append(s, len);
    return tok_besttag;
  }

  /** Get the token type. */
  inline mootTokenType toktype(void) const { return tok_type; }
  /** Set the token type; returns the stored type. */
  inline mootTokenType toktype(const mootTokenType type) {
    tok_type = type;
    return tok_type;
  }

  /** Get the analyses. */
  inline const Analyses &analyses(void) const { return tok_analyses; }
  /** Replace the analyses with a copy of \a analyses. */
  inline const Analyses &analyses(const Analyses &analyses) {
    tok_analyses = analyses;
    return tok_analyses;
  }

  /** Append a copy of \a analysis (duplicates are not filtered). */
  inline void insert(const Analysis &analysis)
  {
    tok_analyses.push_back(analysis);
  }
  /** Append a new analysis built from \a tag and \a details. */
  inline void insert(const mootTagString &tag, const mootTagString &details)
  {
    //-- build in place to avoid copying a temporary Analysis
    tok_analyses.push_back(Analysis());
    tok_analyses.back().tag = tag;
    tok_analyses.back().details = details;
  }
  /**
   * Append a new analysis built from C strings.
   * A NULL pointer is treated as the empty string, because assigning NULL
   * to a std::string is undefined behaviour.
   */
  inline void insert(const char *tag, const char *details)
  {
    tok_analyses.push_back(Analysis());
    if (tag)     tok_analyses.back().tag = tag;
    if (details) tok_analyses.back().details = details;
  }

  /** True iff at least one analysis carries exactly the tag \a tag. */
  inline bool has_analysis_for_tag(const mootTagString &tag) const
  {
    for (Analyses::const_iterator asi = tok_analyses.begin();
         asi != tok_analyses.end();
         ++asi)
      {
        if (asi->tag == tag) return true;
      }
    return false;
  }

  /** Remove every analysis equal to \a analysis. */
  inline void erase(const Analysis &analysis)
  {
    //-- copy first: `analysis` may refer to an element of tok_analyses,
    //   which would dangle as soon as that element is erased
    const Analysis target(analysis);
    for (Analyses::iterator asi = tok_analyses.begin();
         asi != tok_analyses.end();
         )
      {
        //-- list::erase() invalidates asi; continue from its return value
        if (*asi == target) asi = tok_analyses.erase(asi);
        else ++asi;
      }
  }

  /** Remove every analysis whose tag differs from the current best tag. */
  inline void prune(void)
  {
    for (Analyses::iterator asi = tok_analyses.begin();
         asi != tok_analyses.end();
         )
      {
        //-- list::erase() invalidates asi; continue from its return value
        if (asi->tag != tok_besttag) asi = tok_analyses.erase(asi);
        else ++asi;
      }
  }

  /*------------------------------------------------------------
   * Compatibility
   */

  /**
   * Import from a plain text / tag-set representation.
   * \param src_toktext if non-NULL, replaces the token text.
   * \param src_tagset  if non-NULL, one analysis (tag only, empty details)
   *                    is appended for each tag in the set.
   * Existing analyses are kept.
   */
  inline void tokImport(const mootTokString *src_toktext=NULL,
                        const mootTagSet    *src_tagset=NULL)
  {
    if (src_toktext) tok_text = *src_toktext;
    if (src_tagset) {
      for (mootTagSet::const_iterator tsi = src_tagset->begin();
           tsi != src_tagset->end();
           ++tsi)
        {
          insert(Analysis(*tsi));
        }
    }
  }

  /**
   * Export to a plain text / tag-set representation.
   * \param dst_toktext if non-NULL, receives the token text.
   * \param dst_tagset  if non-NULL, the tag of every analysis is inserted
   *                    into it (the set is not cleared first).
   * \param want_besttag_in_tagset if true and the best tag is non-empty,
   *                    the best tag is inserted into \a dst_tagset as well.
   */
  inline void tokExport(mootTokString *dst_toktext=NULL,
                        mootTagSet *dst_tagset=NULL,
                        bool want_besttag_in_tagset = true) const
  {
    if (dst_toktext) *dst_toktext = tok_text;
    if (dst_tagset) {
      for (Analyses::const_iterator asi = tok_analyses.begin();
           asi != tok_analyses.end();
           ++asi)
        {
          dst_tagset->insert(asi->tag);
        }
      if (want_besttag_in_tagset && !tok_besttag.empty())
        dst_tagset->insert(tok_besttag);
    }
  }

}; //-- /mootToken


/*--------------------------------------------------------------------------
 * mootSentence
 *--------------------------------------------------------------------------*/

/** A sentence is a list of tokens. */
typedef list<mootToken> mootSentence;

/*----------------------------------------------------------------------
 * Pattern-based Typification
 *----------------------------------------------------------------------*/

/**
 * Token "flavors" as assigned by tokenFlavor().
 * \c NTokFlavors must remain the last enumerator: it sizes the
 * \c mootTokenFlavorNames array.
 */
typedef enum {
  TokFlavorAlpha,       /* anything that does not start like a number */
  TokFlavorCard,        /* digits only */
  TokFlavorCardPunct,   /* digits followed by one card-punctuation char */
  TokFlavorCardSuffix,  /* number followed by a suffix */
  TokFlavorCardSeps,    /* digits mixed with card-punctuation chars */
  TokFlavorUnknown,     /* never returned by tokenFlavor() */
  NTokFlavors           /* number of flavors (not itself a flavor) */
} mootTokenFlavor;


/** Table with one name per token flavor; defined in another translation unit. */
extern const char *mootTokenFlavorNames[NTokFlavors];

/**
 * True iff \a c counts as punctuation inside a cardinal number.
 * With MOOT_TNT_COMPAT defined only '.', ',' and '-' qualify; otherwise
 * every character for which ispunct() holds does.
 */
inline bool tokenFlavor_isCardPunctChar(const char c) {
#if !defined(MOOT_TNT_COMPAT)
  //-- This is the "right" way to do it.  The cast is required: passing a
  //   negative char to ispunct() is undefined behaviour.
  return ispunct(static_cast<unsigned char>(c)) != 0;
#else
  //-- ... but TnT seems to do it this way
  return (c=='.'||c==','||c=='-');
#endif // MOOT_TNT_COMPAT
}

/** True iff \a c may appear in a cardinal suffix; currently always true. */
inline bool tokenFlavor_isCardSuffixChar(const char c) {
  (void)c; //-- every character is accepted
  return true;
}

/**
 * Classify \a token by its surface pattern.
 *
 * In the patterns below P is a card-punctuation character (see
 * tokenFlavor_isCardPunctChar()) and D is a decimal digit:
 *  - empty token, or no digit after an optional single leading P
 *                                        : TokFlavorAlpha
 *  - D+                                  : TokFlavorCard
 *  - P D+                                : TokFlavorCardSeps
 *  - P? D+ P                             : TokFlavorCardPunct
 *  - P? D+ P (D|P)+                      : TokFlavorCardSeps
 *  - anything else starting with P? D+   : TokFlavorCardSuffix, except that
 *    with MOOT_TNT_COMPAT defined the result is TokFlavorAlpha when more
 *    than 3 characters remain after the numeric prefix.
 */
inline mootTokenFlavor tokenFlavor(const mootTokString &token)
{
  mootTokString::const_iterator ti = token.begin();
  const mootTokString::const_iterator tend = token.end();
  bool leading_punct = false;

  if (ti == tend) return TokFlavorAlpha;

  if (tokenFlavor_isCardPunctChar(*ti)) {
    leading_punct = true;
    ++ti;
    //-- a lone punctuation character: do not dereference end()
    if (ti == tend) return TokFlavorAlpha;
  }

  //-- casts to unsigned char: isdigit() is undefined for negative values
  if (!isdigit(static_cast<unsigned char>(*ti)))
    return TokFlavorAlpha;

  //-- ^[:digit:]
  for (++ti; ti != tend && isdigit(static_cast<unsigned char>(*ti)); ++ti) {;}  //-- find first non-digit
  //-- ^([:digit:]+)

  if (ti == tend) {
    //-- ^([:digit:]+)$
    return leading_punct ? TokFlavorCardSeps : TokFlavorCard;
  }

  if (tokenFlavor_isCardPunctChar(*ti)) {
    //-- ^([:digit:]+)([:CardPunct:])

    if (++ti == tend)
      //-- ^([:digit:]+)([:CardPunct:])$
      return TokFlavorCardPunct;

    if (isdigit(static_cast<unsigned char>(*ti)) || tokenFlavor_isCardPunctChar(*ti)) {
      //-- ^([:digit:]+)([:CardPunct:])([:digit:])
      for (++ti;
           ti != tend
             && (isdigit(static_cast<unsigned char>(*ti)) || tokenFlavor_isCardPunctChar(*ti));
           ++ti) {;}
      //-- ^([:digit:]+)([:CardPunct:])([[:digit:][:CardPunct:]]+)
      if (ti == tend)
        //-- ^([:digit:]+)([:CardPunct:])([[:digit:]|[:CardPunct:]]+)$
        return TokFlavorCardSeps;
    }
  }

#if defined(MOOT_TNT_COMPAT)
  //-- allow only suffixes of length <= 3 characters
  for (int i = 0; ti != tend && i < 3; ++ti, ++i) {;}
  //-- ^([:digit:]+)([[:digit:][:CardPunct]]*)([^[:digit:][:CardPunct:]])(.{0,3})

  return (ti == tend) ? TokFlavorCardSuffix : TokFlavorAlpha;

#else // !defined(MOOT_TNT_COMPAT)
  //-- allow suffixes of arbitrary length
  return TokFlavorCardSuffix;

#endif // MOOT_TNT_COMPAT
}


} /* namespace moot */
00561 
00562 #endif /* _moot_TOKEN_H */

Generated on Wed Jul 28 15:48:03 2004 for libmoot by doxygen 1.2.15