Main Page   Namespace List   Class Hierarchy   Alphabetical List   Compound List   File List   Namespace Members   Compound Members   File Members  

mootToken.h

Go to the documentation of this file.
00001 /* -*- Mode: C++ -*- */
00002 
00003 /*
00004    libmoot : moocow's part-of-speech tagging library
00005    Copyright (C) 2003-2005 by Bryan Jurish <moocow@ling.uni-potsdam.de>
00006 
00007    This library is free software; you can redistribute it and/or
00008    modify it under the terms of the GNU Lesser General Public
00009    License as published by the Free Software Foundation; either
00010    version 2.1 of the License, or (at your option) any later version.
00011    
00012    This library is distributed in the hope that it will be useful,
00013    but WITHOUT ANY WARRANTY; without even the implied warranty of
00014    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
00015    Lesser General Public License for more details.
00016    
00017    You should have received a copy of the GNU Lesser General Public
00018    License along with this library; if not, write to the Free Software
00019    Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
00020 */
00021 
00022 /*--------------------------------------------------------------------------
00023  * File: mootToken.h
00024  * Author: Bryan Jurish <moocow@ling.uni-potsdam.de>
00025  * Description:
00026  *   + moocow's PoS tagger : token information
00027  *--------------------------------------------------------------------------*/
00028 
00029 #ifndef _moot_TOKEN_H
00030 #define _moot_TOKEN_H
00031 
00032 #include <ctype.h>
00033 
00034 #include <list>
00035 #include <vector>
00036 #include <set>
00037 #include <string>
00038 #include <mootTypes.h>
00039 
00044 #define MOOT_TNT_COMPAT 1
00045 //#undef MOOT_TNT_COMPAT
00046 
00047 namespace moot {
00048   using namespace std;
00049 
00050 /*----------------------------------------------------------------------
00051  * Basic Types
00052  *----------------------------------------------------------------------*/
00053 
/** String type used for part-of-speech tags. */
typedef std::string mootTagString;

/** String type used for token text. */
typedef std::string mootTokString;

/** Ordered set of unique tag strings. */
typedef std::set<mootTagString> mootTagSet;
00062 
00063 
00064 /*----------------------------------------------------------------------
00065  * Basic Token Types
00066  *----------------------------------------------------------------------*/
/**
 * Basic token types.
 * Enumerator values are consecutive starting at 0, so NTokTypes equals the
 * number of real token types and can be used as an array dimension.
 */
enum mootTokenTypeE {
  /* Output token-types */
  TokTypeUnknown = 0,
  TokTypeVanilla = 1,
  TokTypeLibXML  = 2,
  TokTypeXMLRaw  = 3,
  TokTypeComment = 4,
  TokTypeEOS     = 5,
  TokTypeEOF     = 6,
  TokTypeUser    = 7,
  NTokTypes      = 8   /* number of token types (not itself a token type) */
};

/** Convenience alias for mootTokenTypeE. */
typedef mootTokenTypeE mootTokenType;

/** Table of token-type names, one entry per enumerator below NTokTypes (defined elsewhere). */
extern const char* mootTokenTypeNames[NTokTypes];
00083 
00084 /*--------------------------------------------------------------------------
00085  * mootToken
00086  *--------------------------------------------------------------------------*/
00090 class mootToken {
00091 public:
00092   /*---------------------------------------------------------------------
00093    * Embedded Types
00094    */
00095 
00097   typedef float Cost;
00098 
00100   class Analysis {
00101   public:
00103     //mootTokenType type;
00104 
00106     mootTagString tag;
00107 
00109     mootTagString details;
00110 
00112     ProbT prob;
00113 
00114     /*--------------------------------------------------
00115      * Constructor / Destructor
00116      */
00118     Analysis(void)
00119         : prob(0)
00120     {};
00121 
00123     Analysis(const mootTagString &my_tag
00124              //, mootTokenType typ=TokTypeVanilla
00125              )
00126       : tag(my_tag),
00127         prob(0)
00128     {};
00129 
00131     Analysis(const mootTagString &my_tag,
00132              const mootTagString &my_details)
00133       : tag(my_tag),
00134         details(my_details),
00135         prob(0)
00136     {};
00137 
00139     Analysis(const mootTagString &my_tag,
00140              const mootTagString &my_details,
00141              const ProbT my_prob)
00142       : tag(my_tag),
00143         details(my_details),
00144         prob(my_prob)
00145     {};
00146 
00148     inline void clear(void) {
00149       tag.clear();
00150       details.clear();
00151       prob = 0;
00152     };
00153 
00155     inline bool empty(void) const {
00156       return tag.empty() && details.empty();
00157     }
00158 
00160     friend bool operator<(const Analysis &x, const Analysis &y)
00161     {
00162       if (x.prob != y.prob) return x.prob < y.prob;
00163       int tcomp = x.tag.compare(y.tag);
00164       return (tcomp < 0
00165               ? true
00166               : (tcomp > 0
00167                  ? false
00168                  : x.details < y.details));
00169     };
00170 
00172     friend bool operator==(const Analysis &x, const Analysis &y)
00173     {
00174       return x.prob == y.prob && x.tag == y.tag && x.details == y.details;
00175     }
00176     
00177   }; //-- /mootToken::Analysis
00178 
00179 
00181   //typedef set<Analysis> AnalysisSet;
00182   //typedef vector<Analysis> Analyses;
00183   typedef list<Analysis> Analyses;
00184 
00185 public:
00186   /*---------------------------------------------------------------------*
00187    * Data Members
00188    */
00189 
00193   mootTokenType   tok_type;
00194 
00198   mootTokString   tok_text;
00199   
00203   mootTagString   tok_besttag;
00204 
00208   Analyses       tok_analyses;
00209 
00210 public:
00211   /*------------------------------------------------------------
00212    * Constructors / Destructors
00213    */
00215   mootToken(mootTokenType type=TokTypeVanilla)
00216     : tok_type(type)
00217   {};
00218 
00220   mootToken(const mootTokString &text, mootTokenType type=TokTypeVanilla)
00221     : tok_type(type),
00222       tok_text(text)
00223   {};
00224 
00226   mootToken(const mootTokString &text,
00227             const Analyses &analyses)
00228     : tok_type(TokTypeVanilla),
00229       tok_text(text),
00230       tok_analyses(analyses)
00231   {};
00232 
00234   mootToken(const mootTokString &text,
00235             const Analyses &analyses,
00236             const mootTagString &besttag)
00237     : tok_type(TokTypeVanilla),
00238       tok_text(text),
00239       tok_besttag(besttag),
00240       tok_analyses(analyses)
00241   {};
00242 
00244   /*
00245   mootToken(mootTokenType type=TokTypeVanilla, const char *text, size_t len)
00246     : tok_type(type),
00247       tok_text(text,len)
00248   {};
00249   */
00250 
00251   /* Destructor */
00252   ~mootToken(void) {};
00253 
00254   /*------------------------------------------------------------
00255    * Operators
00256    */
00258   friend bool operator==(const mootToken &x, const mootToken &y)
00259   {
00260     return
00261       x.tok_type == y.tok_type
00262       && x.tok_text == y.tok_text
00263       && x.tok_besttag == y.tok_besttag
00264       && x.tok_analyses == y.tok_analyses;
00265   };
00266 
00268   friend bool operator <(const mootToken &x, const mootToken &y)
00269   {
00270     return
00271       x.tok_text < y.tok_text
00272       || x.tok_besttag < y.tok_besttag
00273       || x.tok_analyses < y.tok_analyses;
00274   };
00275  
00276 
00277   /*------------------------------------------------------------
00278    * Manipulators: General
00279    */
00281   inline void clear(void) {
00282     tok_type = TokTypeVanilla;
00283     tok_text.clear();
00284     tok_besttag.clear();
00285     tok_analyses.clear();
00286   };
00287 
00288   /*------------------------------------------------------------
00289    * Manipulators: specific
00290    */
00292   inline const mootTokString &text(void) const {
00293     return tok_text;
00294   };
00296   inline mootTokString &text(const mootTokString &text) {
00297     tok_text = text;
00298     return tok_text;
00299   }; 
00301   inline mootTokString &text(const char *s, size_t len) {
00302     tok_text.assign(s,len);
00303     return tok_text;
00304   }; 
00306   inline mootTokString &textAppend(const mootTokString &text) {
00307     tok_text.append(text);
00308     return tok_text;
00309   };
00311   inline mootTokString &textAppend(const char *s, size_t len) {
00312     tok_text.append(s, len);
00313     return tok_text;
00314   };
00315 
00317   inline const mootTagString &besttag(void) const {
00318     return tok_besttag;
00319   };
00321   inline mootTagString &besttag(const mootTagString &besttag) {
00322     tok_besttag = besttag;
00323     return tok_besttag;
00324   };
00326   inline mootTagString &besttagAppend(const mootTagString &text) {
00327     tok_besttag.append(text);
00328     return tok_besttag;
00329   };
00331   inline mootTagString &besttagAppend(const char *s, size_t len) {
00332     tok_besttag.append(s, len);
00333     return tok_besttag;
00334   };
00335 
00337   inline mootTokenType toktype(void) const { return tok_type; }
00339   inline mootTokenType toktype(const mootTokenType type) {
00340     tok_type = type;
00341     return tok_type;
00342   };
00343 
00345   inline const Analyses &analyses(void) const { return tok_analyses; };
00347   inline const Analyses &analyses(const Analyses &analyses) {
00348     tok_analyses = analyses;
00349     return tok_analyses;
00350   };
00352   inline void insert(const Analysis &analysis)
00353   {
00354     //tok_analyses.insert(analysis);
00355     tok_analyses.push_back(analysis);
00356   };
00358   inline void insert(const mootTagString &tag, const mootTagString &details)
00359   {
00360     //insert(Analysis(tag,details));
00361     tok_analyses.push_back(Analysis());
00362     tok_analyses.back().tag = tag;
00363     tok_analyses.back().details = details;
00364   };
00366   inline void insert(const char *tag, const char *details)
00367   {
00368     //insert(Analysis(tag,details));
00369     tok_analyses.push_back(Analysis());
00370     tok_analyses.back().tag = tag;
00371     tok_analyses.back().details = details;
00372   };
00374   inline bool has_analysis_for_tag(const mootTagString &tag) const
00375   {
00376     for (Analyses::const_iterator asi = tok_analyses.begin();
00377          asi != tok_analyses.end();
00378          asi++)
00379       {
00380         if (asi->tag == tag) return true;
00381       }
00382     return false;
00383   };
00385   inline void erase(const Analysis &analysis)
00386   {
00387     for (Analyses::iterator asi = tok_analyses.begin();
00388          asi != tok_analyses.end();
00389          )
00390       {
00391         if (*asi == analysis) tok_analyses.erase(asi);
00392         else asi++;
00393       }
00394   };
00396   inline void prune(void)
00397   {
00398     for (Analyses::iterator asi = tok_analyses.begin();
00399          asi != tok_analyses.end();
00400          )
00401       {
00402         if (asi->tag != tok_besttag) tok_analyses.erase(asi);
00403         else asi++;
00404       }
00405   };
00406 
00407   /*------------------------------------------------------------
00408    * Compatibility
00409    */
00416   inline void tokImport(const mootTokString *src_toktext=NULL,
00417                         const mootTagSet    *src_tagset=NULL)
00418   {
00419     if (src_toktext) tok_text = *src_toktext;
00420     if (src_tagset) {
00421       for (mootTagSet::const_iterator tsi = src_tagset->begin();
00422            tsi != src_tagset->end();
00423            tsi++)
00424         {
00425           insert(Analysis(*tsi));
00426         }
00427     }
00428   };
00429 
00440   inline void tokExport(mootTokString *dst_toktext=NULL,
00441                         mootTagSet *dst_tagset=NULL,
00442                         bool want_besttag_in_tagset = true) const
00443   {
00444     if (dst_toktext) *dst_toktext = tok_text;
00445     if (dst_tagset) {
00446       for (Analyses::const_iterator asi = tok_analyses.begin();
00447            asi != tok_analyses.end();
00448            asi++
00449              //asi = upper_bound(asi->tag)
00450            )
00451         {
00452           dst_tagset->insert(asi->tag);
00453         }
00454       if (want_besttag_in_tagset && !tok_besttag.empty())
00455         dst_tagset->insert(tok_besttag);
00456     }
00457   };
00458   
00459 }; //-- /mootToken
00460 
00461 
00462 /*--------------------------------------------------------------------------
00463  * mootSentence
00464  *--------------------------------------------------------------------------*/
00465 
00467 typedef list<mootToken> mootSentence;
00468 
00470 //typedef vector<mootToken> mootSentence;
00471 
00472 /*----------------------------------------------------------------------
00473  * Pattern-based Typification
00474  *----------------------------------------------------------------------*/
00475 
/**
 * Token "flavors" as returned by tokenFlavor().
 * Enumerator values are consecutive starting at 0, so NTokFlavors equals the
 * number of real flavors and can be used as an array dimension.
 */
enum mootTokenFlavor {
  TokFlavorAlpha      = 0,
  TokFlavorCard       = 1,
  TokFlavorCardPunct  = 2,
  TokFlavorCardSuffix = 3,
  TokFlavorCardSeps   = 4,
  TokFlavorUnknown    = 5,
  //TokFlavorSpecial,    /* A literal '@CARD', '@CARDPUNCT', etc. */
  NTokFlavors         = 6   /* number of flavors (not itself a flavor) */
};


/** Table of flavor names, one entry per enumerator below NTokFlavors (defined elsewhere). */
extern const char *mootTokenFlavorNames[NTokFlavors];
00491 
/**
 * \brief Test whether \a c counts as a punctuation character inside a
 *        cardinal-number token.
 *
 * With MOOT_TNT_COMPAT defined, only '.', ',' and '-' qualify.
 * Otherwise any character for which ispunct() is true qualifies.
 *
 * \param c character to test
 * \return true iff \a c is a cardinal-punctuation character
 */
inline bool tokenFlavor_isCardPunctChar(const char c) {
#if !defined(MOOT_TNT_COMPAT)
  //-- This is the "right" way to do it.
  //   The cast matters: passing a negative plain char to ispunct() is undefined.
  return ispunct(static_cast<unsigned char>(c)) != 0;
#else
  //-- ... but TnT seems to do it this way
  return (c == '.' || c == ',' || c == '-');
#endif // MOOT_TNT_COMPAT
}
00501 
/**
 * \brief Test whether \a c may appear in the suffix of a cardinal-number token.
 *
 * Currently every character is accepted; the argument is ignored.
 *
 * \param c character to test (unused)
 * \return always true
 */
inline bool tokenFlavor_isCardSuffixChar(const char c) {
  (void)c;  //-- intentionally unused
  return true;
}
00510 
00512 inline mootTokenFlavor tokenFlavor(const mootTokString &token)
00513 {
00514   mootTokString::const_iterator ti = token.begin();
00515   bool leading_punct = false;
00516 
00517   if (ti==token.end()) return TokFlavorAlpha;
00518   else if (tokenFlavor_isCardPunctChar(*ti)) {
00519     leading_punct = true;
00520     ti++;
00521   }
00522   
00523   if (!isdigit(*ti))
00524     return TokFlavorAlpha;
00525     
00526   //-- ^[:digit:]
00527   for (ti++; ti != token.end() && isdigit(*ti); ti++) {;}  //-- find first non-digit
00528   //-- ^([:digit:]+)
00529   
00530   if (ti == token.end()) {
00531     //-- ^([:digit:]+)$
00532     if (!leading_punct) return TokFlavorCard;
00533     else return TokFlavorCardSeps;
00534   }
00535 
00536   else if (tokenFlavor_isCardPunctChar(*ti)) {
00537     //-- ^([:digit:]+)([:CardPunct:])
00538 
00539     if (++ti == token.end())
00540       //-- ^([:digit:]+)([:CardPunct:])$
00541       return TokFlavorCardPunct;
00542 
00543     else if (isdigit(*ti)  || tokenFlavor_isCardPunctChar(*ti)) {
00544       //-- ^([:digit:]+)([:CardPunct:])([:digit:])
00545       for (ti++; ti != token.end() && (isdigit(*ti) || tokenFlavor_isCardPunctChar(*ti)); ti++) {;}
00546       //-- ^([:digit:]+)([:CardPunct:])([[:digit:][:CardPunct:]]+)
00547       if (ti == token.end())
00548         //-- ^([:digit:]+)([:CardPunct:])([[:digit:]|[:CardPunct:]]+)$
00549         return TokFlavorCardSeps;
00550     }
00551   }
00552 
00553 #if defined(MOOT_TNT_COMPAT)
00554   //-- allow only suffixes of length <= 3 characters
00555   for (int i=0 ; ti != token.end() && i < 3 ; ti++, i++) {;}
00556   //-- ^([:digit:]+)([[:digit:][:CardPunct]]*)([^[:digit:][:CardPunct:]])(.{0,3})
00557 
00558   if (ti == token.end())
00559     //-- ^([:digit:]+)([[:digit:][:CardPunct]]*)([^[:digit:][:CardPunct:]])(.{0,3})$
00560     return TokFlavorCardSuffix;
00561 
00562 #else // !defined(MOOT_TNT_COMPAT)
00563   //-- allow suffixes of arbitrary length
00564   //for ( ; ti != token.end() && tokenFlavor_isCardSuffixChar(*ti); ti++) {;}
00565   //-- ^([:digit:]+)([[:digit:][:CardPunct]]*)(([^[:digit:][:CardPunct:]]+)?)([:CardSuffixChar:]*)
00566   return TokFlavorCardSuffix;
00567 
00568 #endif // MOOT_TNT_COMPAT
00569   
00570   return TokFlavorAlpha;
00571 };
00572 
00575 inline bool isTokFlavorName(const mootTokString &tokstr)
00576 {
00577   int i;
00578   for (i = 0; i < NTokFlavors; i++) {
00579     if (tokstr == mootTokenFlavorNames[i]) return true;
00580   }
00581   return (tokstr == "@USECASE");
00582 };
00583 
00584 }; /* namespace moot */
00585 
00586 #endif /* _moot_TOKEN_H */

Generated on Mon Sep 11 16:10:33 2006 for libmoot by doxygen1.2.18