Main Page   Namespace List   Class Hierarchy   Alphabetical List   Compound List   File List   Namespace Members   Compound Members   File Members  

mootHMM.h

Go to the documentation of this file.
00001 /* -*- Mode: C++ -*- */
00002 
00003 /*
00004    libmoot : moocow's part-of-speech tagging library
00005    Copyright (C) 2003-2004 by Bryan Jurish <moocow@ling.uni-potsdam.de>
00006 
00007    This program is free software; you can redistribute it and/or modify
00008    it under the terms of the GNU General Public License as published by
00009    the Free Software Foundation; either version 2 of the License, or
00010    (at your option) any later version.
00011 
00012    This program is distributed in the hope that it will be useful,
00013    but WITHOUT ANY WARRANTY; without even the implied warranty of
00014    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
00015    GNU General Public License for more details.
00016 
00017    You should have received a copy of the GNU General Public License
00018    along with this program; if not, write to the Free Software
00019    Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307 USA
00020 */
00021 
00022 /*--------------------------------------------------------------------------
00023  * File: mootHMM.h
00024  * Author: Bryan Jurish <moocow@ling.uni-potsdam.de>
00025  * Description:
00026  *   + moot PoS tagger : Hidden Markov Model (Disambiguator): headers
00027  *--------------------------------------------------------------------------*/
00028 
00029 #ifndef _MOOT_HMM_H
00030 #define _MOOT_HMM_H
00031 
00032 #ifdef __GNUC__
00033 # include <float.h>
00034 #endif // __GNUC__
00035 
00036 #include <string.h>
00037 #include <ctype.h>
00038 
00039 #include <mootTypes.h>
00040 #include <mootIO.h>
00041 #include <mootZIO.h>
00042 #include <mootToken.h>
00043 #include <mootTokenIO.h>
00044 #include <mootLexfreqs.h>
00045 #include <mootClassfreqs.h>
00046 #include <mootNgrams.h>
00047 #include <mootEnum.h>
00048 
00055 #define mootProbEpsilon  1.19209290E-06F
00056 /*
00057 #ifdef FLT_EPSILON
00058 //#define mootProbEpsilon FLT_EPSILON*10
00059 # define mootProbEpsilon  FLT_EPSILON*10
00060 #else
00061 # define mootProbEpsilon  1.19209290E-06F
00062 //#define mootProbEpsilon 1.19209290E-07F
00063 #endif
00064 */
00065 
00074 #define MOOT_PROB_NEG  -3E+38 /* seed value for best-score searches (e.g. vbestpr); lies below MOOT_PROB_ZERO */
00075 #define MOOT_PROB_ZERO -1E+38 /* returned by wordp()/classp()/tagp() when an entry is missing or out of range */
00076 #define MOOT_PROB_ONE   0.0 /* default wordpr of viterbi_populate_row(); adding it leaves a score unchanged */
00077 /*
00078 #ifdef FLT_MAX
00079 #  define MOOT_PROB_NEG  -FLT_MAX
00080 #  define MOOT_PROB_ZERO -1E+38
00081 #  define MOOT_PROB_ONE   0.0
00082 # else //-- !(DBL|FLT)_MAX
00083 #  define MOOT_PROB_NEG  -3E+38
00084 #  define MOOT_PROB_ZERO -1E+38
00085 #  define MOOT_PROB_ONE   0.0
00086 #endif //-- /(DBL|FLT)_MAX
00087 */
00088 
00095 #define MOOT_LEX_UNKNOWN_TOKENS
00096 //#undef MOOT_LEX_UNKNOWN_TOKENS
00097 
00123 #define MOOT_LEX_UNKNOWN_CLASSES
00124 //#undef MOOT_LEX_UNKNOWN_CLASSES
00125 
00131 //#define MOOT_VITERBI_DEBUG
00132 #undef MOOT_VITERBI_DEBUG
00133 
00134 
00135 moot_BEGIN_NAMESPACE
00136 
00137 /*--------------------------------------------------------------------------
00138  * mootHMM : HMM class
00139  *--------------------------------------------------------------------------*/
00140 
00147 class mootHMM {
00148 public:
00149   /*---------------------------------------------------------------------*/
00152 
00154   typedef enum {
00155     vlSilent,     
00156     vlErrors,     
00157     vlWarnings,   
00158     vlProgress,   
00159     vlEverything  
00160   } VerbosityLevel;
00161 
00162 
00164   typedef mootEnumID TagID;
00165 
00167   typedef mootEnumID TokID;
00168 
00173   typedef mootEnumID ClassID;
00175 
00176   /*------------------------------------------------------------
00177    * public typedefs : lexical classes
00178    */
00180 
00181 
00186   typedef set<TagID> LexClass;
00187 
00189   struct LexClassHash {
00190   public:
00191     inline size_t operator()(const LexClass &x) const {
00192       size_t hv = 0;
00193       for (LexClass::const_iterator xi = x.begin(); xi != x.end(); xi++) {
00194         hv = 5*hv + *xi;
00195       }
00196       return hv;
00197     };
00198   };
00200   struct LexClassEqual {
00201   public:
00202     inline size_t operator()(const LexClass &x, const LexClass &y) const {
00203       return x==y;
00204     };
00205   };
00207 
00208   /*---------------------------------------------------------------------*/
00211   
00213   typedef mootEnum<mootTagString,
00214                     hash<mootTagString>,
00215                     equal_to<mootTagString> >
00216           TagIDTable;
00217 
00219   typedef mootEnum<mootTokString,
00220                     hash<mootTokString>,
00221                     equal_to<mootTokString> >
00222           TokIDTable;
00223 
00225   typedef mootEnum<LexClass,
00226                    LexClassHash,
00227                    LexClassEqual>
00228           ClassIDTable;
00229 
00231   typedef map<TagID,ProbT> LexProbSubTable;
00232 
00237   typedef LexProbSubTable LexClassProbSubTable;
00238 
00242   typedef vector<LexProbSubTable> LexProbTable;
00243 
00258   typedef LexProbTable LexClassProbTable;
00259 
00271   typedef ProbT *BigramProbTable;
00272 
00273 #if defined(MOOT_USE_TRIGRAMS)
00274 # if defined(MOOT_HASH_TRIGRAMS)
00275 
00276   class Trigram {
00277   public:
00278 
00280     struct HashFcn {
00281     public:
00282       inline size_t operator()(const Trigram &x) const
00283       {
00284         return
00285           (0xdeece66d * ((0xdeece66d * x.tag1) + x.tag2)) + x.tag3;
00286       };
00287     };
00288 
00290     struct EqualFcn {
00291     public:
00292       inline size_t operator()(const Trigram &x, const Trigram &y) const
00293       {
00294         return 
00295           x.tag1==y.tag1 && x.tag2==y.tag2 && x.tag3==y.tag3;
00296         //x==y;
00297       };
00298     };
00299 
00300   public:
00301     TagID tag1;  
00302     TagID tag2;  
00303     TagID tag3;  
00304 
00305   public:
00307     Trigram(TagID t1=0, TagID t2=0, TagID t3=0)
00308       : tag1(t1), tag2(t2), tag3(t3)
00309     {};
00310 
00312     ~Trigram(void) {};
00313   };
00314 
00317   typedef
00318     hash_map<Trigram,ProbT,
00319              Trigram::HashFcn,
00320              Trigram::EqualFcn>
00321     TrigramProbTable;
00322 
00323 # else 
00324 
00325 
00343   typedef ProbT* TrigramProbTable;
00344 # endif // MOOT_HASH_TRIGRAMS
00345 
00346 #endif // MOOT_USE_TRIGRAMS
00347 
00348 
00349   /*---------------------------------------------------------------------*/
00352 
00360   class ViterbiNode { // one trellis entry: a tag hypothesis plus a link to its best predecessor
00361   public:
00362     TagID tagid;                  // tag ID of this hypothesis (viterbi_populate_row() stores curtagid here)
00363 #ifdef MOOT_USE_TRIGRAMS
00364     TagID ptagid;                 // trigram builds only: tag ID of the previous-column row this node was extended from
00365 #endif
00366     ProbT lprob;                  // score of the best path ending here: best (predecessor score + tagp()) plus wordpr; presumably a log-probability
00367 
00368     class ViterbiNode *pth_prev;  // best predecessor node; viterbi_node_path() follows this chain until NULL
00369     class ViterbiNode *nod_next;  // next node in the same list; also links recycled nodes on trash_nodes
00370   };
00371 
00372 #ifdef MOOT_USE_TRIGRAMS
00373 
00378   class ViterbiRow { // trigram builds: the nodes of one column that share the same current tag
00379   public:
00380     TagID  tagid;                 // current tag ID shared by every node of this row
00381     class ViterbiNode *nodes;     // head of the node list (linked via ViterbiNode::nod_next); NULL if no node was created
00382     class ViterbiRow  *row_next;  // next row of the same column; also links recycled rows on trash_rows
00383   };
00384 #else
00385   typedef ViterbiNode ViterbiRow; 
00386 #endif
00387 
00388 
00395   class ViterbiColumn { // one trellis column, holding the list of rows built by viterbi_populate_row()
00396   public:
00397     ViterbiRow    *rows;     // head of the row list (in bigram builds ViterbiRow is ViterbiNode, linked via nod_next)
00398     ViterbiColumn *col_prev; // column this one was extended from (set to vtable); also links recycled columns on trash_columns
00399   };
00400 
00417   struct ViterbiPathNode { // element of the linked path list built by viterbi_node_path()
00418   public:
00419     ViterbiNode      *node;      // trellis node at this position of the path
00420     ViterbiPathNode  *path_next; // following path element; also links recycled elements on trash_pathnodes
00421   };
00423 
00424 
00425 public:
00426   /*---------------------------------------------------------------------*/
00433   int verbose; // verbosity setting; NOTE(review): presumably a VerbosityLevel value - confirm
00434 
00439   size_t ndots; // if nonzero, tag_sentence() writes '.' to stderr when ntokens is a multiple of this value
00441 
00442   /*---------------------------------------------------------------------*/
00445 
00453   bool      use_lex_classes; // selects whether viterbi_step(TokID,LexClass) resolves and passes a lexical-class ID
00454 
00461   TagID     start_tagid; // tag ID that viterbi_finish(void) feeds as the final step
00462 
00471   ProbT     unknown_lex_threshhold; // not referenced in this header; NOTE(review): presumably a cutoff for "unknown" tokens - confirm
00472 
00481   ProbT     unknown_class_threshhold; // not referenced in this header; NOTE(review): presumably a cutoff for "unknown" classes - confirm
00482 
00488   LexClass   uclass; // class used for tokens with an empty class when use_lex_classes is set; filled by unknown_class_name()
00490 
00491   /*---------------------------------------------------------------------*/
00494   ProbT             nglambda1;    // NOTE(review): nglambda*/wlambda*/clambda* look like smoothing coefficients (see estimate_*lambdas()) - confirm
00495   ProbT             nglambda2;    
00496 #ifdef MOOT_USE_TRIGRAMS
00497   ProbT             nglambda3;    
00498 #endif
00499   ProbT             wlambda0;     
00500   ProbT             wlambda1;     
00502   ProbT             clambda0;     
00503   ProbT             clambda1;     
00510   ProbT             beamwd; // beam pruning in viterbi_populate_row() is active only when this is nonzero
00512 
00513   /*---------------------------------------------------------------------*/
00516   TokIDTable        tokids;     // token text <-> TokID table (queried by token2id())
00517   TagIDTable        tagids;     // tag string <-> TagID table
00518   ClassIDTable      classids;   // lexical class <-> ClassID table (queried and extended by class2id())
00520   /* TokenFlavor to TokenID lookup table for non-alphabetics */
00521   TokID             flavids[NTokFlavors]; // token2id() returns the entry for the token's flavor when it is nonzero
00523 
00524   /*---------------------------------------------------------------------*/
00527   size_t            n_tags;     // bound and stride used by tagp() when indexing the n-gram array
00528   size_t            n_toks;     // not referenced in this header; NOTE(review): presumably the number of known tokens - confirm
00529   size_t            n_classes;  // class2id() sets this to cid+1 when a new class extends lcprobs
00531   LexProbTable      lexprobs;   // indexed by TokID, then TagID (read by wordp())
00532   LexClassProbTable lcprobs;    // indexed by ClassID, then TagID (read by classp())
00533 #ifdef MOOT_USE_TRIGRAMS
00534   TrigramProbTable  ngprobs3;   // n-gram table read by tagp(TagID,TagID,TagID)
00535 #else
00536   BigramProbTable   ngprobs2;   // n-gram array read by tagp(); index is n_tags*prevtagid + tagid
00537 #endif
00538 
00539 
00540   /*---------------------------------------------------------------------*/
00543   ViterbiColumn     *vtable;    // current (most recent) trellis column; viterbi_populate_row() extends from it
00545 
00546   /*---------------------------------------------------------------------*/
00549   size_t             nsents;      // incremented once per tag_sentence() call
00550   size_t             ntokens;     // incremented for every vanilla token in viterbi_step(const mootToken&)
00551   size_t             nnewtokens;  // not updated in this header; NOTE(review): presumably counts previously unseen tokens - confirm
00552   size_t             nunclassed;  // incremented in viterbi_step(TokID,LexClass) for tokens with an empty class
00553   size_t             nnewclasses; // incremented in class2id() whenever the class has no ID yet
00554   size_t             nunknown;    // not updated in this header; NOTE(review): meaning to be confirmed against mootHMM.cc
00555   size_t             nfallbacks;  // not updated in this header; NOTE(review): presumably counts _viterbi_step_fallback() uses - confirm
00557 
00558 protected:
00559   /*---------------------------------------------------------------------*/
00562   ViterbiNode     *trash_nodes;     // recycle list consumed by viterbi_get_node() (linked via nod_next)
00563 #ifdef MOOT_USE_TRIGRAMS
00564   ViterbiRow      *trash_rows;      // recycle list consumed by viterbi_get_row() (linked via row_next)
00565 #endif
00566   ViterbiColumn   *trash_columns;   // recycle list consumed by viterbi_get_column() (linked via col_prev)
00567   ViterbiPathNode *trash_pathnodes; // recycle list consumed by viterbi_get_pathnode(), refilled by viterbi_clear_bestpath()
00569 
00570   /*---------------------------------------------------------------------*/
00573   TagID             vtagid;     // scratch; not referenced in this header
00574   ProbT             vbestpr;    // scratch: best score found so far by viterbi_best_node() / viterbi_populate_row()
00575   ProbT             vtagpr;     // scratch: candidate score (predecessor lprob + tagp()) in viterbi_populate_row()
00576   ProbT             vwordpr;    // scratch; not referenced in this header
00577   ViterbiNode      *vbestpn;    // scratch: node that produced vbestpr
00579   ViterbiPathNode  *vbestpath;  // path most recently built by viterbi_node_path(); recycled by viterbi_clear_bestpath()
00581   ProbT             bbestpr;   // beam bookkeeping: raised to the largest lprob of nodes created by viterbi_populate_row()
00582   ProbT             bpprmin;   // beam bookkeeping: predecessors with lprob below this are skipped when beamwd is nonzero
00584 
00585 public:
00586   /*---------------------------------------------------------------------*/
00590   mootHMM(void);
00591 
00593   ~mootHMM(void) { clear(true,false); };
00595 
00596   /*------------------------------------------------------------*/
00604   void clear(bool wipe_everything=true, bool unlogify=false);
00606 
00607   /*------------------------------------------------------------*/
00611   bool save(const char *filename, int compression_level=MOOT_DEFAULT_COMPRESSION);
00612 
00614   bool save(mootio::mostream *obs, const char *filename=NULL);
00615 
00617   bool _bindump(mootio::mostream *obs, const char *filename=NULL);
00618 
00620   bool load(const char *filename=NULL);
00621 
00623   bool load(mootio::mistream *ibs, const char *filename=NULL);
00624 
00626   bool _binload(mootio::mistream *ibs, const char *filename=NULL);
00628 
00629   /*------------------------------------------------------------*/
00633   inline void unknown_token_name(const mootTokString &name)
00634   {
00635     tokids.unknown_name(name);
00636   };
00637 
00639   inline void unknown_tag_name(const mootTokString &name)
00640   {
00641     tagids.unknown_name(name);
00642   };
00643 
00644   /*
00645    * Set lexical class to use for tokens without user-specified analyses.
00646    * Really just an alias for 'uclass' datum.
00647    */
00648   inline void unknown_class_name(const mootTagSet &tagset)
00649   {
00650     tagset2lexclass(tagset,&uclass,false);
00651   };
00653 
00654 
00655   //------------------------------------------------------------
00670   bool load_model(const string &modelname,
00671                   const mootTagString &start_tag_str="__$",
00672                   const char *myname="mootHMM::load_model()",
00673                   bool  do_estimate_nglambdas=true,
00674                   bool  do_estimate_wlambdas=true,
00675                   bool  do_estimate_clambdas=true,
00676                   bool  do_compute_logprobs=true);
00677 
00683   bool compile(const mootLexfreqs &lexfreqs,
00684                const mootNgrams &ngrams,
00685                const mootClassfreqs &classfreqs,
00686                const mootTagString &start_tag_str="__$");
00687 
00689   void assign_ids_lf(const mootLexfreqs &lexfreqs);
00690 
00692   void assign_ids_ng(const mootNgrams   &ngrams);
00693 
00695   void assign_ids_cf(const mootClassfreqs &classfreqs);
00696 
00698   void compile_unknown_lexclass(const mootClassfreqs &classfreqs);
00699 
00701   bool estimate_lambdas(const mootNgrams &ngrams);
00702 
00704   bool estimate_wlambdas(const mootLexfreqs &lf);
00705 
00707   bool estimate_clambdas(const mootClassfreqs &cf);
00708 
00710   bool compute_logprobs(void);
00712 
00713   //------------------------------------------------------------
00714   // Tagging: Top-level
00717 
00719   void tag_io(TokenReader *reader, TokenWriter *writer)
00720   {
00721     int rtok;
00722     mootSentence *sent;
00723     while (reader && (rtok = reader->get_sentence()) != TokTypeEOF) {
00724       sent = reader->sentence();
00725       if (!sent) continue;
00726       tag_sentence(*sent);
00727 #ifdef MOOT_VITERBI_DEBUG
00728       viterbi_txtdump(stderr);
00729 #endif
00730 
00731       if (writer) writer->put_sentence(*sent);
00732     }
00733   };
00734 
00736   //void tag_strings(int argc, char **argv, FILE *out=stdout);
00737 
00743   inline void tag_sentence(mootSentence &sentence) {
00744     viterbi_clear();
00745     for (mootSentence::const_iterator si = sentence.begin();
00746          si != sentence.end();
00747          si++)
00748       {
00749         viterbi_step(*si);
00750         if (ndots && (ntokens % ndots)==0) fputc('.', stderr);
00751       }
00752     viterbi_finish();
00753     tag_mark_best(sentence);
00754     nsents++;
00755   };
00757 
00758   /*====================================================================
00759    * VITERBI: Mid-level
00760    *====================================================================*/
00763 
00764   //------------------------------------------------------------
00765   // Viterbi: Mid-level: clear
00767   void viterbi_clear(void);
00768 
00769   //------------------------------------------------------------
00770   // Viterbi: single iteration: (mootToken)
00775   inline void viterbi_step(const mootToken &token) {
00776     if (token.toktype() != TokTypeVanilla) return; //-- ignore non-vanilla tokens
00777     ntokens++;
00778     LexClass tok_class;
00779     for (mootToken::Analyses::const_iterator ani = token.analyses().begin();
00780          ani != token.analyses().end();
00781          ani++)
00782       {
00783         tok_class.insert(tagids.name2id(ani->tag));
00784       }
00785     viterbi_step(token2id(token.text()), tok_class);
00786   };
00787 
00788   //------------------------------------------------------------
00789   // Viterbi: single iteration: (TokID,LexClass=set<TagID>)
00795   inline void viterbi_step(TokID tokid, const LexClass &lexclass)
00796   {
00797     if (use_lex_classes) {
00798       if (lexclass.empty()) {
00799         nunclassed++;
00800         viterbi_step(tokid, 0, uclass);
00801       } else {
00802         //-- non-empty class : get ID (assign empty distribution if unknown)
00803         ClassID classid = class2id(lexclass,0,1);
00804         viterbi_step(tokid,classid,lexclass);
00805       }
00806     } else {
00807       //-- !use_lex_classes
00808       if (lexclass.empty()) {
00809         nunclassed++;
00810         viterbi_step(tokid);
00811       } else {
00812         viterbi_step(tokid,0,lexclass);
00813       }
00814     }
00815   };
00816 
00817   //------------------------------------------------------------
00818   // Viterbi: single iteration: (TokID,ClassID,LexClass)
00823   void viterbi_step(TokID tokid, ClassID classid, const LexClass &lclass);
00824 
00825   //------------------------------------------------------------
00826   // Viterbi: single iteration: (TokID)
00833   void viterbi_step(TokID tokid);
00834 
00835   //------------------------------------------------------------
00836   // Viterbi: single iteration: (TokString)
00843   inline void viterbi_step(const mootTokString &token_text) {
00844     return viterbi_step(token2id(token_text));
00845   };
00846 
00847   //------------------------------------------------------------
00848   // Viterbi: single iteration: (TokString,set<TagString>)
00854   inline void viterbi_step(const mootTokString &token_text, const set<mootTagString> &tags)
00855   {
00856     LexClass lclass;
00857     tagset2lexclass(tags,&lclass);
00858     viterbi_step(token2id(token_text), lclass);
00859   };
00860 
00861   //------------------------------------------------------------
00862   // Viterbi: single iteration: (TokID,TagID,col=NULL)
00866   void viterbi_step(TokID tokid, TagID tagid, ViterbiColumn *col=NULL);
00867 
00868   //------------------------------------------------------------
00869   // Viterbi: single iteration: (TokString,TagString)
00875   inline void viterbi_step(const mootTokString &token, const mootTagString &tag)
00876   {
00877     return viterbi_step(token2id(token), tagids.name2id(tag));
00878   };
00879 
00880 
00881   //------------------------------------------------------------
00882   // Viterbi: finish
00886   inline void viterbi_finish(const TagID final_tagid)
00887   {
00888     viterbi_step(0, final_tagid);
00889   };
00890 
00894   inline void viterbi_finish(void)
00895   {
00896     viterbi_step(0, start_tagid);
00897   };
00898 
00909   void tag_mark_best(mootSentence &sentence);
00911 
00912 
00913   //------------------------------------------------------------
00914   // Viterbi: Low/Mid-level: path utilities
00917 
00919   inline ViterbiPathNode *viterbi_best_path(void)
00920   {
00921     return viterbi_node_path(viterbi_best_node());
00922   };
00923 
00925   inline ViterbiPathNode *viterbi_best_path(TagID tagid)
00926   {
00927     return viterbi_node_path(viterbi_best_node(tagid));
00928   };
00929 
00931   inline ViterbiPathNode *viterbi_best_path(const mootTagString &tagstr)
00932   {
00933     return viterbi_best_path(tagids.name2id(tagstr));
00934   };
00935 
00942   inline ViterbiNode *viterbi_best_node(void)
00943   {
00944     ViterbiNode *pnod;
00945     vbestpr = MOOT_PROB_NEG;
00946     vbestpn = NULL;
00947 
00948 #ifdef MOOT_USE_TRIGRAMS
00949     ViterbiRow  *prow;
00950     for (prow = vtable->rows; prow != NULL; prow = prow->row_next) {
00951       for (pnod = prow->nodes; pnod != NULL; pnod = pnod->nod_next) {
00952         if (pnod->lprob > vbestpr) {
00953           vbestpr = pnod->lprob;
00954           vbestpn = pnod;
00955         }
00956       }
00957     }
00958 #else // !MOOT_USE_TRIGRAMS
00959     for (pnod = vtable->rows; pnod != NULL; pnod = pnod->nod_next) {
00960       if (pnod->lprob > vbestpr) {
00961         vbestpr = pnod->lprob;
00962         vbestpn = pnod;
00963       }
00964     }
00965 #endif // MOOT_USE_TRIGRAMS
00966     return vbestpn;
00967   };
00968 
00975   inline ViterbiNode *viterbi_best_node(TagID tagid)
00976   {
00977     ViterbiNode *pnod;
00978     vbestpr = MOOT_PROB_NEG;
00979 #ifdef MOOT_USE_TRIGRAMS
00980     ViterbiRow  *prow;
00981     vbestpn = NULL;
00982     for (prow = vtable->rows; prow != NULL; prow = prow->row_next) {
00983       if (prow->tagid == tagid) {
00984         for (pnod = prow->nodes; pnod != NULL; pnod = pnod->nod_next) {
00985           if (pnod->lprob > vbestpr) {
00986             vbestpr = pnod->lprob;
00987             vbestpn = pnod;
00988           }
00989         }
00990         return vbestpn;
00991       }
00992     }
00993 #else // !MOOT_USE_TRIGRAMS
00994     for (pnod = vtable->rows; pnod != NULL; pnod = pnod->nod_next) {
00995       if (pnod->tagid == tagid) return pnod;
00996     }
00997 #endif // MOOT_USE_TRIGRAMS
00998     return NULL;
00999   };
01000  
01001   //------------------------------------------------------------
01002   // Viterbi: Low/Mid: node-to-path conversion
01010   inline ViterbiPathNode *viterbi_node_path(ViterbiNode *node)
01011   {
01012     viterbi_clear_bestpath();
01013     ViterbiPathNode *pnod; 
01014     for ( ; node != NULL; node = node->pth_prev) {
01015       pnod            = viterbi_get_pathnode();
01016       pnod->node      = node;
01017       pnod->path_next = vbestpath;
01018       vbestpath       = pnod;
01019     }
01020     return vbestpath;
01021   };
01023 
01024   //------------------------------------------------------------
01025   // public methods: low-level: Viterbi
01026 
01028   //{@
01029 
01031   inline bool viterbi_column_ok(const ViterbiColumn *col) const {
01032     return (col
01033             && col->rows 
01034 #ifdef MOOT_USE_TRIGRAMS
01035             && col->rows->nodes
01036 #endif
01037             );
01038   };
01039 
01049   inline ViterbiColumn *viterbi_populate_row(TagID curtagid,
01050                                              ProbT wordpr=MOOT_PROB_ONE,
01051                                              ViterbiColumn *col=NULL)
01052   {
01053 #ifdef MOOT_USE_TRIGRAMS
01054     ViterbiRow  *prow, *row = viterbi_get_row();
01055     ViterbiNode *pnod, *nod = NULL;
01056 
01057     if (!col) {
01058       col           = viterbi_get_column();
01059       col->rows     = NULL;
01060     }
01061     col->col_prev = vtable;
01062     row->nodes = NULL;
01063 
01064     for (prow = vtable->rows; prow != NULL; prow = prow->row_next) {
01065       vbestpr = MOOT_PROB_NEG;
01066       vbestpn = NULL;
01067 
01068       for (pnod = prow->nodes; pnod != NULL; pnod = pnod->nod_next) {
01069         //-- beam pruning
01070         if (beamwd && pnod->lprob < bpprmin) continue;
01071 
01072         //-- probability lookup
01073         vtagpr = pnod->lprob + tagp(pnod->ptagid, prow->tagid, curtagid);
01074         if (vtagpr > vbestpr) {
01075           vbestpr = vtagpr;
01076           vbestpn = pnod;
01077         }
01078       }
01079 
01080       //-- set node information
01081       if (vbestpn != NULL) {
01082         nod = viterbi_get_node();
01083         nod->tagid    = curtagid;
01084         nod->ptagid   = prow->tagid;
01085         nod->lprob    = vbestpr + wordpr;
01086         //nod->row      = row;
01087         nod->pth_prev = vbestpn;
01088         nod->nod_next = row->nodes;
01089 
01090         row->nodes    = nod;
01091 
01092         //-- save beam information
01093         if (nod->lprob > bbestpr) bbestpr = nod->lprob;
01094       }
01095     }
01096 
01097     //-- set row information
01098     row->tagid    = curtagid;
01099     row->row_next = col->rows;
01100     col->rows     = row;
01101 
01102 #else 
01103 
01104     ViterbiNode *pnod, *nod = NULL;
01105 
01106     if (!col) {
01107       col           = viterbi_get_column();
01108       col->rows     = NULL;
01109     }
01110     col->col_prev = vtable;
01111 
01112     vbestpr = MOOT_PROB_NEG;
01113     vbestpn = NULL;
01114 
01115     for (pnod = vtable->rows; pnod != NULL; pnod = pnod->nod_next) {
01116       //-- beam pruning
01117       if (beamwd && pnod->lprob < bpprmin) continue;
01118 
01119       //-- probability lookup
01120       vtagpr = pnod->lprob + tagp(pnod->tagid, curtagid);
01121       if (vtagpr > vbestpr) {
01122         vbestpr = vtagpr;
01123         vbestpn = pnod;
01124       }
01125     }
01126 
01127     //-- set node/row information
01128     nod           = viterbi_get_node();
01129     nod->tagid    = curtagid;
01130     nod->lprob    = vbestpr + wordpr;
01131     nod->pth_prev = vbestpn;
01132     nod->nod_next = col->rows;
01133 
01134     //-- set row/col information
01135     nod->nod_next = col->rows;
01136     col->rows     = nod;
01137 
01138     //-- save beam information
01139     if (nod->lprob > bbestpr) bbestpr = nod->lprob;
01140 
01141 #endif // MOOT_USE_TRIGRAMS
01142 
01143     return col;
01144   };
01145 
01146 
01147   //------------------------------------------------------------
01148   // Viterbi: Low-level: clear best-path
01150   inline void viterbi_clear_bestpath(void)
01151   {
01152     //-- move to trash: path-nodes
01153     ViterbiPathNode *pnod, *pnod_next;
01154     for (pnod = vbestpath; pnod != NULL; pnod = pnod_next) {
01155       pnod_next       = pnod->path_next;
01156       pnod->path_next = trash_pathnodes;
01157       trash_pathnodes = pnod;
01158     }
01159     vbestpath = NULL;
01160   };
01161 
01162   //------------------------------------------------------------
01163   // Viterbi: fallback
01169   void _viterbi_step_fallback(TokID tokid, ViterbiColumn *col);
01171 
01172 
01173   //------------------------------------------------------------
01178   inline ViterbiNode *viterbi_get_node(void) {
01179     ViterbiNode *nod;
01180     if (trash_nodes != NULL) {
01181       nod         = trash_nodes;
01182       trash_nodes = nod->nod_next;
01183     } else {
01184       nod = new ViterbiNode();
01185     }
01186     return nod;
01187   };
01188 
01189   //------------------------------------------------------------
01190   // Viterbi: trash utilities: Rows
01192   inline ViterbiRow *viterbi_get_row(void) {
01193 #ifdef MOOT_USE_TRIGRAMS
01194     ViterbiRow *row;
01195     if (trash_rows != NULL) {
01196       row        = trash_rows;
01197       trash_rows = row->row_next;
01198     } else {
01199       row = new ViterbiRow();
01200     }
01201     return row;
01202 #else
01203     return viterbi_get_node();
01204 #endif //MOOT_USE_TRIGRAMS
01205   };
01206 
01207   //------------------------------------------------------------
01208   // Viterbi: trash utilities: columns
01210   inline ViterbiColumn *viterbi_get_column(void) {
01211     ViterbiColumn *col;
01212     if (trash_columns != NULL) {
01213       col           = trash_columns;
01214       trash_columns = col->col_prev;
01215     } else {
01216       col = new ViterbiColumn();
01217     }
01218     return col;
01219   };
01220 
01221   //------------------------------------------------------------
01222   // Viterbi: trash utilities: path-nodes
01224   inline ViterbiPathNode *viterbi_get_pathnode(void) {
01225     ViterbiPathNode *pnod;
01226     if (trash_pathnodes != NULL) {
01227       pnod            = trash_pathnodes;
01228       trash_pathnodes = pnod->path_next;
01229     } else {
01230       pnod = new ViterbiPathNode();
01231     }
01232     return pnod;
01233   };
01235 
01236 
01237 
01238   //------------------------------------------------------------
01239   // Low-level: ID Lookup
01243   inline TokID token2id(const mootTokString &token) const
01244   {
01245     mootTokenFlavor flav = tokenFlavor(token);
01246     return flavids[flav]==0 ? tokids.name2id(token) : flavids[flav];
01247   };
01248 
01259   inline LexClass *tagset2lexclass(const mootTagSet &tagset,
01260                                    LexClass *lclass=NULL,
01261                                    bool add_tagids=false)
01262   {
01263     if (!lclass) lclass = new LexClass();
01264     //-- ... for all tags in the class (utsi)
01265     for (mootTagSet::const_iterator tsi = tagset.begin();
01266          tsi != tagset.end();
01267          tsi++)
01268       {
01269         //-- lookup or assign a tag id
01270         TagID tagid = tagids.name2id(*tsi);
01271         if (add_tagids && tagid==0) tagid = tagids.insert(*tsi);
01272 
01273         //-- insert tagid into lexical class
01274         lclass->insert(tagid);
01275       }
01276     return lclass;
01277   };
01278 
01279 
01285   inline ClassID class2id(const LexClass &lclass,
01286                           bool autopopulate=true,
01287                           bool autocreate=true)
01288   {
01289     ClassID cid = classids.name2id(lclass);
01290     if (cid == 0) {
01291       nnewclasses++;
01292       if (!autopopulate && !autocreate) return cid;  //-- map unknown classes to zero
01293 
01294       //-- previously unknown class: fill 'er up with default values
01295       cid = classids.insert(lclass);
01296       if (cid >= lcprobs.size()) {
01297         n_classes = cid+1;
01298 
01299         //-- resize() should really happen 2 lines down,
01300         //   but that might break something : test this at some point!
01301         lcprobs.resize(n_classes);
01302       }
01303       if (autopopulate) {
01304         LexClassProbSubTable &lcps = lcprobs[cid];
01305         if (!lclass.empty()) {
01306           //-- non-empty class: restrict population to class-members
01307           ProbT lcprob = log(1.0/((ProbT)lclass.size()));
01308 
01309           for (LexClass::const_iterator lci = lclass.begin(); lci != lclass.end(); lci++) {
01310             lcps[*lci] = lcprob;
01311           }
01312         } else {
01313           //-- empty class: use class for "unknown" token instead [HACK!]
01314           const LexProbSubTable &lps = lexprobs[0];
01315           ProbT lpprob = log(1.0/((ProbT)lps.size()));
01316 
01317           for (LexProbSubTable::const_iterator lpsi = lps.begin(); lpsi != lps.end(); lpsi++) {
01318             lcps[lpsi->first] = lpprob;
01319           }
01320         }
01321       }
01322     }
01323     return cid;
01324   };
01326 
01327 
01328   //------------------------------------------------------------
01331 
01332   /*------------------------------------------------------------
01333    * Lexical Probability Lookup
01334    */
01339   inline const ProbT wordp(const TokID tokid, const TagID tagid) const
01340   {
01341     if (tokid >= lexprobs.size()) return MOOT_PROB_ZERO;
01342     const LexProbSubTable &lps = lexprobs[tokid];
01343     LexProbSubTable::const_iterator lpsi = lps.find(tagid);
01344     return lpsi != lps.end() ? lpsi->second : MOOT_PROB_ZERO;
01345   };
01346 
01353   inline const ProbT wordp(const mootTokString token, const mootTagString tag) const
01354   {
01355     return wordp(token2id(token), tagids.name2id(tag));
01356   };
01357 
01358   /*------------------------------------------------------------
01359    * Lexical-Class Probability Lookup
01360    */
01364   inline const ProbT classp(const ClassID classid, const TagID tagid) const
01365   {
01366     if (classid >= lcprobs.size()) return MOOT_PROB_ZERO;
01367     const LexClassProbSubTable &lps = lcprobs[classid];
01368     LexClassProbSubTable::const_iterator lpsi = lps.find(tagid);
01369     return lpsi != lps.end() ? lpsi->second : MOOT_PROB_ZERO;
01370   };
01371 
01378   inline const ProbT classp(const LexClass &lclass, const mootTagString tag) const
01379   {
01380     return classp(classids.name2id(lclass), tagids.name2id(tag));
01381   };
01382 
01383   /*------------------------------------------------------------
01384    * Unigram Probability Lookup
01385    */
01389   inline const ProbT tagp(const TagID tagid) const
01390   {
01391     return
01392 #ifdef MOOT_USE_TRIGRAMS
01393       tagp(0,0,tagid);
01394 #else
01395       ngprobs2 && tagid < n_tags
01396       ? ngprobs2[tagid]
01397       : MOOT_PROB_ZERO;
01398 #endif // MOOT_USE_TRIGRAMS
01399   };
01400 
01406   inline const ProbT tagp(const mootTagString &tag) const
01407   {
01408     return tagp(tagids.name2id(tag));
01409   };
01410 
01411   /*------------------------------------------------------------
01412    * Bigram Probability Lookup
01413    */
01418   inline const ProbT tagp(const TagID prevtagid, const TagID tagid) const
01419   {
01420     return
01421 #ifdef MOOT_USE_TRIGRAMS
01422       tagp(0,prevtagid,tagid);
01423 #else
01424       ngprobs2 && prevtagid < n_tags && tagid < n_tags
01425       ? ngprobs2[(n_tags*prevtagid)+tagid]
01426       : MOOT_PROB_ZERO;
01427 #endif
01428   };
01429 
01435   inline const ProbT tagp(const mootTagString &prevtag, const mootTagString &tag) const
01436   {
01437     return tagp(tagids.name2id(prevtag), tagids.name2id(tag));
01438   };
01439 
01440   /*------------------------------------------------------------
01441    * Trigram probability lookup
01442    */
01443 #ifdef MOOT_USE_TRIGRAMS
01444 
01450 #ifdef MOOT_HASH_TRIGRAMS
01451   inline const ProbT tagp(const Trigram &trigram, ProbT ProbZero=MOOT_PROB_ZERO) const
01452   {
01453     TrigramProbTable::const_iterator tgti = ngprobs3.find(trigram);
01454     return tgti != ngprobs3.end() ? tgti->second : ProbZero;
01455   };
01456 #endif //MOOT_HASH_TRIGRAMS
01457 
01464   inline const ProbT tagp(const TagID prevtagid2, const TagID prevtagid1, const TagID tagid) const
01465   {
01466     return
01467 #ifdef MOOT_HASH_TRIGRAMS
01468       tagp(Trigram(prevtagid2,prevtagid1,tagid))
01469 #else
01470       ngprobs3 && prevtagid2 < n_tags && prevtagid1 < n_tags && tagid < n_tags
01471       ? ngprobs3[(n_tags*((n_tags*prevtagid2)+prevtagid1))+tagid]
01472       : MOOT_PROB_ZERO;
01473 #endif
01474       ;
01475   };
01476 
01484   inline const ProbT tagp(const mootTagString &prevtag2,
01485                           const mootTagString &prevtag1,
01486                           const mootTagString &tag)
01487     const
01488   {
01489     return tagp(tagids.name2id(prevtag2), tagids.name2id(prevtag1), tagids.name2id(tag));
01490   };
01491 #endif // MOOT_USE_TRIGRAMS
01492 
01493 
01494 
01495   //------------------------------------------------------------
01496   // Error Reporting
01497 
01501   void carp(char *fmt, ...);
01503 
01504   //------------------------------------------------------------
01505   // public methods: low-level: debugging
01506 
01509 
01511   void txtdump(FILE *file);
01512 
01514   void viterbi_txtdump(FILE *file);
01516 };
01517 
01518 moot_END_NAMESPACE
01519 
01520 #endif /* _MOOT_HMM_H */

Generated on Wed Jul 28 15:48:02 2004 for libmoot by doxygen1.2.15