Main Page | Directories | File List

mootHMM.h

Go to the documentation of this file.
00001 /* -*- Mode: C++ -*- */
00002 
00003 /*
00004    libmoot : moocow's part-of-speech tagging library
00005    Copyright (C) 2003-2005 by Bryan Jurish <moocow@ling.uni-potsdam.de>
00006 
00007    This library is free software; you can redistribute it and/or
00008    modify it under the terms of the GNU Lesser General Public
00009    License as published by the Free Software Foundation; either
00010    version 2.1 of the License, or (at your option) any later version.
00011    
00012    This library is distributed in the hope that it will be useful,
00013    but WITHOUT ANY WARRANTY; without even the implied warranty of
00014    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
00015    Lesser General Public License for more details.
00016    
00017    You should have received a copy of the GNU Lesser General Public
00018    License along with this library; if not, write to the Free Software
00019    Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
00020 */
00021 
00022 /*--------------------------------------------------------------------------
00023  * File: mootHMM.h
00024  * Author: Bryan Jurish <moocow@ling.uni-potsdam.de>
00025  * Description:
00026  *   + moot PoS tagger : Hidden Markov Model (Disambiguator): headers
00027  *--------------------------------------------------------------------------*/
00028 
00029 #ifndef _MOOT_HMM_H
00030 #define _MOOT_HMM_H
00031 
00032 #ifdef HAVE_FLOAT_H
00033 # include <float.h>
00034 #endif /* HAVE_FLOAT_H */
00035 
00036 #include <string.h>
00037 #include <ctype.h>
00038 #include <math.h>
00039 
00040 #include <mootTypes.h>
00041 #include <mootIO.h>
00042 #include <mootZIO.h>
00043 #include <mootToken.h>
00044 #include <mootTokenIO.h>
00045 #include <mootLexfreqs.h>
00046 #include <mootClassfreqs.h>
00047 #include <mootNgrams.h>
00048 #include <mootEnum.h>
00049 #include <mootAssocVector.h>
00050 #include <mootSuffixTrie.h>
00051 
00083 #define MOOT_LEX_UNKNOWN_TOKENS
00084 //#undef MOOT_LEX_UNKNOWN_TOKENS
00085 
00092 #define MOOT_LEX_UNKNOWN_CLASSES
00093 //#undef MOOT_LEX_UNKNOWN_CLASSES
00094 
00101 #define MOOT_LEX_NONALPHA
00102 //#undef MOOT_LEX_NONALPHA
00103 
00108 #undef MOOT_LEX_IS_TIEBREAKER
00109 
00110 moot_BEGIN_NAMESPACE
00111 
00112 /*--------------------------------------------------------------------------
00113  * mootHMM : HMM class
00114  *--------------------------------------------------------------------------*/
00115 
00122 class mootHMM {
00123 public:
00124   /*---------------------------------------------------------------------*/
00127 
00129   typedef enum {
00130     vlSilent,     
00131     vlErrors,     
00132     vlWarnings,   
00133     vlProgress,   
00134     vlEverything  
00135   } VerbosityLevel;
00136 
00137 
00139   typedef mootEnumID TagID;
00140 
00142   typedef mootEnumID TokID;
00143 
00148   typedef mootEnumID ClassID;
00150 
00151   /*------------------------------------------------------------
00152    * public typedefs : lexical classes
00153    */
00155 
00156 
00161   typedef set<TagID> LexClass;
00162 
00164   struct LexClassHash {
00165   public:
00166     inline size_t operator()(const LexClass &x) const {
00167       size_t hv = 0;
00168       for (LexClass::const_iterator xi = x.begin(); xi != x.end(); xi++) {
00169         hv = 5*hv + *xi;
00170       }
00171       return hv;
00172     };
00173   };
00175   struct LexClassEqual {
00176   public:
00177     inline size_t operator()(const LexClass &x, const LexClass &y) const {
00178       return x==y;
00179     };
00180   };
00182 
00183   /*---------------------------------------------------------------------*/
00186   
00188   typedef mootEnum<mootTagString,
00189                     hash<mootTagString>,
00190                     equal_to<mootTagString> >
00191           TagIDTable;
00192 
00194   typedef mootEnum<mootTokString,
00195                     hash<mootTokString>,
00196                     equal_to<mootTokString> >
00197           TokIDTable;
00198 
00200   typedef mootEnum<LexClass,
00201                    LexClassHash,
00202                    LexClassEqual>
00203           ClassIDTable;
00204 
00206   //typedef map<TagID,ProbT> LexProbSubTable;
00207   typedef AssocVector<TagID,ProbT> LexProbSubTable;
00208 
00213   typedef LexProbSubTable LexClassProbSubTable;
00214 
00218   typedef vector<LexProbSubTable> LexProbTable;
00219 
00234   typedef LexProbTable LexClassProbTable;
00235 
00247   typedef ProbT *BigramProbTable;
00248 
00249 #if defined(MOOT_USE_TRIGRAMS)
00250 # if defined(MOOT_HASH_TRIGRAMS)
00251 
00252   class Trigram {
00253   public:
00254 
00256     struct HashFcn {
00257     public:
00258       inline size_t operator()(const Trigram &x) const
00259       {
00260         return
00261           (0xdeece66d * ((0xdeece66d * x.tag1) + x.tag2)) + x.tag3;
00262       };
00263     };
00264 
00266     struct EqualFcn {
00267     public:
00268       inline size_t operator()(const Trigram &x, const Trigram &y) const
00269       {
00270         return 
00271           x.tag1==y.tag1 && x.tag2==y.tag2 && x.tag3==y.tag3;
00272         //x==y;
00273       };
00274     };
00275 
00276   public:
00277     TagID tag1;  
00278     TagID tag2;  
00279     TagID tag3;  
00280 
00281   public:
00283     Trigram(TagID t1=0, TagID t2=0, TagID t3=0)
00284       : tag1(t1), tag2(t2), tag3(t3)
00285     {};
00286 
00288     ~Trigram(void) {};
00289   };
00290 
00293   typedef
00294     hash_map<Trigram,ProbT,
00295              Trigram::HashFcn,
00296              Trigram::EqualFcn>
00297     TrigramProbTable;
00298 
00299 # else 
00300 
00301 
00319   typedef ProbT* TrigramProbTable;
00320 # endif // MOOT_HASH_TRIGRAMS
00321 
00322 #endif // MOOT_USE_TRIGRAMS
00323 
00324 
00325   /*---------------------------------------------------------------------*/
00328 
00336   class ViterbiNode {
00337   public:
00338     TagID tagid;                  
00339 #ifdef MOOT_USE_TRIGRAMS
00340     TagID ptagid;                 
00341 #else
00342     ProbT wprob;                  
00343 #endif
00344     ProbT lprob;                  
00345 
00346     class ViterbiNode *pth_prev;  
00347     class ViterbiNode *nod_next;  
00348   };
00349 
00350 #ifdef MOOT_USE_TRIGRAMS
00351 
00356   class ViterbiRow {
00357   public:
00358     TagID  tagid;                 
00359     ProbT  wprob;                 
00360     class ViterbiNode *nodes;     
00361     class ViterbiRow  *row_next;  
00362   };
00363 #else
00364   typedef ViterbiNode ViterbiRow; 
00365 #endif
00366 
00367 
00374   class ViterbiColumn {
00375   public:
00376     ViterbiRow    *rows;     
00377     ViterbiColumn *col_prev; 
00378     ProbT          bbestpr;  
00379     ProbT          bpprmin;  
00380   };
00381 
00398   struct ViterbiPathNode {
00399   public:
00400     ViterbiNode      *node;      
00401     ViterbiPathNode  *path_next; 
00402   };
00404 
00405 
00406 public:
00407   /*---------------------------------------------------------------------*/
00414   int verbose;
00415 
00420   size_t ndots;
00421 
00425   bool save_ambiguities;
00426 
00430   bool save_flavors;
00431 
00435   bool save_mark_unknown;
00436 
00440   bool save_dump_trellis;
00442 
00443   /*---------------------------------------------------------------------*/
00446 
00454   bool      use_lex_classes;
00455 
00462   TagID     start_tagid;
00463 
00472   ProbT     unknown_lex_threshhold;
00473 
00482   ProbT     unknown_class_threshhold;
00483 
00489   LexClass   uclass;
00491 
00492   /*---------------------------------------------------------------------*/
00495   ProbT             nglambda1;    
00496   ProbT             nglambda2;    
00497 #ifdef MOOT_USE_TRIGRAMS
00498   ProbT             nglambda3;    
00499 #endif
00500   ProbT             wlambda0;     
00501   ProbT             wlambda1;     
00503   ProbT             clambda0;     
00504   ProbT             clambda1;     
00511   ProbT             beamwd;
00513 
00514   /*---------------------------------------------------------------------*/
00517   TokIDTable        tokids;     
00518   TagIDTable        tagids;     
00519   ClassIDTable      classids;   
00521   /* TokenFlavor to TokenID lookup table for non-alphabetics */
00522   TokID             flavids[NTokFlavors];
00524 
00525   /*---------------------------------------------------------------------*/
00528   size_t            n_tags;     
00529   size_t            n_toks;     
00530   size_t            n_classes;  
00532   LexProbTable      lexprobs;   
00533   LexClassProbTable lcprobs;    
00534 #ifdef MOOT_USE_TRIGRAMS
00535   TrigramProbTable  ngprobs3;   
00536 #else
00537   BigramProbTable   ngprobs2;   
00538 #endif
00539 
00540   SuffixTrie        suftrie;    
00542 
00543   /*---------------------------------------------------------------------*/
00546   ViterbiColumn     *vtable;    
00548 
00549   /*---------------------------------------------------------------------*/
00552   size_t             nsents;      
00553   size_t             ntokens;     
00554   size_t             nnewtokens;  
00555   size_t             nunclassed;  
00556   size_t             nnewclasses; 
00557   size_t             nunknown;    
00558   size_t             nfallbacks;  
00560 
00561 protected:
00562   /*---------------------------------------------------------------------*/
00565   ViterbiNode     *trash_nodes;     
00566 #ifdef MOOT_USE_TRIGRAMS
00567   ViterbiRow      *trash_rows;      
00568 #endif
00569   ViterbiColumn   *trash_columns;   
00570   ViterbiPathNode *trash_pathnodes; 
00572 
00573   /*---------------------------------------------------------------------*/
00576   TagID             vtagid;     
00577   ProbT             vbestpr;    
00578   ProbT             vtagpr;     
00579   ProbT             vwordpr;    
00580   ViterbiNode      *vbestpn;    
00582   ViterbiPathNode  *vbestpath;  
00584   //ProbT           bbestpr;   /**< Best current (log-)probability for beam pruning */
00585   //ProbT           bpprmin;   /**< Minimum previous probability for beam pruning */
00587 
00588 public:
00589   /*---------------------------------------------------------------------*/
00593   mootHMM(void);
00594 
00596   ~mootHMM(void) { clear(true,false); };
00598 
00599   /*------------------------------------------------------------*/
00607   void clear(bool wipe_everything=true, bool unlogify=false);
00609 
00610   /*------------------------------------------------------------*/
00614   bool save(const char *filename, int compression_level=MOOT_DEFAULT_COMPRESSION);
00615 
00617   bool save(mootio::mostream *obs, const char *filename=NULL);
00618 
00620   bool _bindump(mootio::mostream *obs, const char *filename=NULL);
00621 
00623   bool load(const char *filename=NULL);
00624 
00626   bool load(mootio::mistream *ibs, const char *filename=NULL);
00627 
00629   bool _binload(mootio::mistream *ibs, const char *filename=NULL);
00631 
00632   /*------------------------------------------------------------*/
00636   inline void unknown_token_name(const mootTokString &name)
00637   {
00638     tokids.unknown_name(name);
00639   };
00640 
00642   inline void unknown_tag_name(const mootTokString &name)
00643   {
00644     tagids.unknown_name(name);
00645   };
00646 
00647   /*
00648    * Set lexical class to use for tokens without user-specified analyses.
00649    * Really just an alias for 'uclass' datum.
00650    */
00651   inline void unknown_class_name(const mootTagSet &tagset)
00652   {
00653     tagset2lexclass(tagset,&uclass,false);
00654   };
00656 
00657 
00658   //------------------------------------------------------------
00674   bool load_model(const string &modelname,
00675                   const mootTagString &start_tag_str="__$",
00676                   const char *myname="mootHMM::load_model()",
00677                   bool  do_estimate_nglambdas=true,
00678                   bool  do_estimate_wlambdas=true,
00679                   bool  do_estimate_clambdas=true,
00680                   bool  do_build_suffix_trie=true,
00681                   bool  do_compute_logprobs=true);
00682 
00688   bool compile(const mootLexfreqs &lexfreqs,
00689                const mootNgrams &ngrams,
00690                const mootClassfreqs &classfreqs,
00691                const mootTagString &start_tag_str="__$");
00692 
00694   void assign_ids_lf(const mootLexfreqs &lexfreqs);
00695 
00697   void assign_ids_ng(const mootNgrams   &ngrams);
00698 
00700   void assign_ids_cf(const mootClassfreqs &classfreqs);
00701 
00703   void compile_unknown_lexclass(const mootClassfreqs &classfreqs);
00704 
00706   bool estimate_lambdas(const mootNgrams &ngrams);
00707 
00709   bool estimate_wlambdas(const mootLexfreqs &lf);
00710 
00712   bool estimate_clambdas(const mootClassfreqs &cf);
00713 
00715   bool build_suffix_trie(const mootLexfreqs &lf,
00716                          const mootNgrams   &ng,
00717                          bool  verbose=false)
00718   { return suftrie.build(lf,ng,tagids,start_tagid,verbose); };
00719 
00721   bool compute_logprobs(void);
00723 
00724   //------------------------------------------------------------
00725   // Tagging: Top-level
00728 
00730   void tag_io(TokenReader *reader, TokenWriter *writer)
00731   {
00732     int rtok;
00733     mootSentence *sent;
00734     while (reader && (rtok = reader->get_sentence()) != TokTypeEOF) {
00735       sent = reader->sentence();
00736       if (!sent) continue;
00737       tag_sentence(*sent);
00738 
00739 #ifdef MOOT_DEBUG_ENABLED
00740       if (save_dump_trellis) viterbi_txtdump(writer, sent->size()+1);
00741 #endif
00742 
00743       if (writer) writer->put_sentence(*sent);
00744     }
00745   };
00746 
00748   //void tag_strings(int argc, char **argv, FILE *out=stdout);
00749 
00755   inline void tag_sentence(mootSentence &sentence) {
00756     viterbi_clear();
00757     for (mootSentence::const_iterator si = sentence.begin();
00758          si != sentence.end();
00759          si++)
00760       {
00761         viterbi_step(*si);
00762         if (ndots && (ntokens % ndots)==0) fputc('.', stderr);
00763       }
00764     viterbi_finish();
00765     tag_mark_best(sentence);
00766     nsents++;
00767   };
00769 
00770   /*====================================================================
00771    * VITERBI: Mid-level
00772    *====================================================================*/
00775 
00776   //------------------------------------------------------------
00777   // Viterbi: Mid-level: clear
00779   void viterbi_clear(void);
00780 
00781   //------------------------------------------------------------
00782   // Viterbi: single iteration: (mootToken)
00787   inline void viterbi_step(const mootToken &token) {
00788     if (token.toktype() != TokTypeVanilla) return; //-- ignore non-vanilla tokens
00789     ntokens++;
00790     LexClass tok_class;
00791     for (mootToken::Analyses::const_iterator ani = token.analyses().begin();
00792          ani != token.analyses().end();
00793          ani++)
00794       {
00795         tok_class.insert(tagids.name2id(ani->tag));
00796       }
00797     viterbi_step(token2id(token.text()), tok_class, token.text());
00798   };
00799 
00800   //------------------------------------------------------------
00801   // Viterbi: single iteration: (TokID,LexClass=set<ClassID>)
00807   inline void viterbi_step(TokID tokid,
00808                            const LexClass &lexclass,
00809                            const mootTokString &toktext="")
00810   {
00811     if (use_lex_classes) {
00812       if (lexclass.empty()) {
00813         nunclassed++;
00814         viterbi_step(tokid, 0, uclass, toktext);
00815       } else {
00816         //-- non-empty class : get ID (assign empty distribution if unknown)
00817         ClassID classid = class2id(lexclass,0,1);
00818         viterbi_step(tokid, classid, lexclass, toktext);
00819       }
00820     } else {
00821       //-- !use_lex_classes
00822       if (lexclass.empty()) {
00823         nunclassed++;
00824         viterbi_step(tokid, toktext);
00825       } else {
00826         viterbi_step(tokid, 0, lexclass, toktext);
00827       }
00828     }
00829   };
00830 
00831   //------------------------------------------------------------
00832   // Viterbi: single iteration: (TokID,ClassID,LexClass)
00837   void viterbi_step(TokID tokid,
00838                     ClassID classid,
00839                     const LexClass &lclass,
00840                     const mootTokString &toktext="");
00841 
00842   //------------------------------------------------------------
00843   // Viterbi: single iteration: (TokID)
00850   void viterbi_step(TokID tokid, const mootTokString &toktext="");
00851 
00852   //------------------------------------------------------------
00853   // Viterbi: single iteration: (TokString)
00860   inline void viterbi_step(const mootTokString &token_text) {
00861     return viterbi_step(token2id(token_text), token_text);
00862   };
00863 
00864   //------------------------------------------------------------
00865   // Viterbi: single iteration: (TokString,set<TagString>)
00871   inline void viterbi_step(const mootTokString &token_text, const set<mootTagString> &tags)
00872   {
00873     LexClass lclass;
00874     tagset2lexclass(tags,&lclass);
00875     viterbi_step(token2id(token_text), lclass, token_text);
00876   };
00877 
00878   //------------------------------------------------------------
00879   // Viterbi: single iteration: (TokID,TagID,col=NULL)
00883   void viterbi_step(TokID tokid, TagID tagid, ViterbiColumn *col=NULL);
00884 
00885   //------------------------------------------------------------
00886   // Viterbi: single iteration: (TokString,TagString)
00892   inline void viterbi_step(const mootTokString &toktext, const mootTagString &tag)
00893   {
00894     return viterbi_step(token2id(toktext), tagids.name2id(tag));
00895   };
00896 
00897 
00898   //------------------------------------------------------------
00899   // Viterbi: finish
00903   inline void viterbi_finish(const TagID final_tagid)
00904   {
00905     viterbi_step(0, final_tagid);
00906   };
00907 
00911   inline void viterbi_finish(void)
00912   {
00913     viterbi_step(0, start_tagid);
00914   };
00915 
00926   void tag_mark_best(mootSentence &sentence);
00928 
00929 
00930   //------------------------------------------------------------
00931   // Viterbi: Low/Mid-level: path utilities
00934 
00936   inline ViterbiPathNode *viterbi_best_path(void)
00937   {
00938     return viterbi_node_path(viterbi_best_node());
00939   };
00940 
00942   inline ViterbiPathNode *viterbi_best_path(TagID tagid)
00943   {
00944     return viterbi_node_path(viterbi_best_node(tagid));
00945   };
00946 
00948   inline ViterbiPathNode *viterbi_best_path(const mootTagString &tagstr)
00949   {
00950     return viterbi_best_path(tagids.name2id(tagstr));
00951   };
00952 
00959   inline ViterbiNode *viterbi_best_node(void)
00960   {
00961     ViterbiNode *pnod;
00962     vbestpr = MOOT_PROB_NEG;
00963     vbestpn = NULL;
00964 
00965 #ifdef MOOT_USE_TRIGRAMS
00966     ViterbiRow  *prow;
00967     for (prow = vtable->rows; prow != NULL; prow = prow->row_next) {
00968       for (pnod = prow->nodes; pnod != NULL; pnod = pnod->nod_next) {
00969         if (pnod->lprob > vbestpr) {
00970           vbestpr = pnod->lprob;
00971           vbestpn = pnod;
00972         }
00973       }
00974     }
00975 #else // !MOOT_USE_TRIGRAMS
00976     for (pnod = vtable->rows; pnod != NULL; pnod = pnod->nod_next) {
00977       if (pnod->lprob > vbestpr) {
00978         vbestpr = pnod->lprob;
00979         vbestpn = pnod;
00980       }
00981     }
00982 #endif // MOOT_USE_TRIGRAMS
00983     return vbestpn;
00984   };
00985 
00992   inline ViterbiNode *viterbi_best_node(TagID tagid)
00993   {
00994     ViterbiNode *pnod;
00995     vbestpr = MOOT_PROB_NEG;
00996 #ifdef MOOT_USE_TRIGRAMS
00997     ViterbiRow  *prow;
00998     vbestpn = NULL;
00999     for (prow = vtable->rows; prow != NULL; prow = prow->row_next) {
01000       if (prow->tagid == tagid) {
01001         for (pnod = prow->nodes; pnod != NULL; pnod = pnod->nod_next) {
01002           if (pnod->lprob > vbestpr) {
01003             vbestpr = pnod->lprob;
01004             vbestpn = pnod;
01005           }
01006         }
01007         return vbestpn;
01008       }
01009     }
01010 #else // !MOOT_USE_TRIGRAMS
01011     for (pnod = vtable->rows; pnod != NULL; pnod = pnod->nod_next) {
01012       if (pnod->tagid == tagid) return pnod;
01013     }
01014 #endif // MOOT_USE_TRIGRAMS
01015     return NULL;
01016   };
01017  
01018   //------------------------------------------------------------
01019   // Viterbi: Low/Mid: node-to-path conversion
01027   inline ViterbiPathNode *viterbi_node_path(ViterbiNode *node)
01028   {
01029     viterbi_clear_bestpath();
01030     ViterbiPathNode *pnod; 
01031     for ( ; node != NULL; node = node->pth_prev) {
01032       pnod            = viterbi_get_pathnode();
01033       pnod->node      = node;
01034       pnod->path_next = vbestpath;
01035       vbestpath       = pnod;
01036     }
01037     return vbestpath;
01038   };
01040 
01041   //------------------------------------------------------------
01042   // public methods: low-level: Viterbi
01043 
01045   //{@
01046 
01048   inline bool viterbi_column_ok(const ViterbiColumn *col) const {
01049     return (col
01050             && col->rows 
01051 #ifdef MOOT_USE_TRIGRAMS
01052             && col->rows->nodes
01053 #endif
01054             );
01055   };
01056 
01066   inline ViterbiColumn *viterbi_populate_row(TagID curtagid,
01067                                              ProbT wordpr=MOOT_PROB_ONE,
01068                                              ViterbiColumn *col=NULL,
01069                                              ProbT probmin=MOOT_PROB_NONE)
01070   {
01071 #ifdef MOOT_USE_TRIGRAMS
01072     ViterbiRow  *prow, *row = viterbi_get_row();
01073     ViterbiNode *pnod, *nod = NULL;
01074 
01075     if (!col) {
01076       col           = viterbi_get_column();
01077       col->rows     = NULL;
01078       col->bbestpr  = MOOT_PROB_NEG;
01079       if (vtable) col->bpprmin = vtable->bbestpr - beamwd;
01080       else        col->bpprmin = MOOT_PROB_NEG;
01081     }
01082     if (probmin != MOOT_PROB_NONE) col->bpprmin = probmin;
01083     col->col_prev = vtable;
01084     row->nodes = NULL;
01085     row->wprob = wordpr;
01086 
01087     for (prow = vtable->rows; prow != NULL; prow = prow->row_next) {
01088       vbestpr = MOOT_PROB_NEG;
01089       vbestpn = NULL;
01090 
01091       for (pnod = prow->nodes; pnod != NULL; pnod = pnod->nod_next) {
01092         //-- beam pruning
01093         if (beamwd && pnod->lprob < col->bpprmin) continue;
01094 
01095         //-- probability lookup
01096         vtagpr = pnod->lprob + tagp(pnod->ptagid, prow->tagid, curtagid);
01097         if (vtagpr > vbestpr
01098 # ifdef MOOT_LEX_IS_TIEBREAKER
01099             || (vtagpr == vbestpr && wordpr > prow->wprob)
01100 # endif
01101             ) 
01102           {
01103             vbestpr = vtagpr;
01104             vbestpn = pnod;
01105           }
01106       }
01107 
01108       //-- set node information
01109       if (vbestpn != NULL) {
01110         nod = viterbi_get_node();
01111         nod->tagid    = curtagid;
01112         nod->ptagid   = prow->tagid;
01113         nod->lprob    = vbestpr + wordpr;
01114         nod->pth_prev = vbestpn;
01115         nod->nod_next = row->nodes;
01116 
01117         row->nodes    = nod;
01118 
01119         //-- save beam information
01120         if (nod->lprob > col->bbestpr) col->bbestpr = nod->lprob;
01121       }
01122     }
01123 
01124     //-- set row information
01125     row->tagid    = curtagid;
01126     row->row_next = col->rows;
01127     col->rows     = row;
01128 
01129 #else 
01130 
01131     ViterbiNode *pnod, *nod = NULL;
01132 
01133     if (!col) {
01134       col           = viterbi_get_column();
01135       col->rows     = NULL;
01136       col->bbestpr  = MOOT_PROB_NEG;
01137       if (vtable) col->bpprmin = vtable->bbestpr - beamwd;
01138       else        col->bpprmin = MOOT_PROB_NEG;
01139     }
01140     if (probmin != MOOT_PROB_NONE) col->bpprmin = probmin;
01141     col->col_prev = vtable;
01142 
01143     vbestpr = MOOT_PROB_NEG;
01144     vbestpn = NULL;
01145 
01146     for (pnod = vtable->rows; pnod != NULL; pnod = pnod->nod_next) {
01147       //-- beam pruning
01148       if (beamwd && pnod->lprob < col->bpprmin) continue;
01149 
01150       //-- probability lookup
01151       vtagpr = pnod->lprob + tagp(pnod->tagid, curtagid);
01152       if (vtagpr > vbestpr
01153 # ifdef MOOT_LEX_IS_TIEBREAKER
01154           || (vtagpr == vbestpr && wordpr > pnod->wprob)
01155 # endif
01156           )
01157         {
01158           vbestpr = vtagpr;
01159           vbestpn = pnod;
01160         }
01161     }
01162 
01163     //-- set node/row information
01164     nod           = viterbi_get_node();
01165     nod->tagid    = curtagid;
01166     nod->wprob    = wordpr;
01167     nod->lprob    = vbestpr + wordpr;
01168     nod->pth_prev = vbestpn;
01169     nod->nod_next = col->rows;
01170 
01171     //-- set row/col information
01172     nod->nod_next = col->rows;
01173     col->rows     = nod;
01174 
01175     //-- save beam information
01176     if (nod->lprob > col->bbestpr) col->bbestpr = nod->lprob;
01177 
01178 #endif // MOOT_USE_TRIGRAMS
01179 
01180     return col;
01181   };
01182 
01183 
01184   //------------------------------------------------------------
01185   // Viterbi: Low-level: clear best-path
01187   inline void viterbi_clear_bestpath(void)
01188   {
01189     //-- move to trash: path-nodes
01190     ViterbiPathNode *pnod, *pnod_next;
01191     for (pnod = vbestpath; pnod != NULL; pnod = pnod_next) {
01192       pnod_next       = pnod->path_next;
01193       pnod->path_next = trash_pathnodes;
01194       trash_pathnodes = pnod;
01195     }
01196     vbestpath = NULL;
01197   };
01198 
01199   //------------------------------------------------------------
01200   // Viterbi: fallback
01206   void _viterbi_step_fallback(TokID tokid, ViterbiColumn *col);
01208 
01209 
01210   //------------------------------------------------------------
01215   inline ViterbiNode *viterbi_get_node(void) {
01216     ViterbiNode *nod;
01217     if (trash_nodes != NULL) {
01218       nod         = trash_nodes;
01219       trash_nodes = nod->nod_next;
01220     } else {
01221       nod = new ViterbiNode();
01222     }
01223     return nod;
01224   };
01225 
01226   //------------------------------------------------------------
01227   // Viterbi: trash utilities: Rows
01229   inline ViterbiRow *viterbi_get_row(void) {
01230 #ifdef MOOT_USE_TRIGRAMS
01231     ViterbiRow *row;
01232     if (trash_rows != NULL) {
01233       row        = trash_rows;
01234       trash_rows = row->row_next;
01235     } else {
01236       row = new ViterbiRow();
01237     }
01238     return row;
01239 #else
01240     return viterbi_get_node();
01241 #endif //MOOT_USE_TRIGRAMS
01242   };
01243 
01244   //------------------------------------------------------------
01245   // Viterbi: trash utilities: columns
01247   inline ViterbiColumn *viterbi_get_column(void) {
01248     ViterbiColumn *col;
01249     if (trash_columns != NULL) {
01250       col           = trash_columns;
01251       trash_columns = col->col_prev;
01252     } else {
01253       col = new ViterbiColumn();
01254     }
01255     return col;
01256   };
01257 
01258   //------------------------------------------------------------
01259   // Viterbi: trash utilities: path-nodes
01261   inline ViterbiPathNode *viterbi_get_pathnode(void) {
01262     ViterbiPathNode *pnod;
01263     if (trash_pathnodes != NULL) {
01264       pnod            = trash_pathnodes;
01265       trash_pathnodes = pnod->path_next;
01266     } else {
01267       pnod = new ViterbiPathNode();
01268     }
01269     return pnod;
01270   };
01272 
01273 
01274 
01275   //------------------------------------------------------------
01276   // Low-level: ID Lookup
01280   inline TokID token2id(const mootTokString &token) const
01281   {
01282 #ifdef MOOT_LEX_NONALPHA
01283     TokID tokid = tokids.name2id(token);
01284     return tokid ? tokid : flavids[tokenFlavor(token)];
01285 #else
01286     mootTokenFlavor flav = tokenFlavor(token);
01287     return flavids[flav]==0 ? tokids.name2id(token) : flavids[flav];
01288 #endif
01289   };
01290 
01301   inline LexClass *tagset2lexclass(const mootTagSet &tagset,
01302                                    LexClass *lclass=NULL,
01303                                    bool add_tagids=false)
01304   {
01305     if (!lclass) lclass = new LexClass();
01306     //-- ... for all tags in the class (utsi)
01307     for (mootTagSet::const_iterator tsi = tagset.begin();
01308          tsi != tagset.end();
01309          tsi++)
01310       {
01311         //-- lookup or assign a tag id
01312         TagID tagid = tagids.name2id(*tsi);
01313         if (add_tagids && tagid==0) tagid = tagids.insert(*tsi);
01314 
01315         //-- insert tagid into lexical class
01316         lclass->insert(tagid);
01317       }
01318     return lclass;
01319   };
01320 
01321 
01327   inline ClassID class2id(const LexClass &lclass,
01328                           bool autopopulate=true,
01329                           bool autocreate=true)
01330   {
01331     ClassID cid = classids.name2id(lclass);
01332     if (cid == 0) {
01333       nnewclasses++;
01334       if (!autopopulate && !autocreate) return cid;  //-- map unknown classes to zero
01335 
01336       //-- previously unknown class: fill 'er up with default values
01337       cid = classids.insert(lclass);
01338       if (cid >= lcprobs.size()) {
01339         n_classes = cid+1;
01340 
01341         //-- resize() should really happen 2 lines down,
01342         //   but that might break something : test this at some point!
01343         lcprobs.resize(n_classes);
01344       }
01345       if (autopopulate) {
01346         LexClassProbSubTable &lcps = lcprobs[cid];
01347         if (!lclass.empty()) {
01348           //-- non-empty class: restrict population to class-members
01349           ProbT lcprob = log(1.0/((ProbT)lclass.size()));
01350 
01351           for (LexClass::const_iterator lci = lclass.begin(); lci != lclass.end(); lci++) {
01352             lcps[*lci] = lcprob;
01353           }
01354         } else {
01355           //-- empty class: use class for "unknown" token instead [HACK!]
01356           const LexProbSubTable &lps = lexprobs[0];
01357           ProbT lpprob = log(1.0/((ProbT)lps.size()));
01358 
01359           for (LexProbSubTable::const_iterator lpsi = lps.begin(); lpsi != lps.end(); lpsi++) {
01360             lcps[lpsi->key()] = lpprob;
01361           }
01362         }
01363       }
01364     }
01365     return cid;
01366   };
01368 
01369 
01370   //------------------------------------------------------------
01373 
01374   /*------------------------------------------------------------
01375    * Lexical Probability Lookup
01376    */
01378   /*
01379   inline void get_wordp_maps(TokID   tokid,
01380                              ClassID classid,
01381                              const mootTokString &toktext,
01382                              LexProbSubTable* const* primary,
01383                              LexProbSubTable* const* secondary)
01384     const
01385   {
01386     if (tokid != 0 || !use_lex_classes) {
01387       if (primary)   *primary   = &lexprobs[tokid];
01388       if (secondary) *secondary = NULL;
01389     }
01390     else if (use_lex_classes) {
01391       if (primary)   *primary   = &lcprobs[classid];
01392       if (secondary) {
01393         size_t matchlen;
01394         *secondary = &(suftrie.sufprobs(toktext,&matchlen));
01395         if (matchlen == 0) *secondary = NULL;
01396       }
01397     }
01398     else { //-- tokid==0 && !use_lex_classes
01399       if (primary) {
01400         size_t matchlen;
01401         *primary = &(suftrie.sufprobs(toktext,&matchlen));
01402         if (matchlen == 0) {
01403           *primary = &lexprobs[tokid];
01404           *secondary = NULL;
01405         }
01406         else if (secondary) {
01407           *secondary = &lexprobs[tokid];
01408         }
01409       }
01410     }
01411   };
01412   */
01413 
01414 
01419   inline const ProbT wordp(const TokID tokid, const TagID tagid) const
01420   {
01421     if (tokid >= lexprobs.size()) return MOOT_PROB_ZERO;
01422     const LexProbSubTable &lps = lexprobs[tokid];
01423     LexProbSubTable::const_iterator lpsi = lps.find(tagid);
01424     return lpsi != lps.end() ? lpsi->value() : MOOT_PROB_ZERO;
01425   };
01426 
01433   inline const ProbT wordp(const mootTokString token, const mootTagString tag) const
01434   {
01435     return wordp(token2id(token), tagids.name2id(tag));
01436   };
01437 
01438   /*------------------------------------------------------------
01439    * Lexical-Class Probability Lookup
01440    */
01444   inline const ProbT classp(const ClassID classid, const TagID tagid) const
01445   {
01446     if (classid >= lcprobs.size()) return MOOT_PROB_ZERO;
01447     const LexClassProbSubTable &lps = lcprobs[classid];
01448     LexClassProbSubTable::const_iterator lpsi = lps.find(tagid);
01449     return lpsi != lps.end() ? lpsi->value() : MOOT_PROB_ZERO;
01450   };
01451 
01458   inline const ProbT classp(const LexClass &lclass, const mootTagString tag) const
01459   {
01460     return classp(classids.name2id(lclass), tagids.name2id(tag));
01461   };
01462 
01463   /*------------------------------------------------------------
01464    * Unigram Probability Lookup
01465    */
01469   inline const ProbT tagp(const TagID tagid) const
01470   {
01471     return
01472 #ifdef MOOT_USE_TRIGRAMS
01473       tagp(0,0,tagid);
01474 #else
01475       ngprobs2 && tagid < n_tags
01476       ? ngprobs2[tagid]
01477       : MOOT_PROB_ZERO;
01478 #endif // MOOT_USE_TRIGRAMS
01479   };
01480 
01486   inline const ProbT tagp(const mootTagString &tag) const
01487   {
01488     return tagp(tagids.name2id(tag));
01489   };
01490 
01491   /*------------------------------------------------------------
01492    * Bigram Probability Lookup
01493    */
01498   inline const ProbT tagp(const TagID prevtagid, const TagID tagid) const
01499   {
01500     return
01501 #ifdef MOOT_USE_TRIGRAMS
01502       tagp(0,prevtagid,tagid);
01503 #else
01504       ngprobs2 && prevtagid < n_tags && tagid < n_tags
01505       ? ngprobs2[(n_tags*prevtagid)+tagid]
01506       : MOOT_PROB_ZERO;
01507 #endif
01508   };
01509 
01515   inline const ProbT tagp(const mootTagString &prevtag, const mootTagString &tag) const
01516   {
01517     return tagp(tagids.name2id(prevtag), tagids.name2id(tag));
01518   };
01519 
01520   /*------------------------------------------------------------
01521    * Trigram probability lookup
01522    */
01523 #ifdef MOOT_USE_TRIGRAMS
01524 
01530 #ifdef MOOT_HASH_TRIGRAMS
01531   inline const ProbT tagp(const Trigram &trigram, ProbT ProbZero=MOOT_PROB_ZERO) const
01532   {
01533     TrigramProbTable::const_iterator tgti = ngprobs3.find(trigram);
01534     return tgti != ngprobs3.end() ? tgti->second : ProbZero;
01535   };
01536 #endif //MOOT_HASH_TRIGRAMS
01537 
01544   inline const ProbT tagp(const TagID prevtagid2, const TagID prevtagid1, const TagID tagid) const
01545   {
01546     return
01547 #ifdef MOOT_HASH_TRIGRAMS
01548       tagp(Trigram(prevtagid2,prevtagid1,tagid))
01549 #else
01550       ngprobs3 && prevtagid2 < n_tags && prevtagid1 < n_tags && tagid < n_tags
01551       ? ngprobs3[(n_tags*((n_tags*prevtagid2)+prevtagid1))+tagid]
01552       : MOOT_PROB_ZERO;
01553 #endif
01554       ;
01555   };
01556 
01564   inline const ProbT tagp(const mootTagString &prevtag2,
01565                           const mootTagString &prevtag1,
01566                           const mootTagString &tag)
01567     const
01568   {
01569     return tagp(tagids.name2id(prevtag2), tagids.name2id(prevtag1), tagids.name2id(tag));
01570   };
01571 #endif // MOOT_USE_TRIGRAMS
01572 
01573 
01574 
01575   //------------------------------------------------------------
01576   // Error Reporting
01577 
01581   void carp(char *fmt, ...);
01583 
01584   //------------------------------------------------------------
01585   // public methods: low-level: debugging
01586 
01590   void txtdump(FILE *file);
01591 
01593   void viterbi_txtdump(TokenWriter *w, int ncols=0);
01594 
01596   void viterbi_txtdump_col(TokenWriter *w, ViterbiColumn *col, int colnum=0);
01598 };
01599 
01600 moot_END_NAMESPACE
01601 
01602 #endif /* _MOOT_HMM_H */

Generated on Sat Sep 17 01:20:33 2005 for libmoot by  doxygen 1.4.4