Main Page | Directories | Namespace List | Class Hierarchy | Alphabetical List | Class List | File List | Namespace Members | Class Members | File Members

mootHMM.h

Go to the documentation of this file.
00001 /* -*- Mode: C++ -*- */
00002 
00003 /*
00004    libmoot : moocow's part-of-speech tagging library
00005    Copyright (C) 2003-2004 by Bryan Jurish <moocow@ling.uni-potsdam.de>
00006 
00007    This library is free software; you can redistribute it and/or
00008    modify it under the terms of the GNU Lesser General Public
00009    License as published by the Free Software Foundation; either
00010    version 2.1 of the License, or (at your option) any later version.
00011    
00012    This library is distributed in the hope that it will be useful,
00013    but WITHOUT ANY WARRANTY; without even the implied warranty of
00014    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
00015    Lesser General Public License for more details.
00016    
00017    You should have received a copy of the GNU Lesser General Public
00018    License along with this library; if not, write to the Free Software
00019    Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
00020 */
00021 
00022 /*--------------------------------------------------------------------------
00023  * File: mootHMM.h
00024  * Author: Bryan Jurish <moocow@ling.uni-potsdam.de>
00025  * Description:
00026  *   + moot PoS tagger : Hidden Markov Model (Disambiguator): headers
00027  *--------------------------------------------------------------------------*/
00028 
00029 #ifndef _MOOT_HMM_H
00030 #define _MOOT_HMM_H
00031 
00032 #ifdef __GNUC__
00033 # include <float.h>
00034 #endif // __GNUC__
00035 
00036 #include <string.h>
00037 #include <ctype.h>
00038 
00039 #include <mootTypes.h>
00040 #include <mootIO.h>
00041 #include <mootZIO.h>
00042 #include <mootToken.h>
00043 #include <mootTokenIO.h>
00044 #include <mootLexfreqs.h>
00045 #include <mootClassfreqs.h>
00046 #include <mootNgrams.h>
00047 #include <mootEnum.h>
00048 #include <mootAssocVector.h>
00049 #include <mootSuffixTrie.h>
00050 
00082 #define MOOT_LEX_UNKNOWN_TOKENS
00083 //#undef MOOT_LEX_UNKNOWN_TOKENS
00084 
00091 #define MOOT_LEX_UNKNOWN_CLASSES
00092 //#undef MOOT_LEX_UNKNOWN_CLASSES
00093 
00100 #define MOOT_LEX_NONALPHA
00101 //#undef MOOT_LEX_NONALPHA
00102 
00107 #undef MOOT_LEX_IS_TIEBREAKER
00108 
00109 moot_BEGIN_NAMESPACE
00110 
00111 /*--------------------------------------------------------------------------
00112  * mootHMM : HMM class
00113  *--------------------------------------------------------------------------*/
00114 
00121 class mootHMM {
00122 public:
00123   /*---------------------------------------------------------------------*/
00126 
/**
 * Verbosity levels.
 * The enumerators take the default consecutive values, from
 * vlSilent (0) up to vlEverything (4), in declaration order.
 */
enum VerbosityLevel {
  vlSilent,
  vlErrors,
  vlWarnings,
  vlProgress,
  vlEverything
};
00135 
00136 
// Identifier type for tags (alias of mootEnumID).
00138   typedef mootEnumID TagID;
00139 
// Identifier type for tokens (alias of mootEnumID).
00141   typedef mootEnumID TokID;
00142 
// Identifier type for lexical classes (alias of mootEnumID).
00147   typedef mootEnumID ClassID;
00149 
00150   /*------------------------------------------------------------
00151    * public typedefs : lexical classes
00152    */
00154 
00155 
// A lexical class is represented as a set of tag IDs.
00160   typedef set<TagID> LexClass;
00161 
00163   struct LexClassHash {
00164   public:
00165     inline size_t operator()(const LexClass &x) const {
00166       size_t hv = 0;
00167       for (LexClass::const_iterator xi = x.begin(); xi != x.end(); xi++) {
00168         hv = 5*hv + *xi;
00169       }
00170       return hv;
00171     };
00172   };
00174   struct LexClassEqual {
00175   public:
00176     inline size_t operator()(const LexClass &x, const LexClass &y) const {
00177       return x==y;
00178     };
00179   };
00181 
00182   /*---------------------------------------------------------------------*/
00185   
// Enumeration table keyed by tag strings (see member 'tagids').
00187   typedef mootEnum<mootTagString,
00188                     hash<mootTagString>,
00189                     equal_to<mootTagString> >
00190           TagIDTable;
00191 
// Enumeration table keyed by token strings (see member 'tokids').
00193   typedef mootEnum<mootTokString,
00194                     hash<mootTokString>,
00195                     equal_to<mootTokString> >
00196           TokIDTable;
00197 
// Enumeration table keyed by lexical classes, using the functors declared above.
00199   typedef mootEnum<LexClass,
00200                    LexClassHash,
00201                    LexClassEqual>
00202           ClassIDTable;
00203 
// Per-entry probability table: associates a TagID with a ProbT.
00205   //typedef map<TagID,ProbT> LexProbSubTable;
00206   typedef AssocVector<TagID,ProbT> LexProbSubTable;
00207 
// Same representation, used for lexical-class entries.
00212   typedef LexProbSubTable LexClassProbSubTable;
00213 
// Vector of sub-tables; wordp() indexes it by TokID.
00217   typedef vector<LexProbSubTable> LexProbTable;
00218 
// Same representation; classp() indexes it by ClassID.
00233   typedef LexProbTable LexClassProbTable;
00234 
// Raw ProbT array for bigram probabilities.
// NOTE(review): the indexing scheme is not visible in this part of the file.
00246   typedef ProbT *BigramProbTable;
00247 
00248 #if defined(MOOT_USE_TRIGRAMS)
00249 # if defined(MOOT_HASH_TRIGRAMS)
00250 
00251   class Trigram {
00252   public:
00253 
00255     struct HashFcn {
00256     public:
00257       inline size_t operator()(const Trigram &x) const
00258       {
00259         return
00260           (0xdeece66d * ((0xdeece66d * x.tag1) + x.tag2)) + x.tag3;
00261       };
00262     };
00263 
00265     struct EqualFcn {
00266     public:
00267       inline size_t operator()(const Trigram &x, const Trigram &y) const
00268       {
00269         return 
00270           x.tag1==y.tag1 && x.tag2==y.tag2 && x.tag3==y.tag3;
00271         //x==y;
00272       };
00273     };
00274 
00275   public:
00276     TagID tag1;  
00277     TagID tag2;  
00278     TagID tag3;  
00279 
00280   public:
00282     Trigram(TagID t1=0, TagID t2=0, TagID t3=0)
00283       : tag1(t1), tag2(t2), tag3(t3)
00284     {};
00285 
00287     ~Trigram(void) {};
00288   };
00289 
// Hashed variant: maps a Trigram key to a ProbT via hash_map.
00292   typedef
00293     hash_map<Trigram,ProbT,
00294              Trigram::HashFcn,
00295              Trigram::EqualFcn>
00296     TrigramProbTable;
00297 
00298 # else 
00299 
00300 
// Non-hashed variant: a raw ProbT array.
// NOTE(review): the indexing scheme is not visible in this part of the file.
00318   typedef ProbT* TrigramProbTable;
00319 # endif // MOOT_HASH_TRIGRAMS
00320 
00321 #endif // MOOT_USE_TRIGRAMS
00322 
00323 
00324   /*---------------------------------------------------------------------*/
00327 
00335   class ViterbiNode {
00336   public:
00337     TagID tagid;                  
00338 #ifdef MOOT_USE_TRIGRAMS
00339     TagID ptagid;                 
00340 #else
00341     ProbT wprob;                  
00342 #endif
00343     ProbT lprob;                  
00344 
00345     class ViterbiNode *pth_prev;  
00346     class ViterbiNode *nod_next;  
00347   };
00348 
00349 #ifdef MOOT_USE_TRIGRAMS
00350 
00355   class ViterbiRow {
00356   public:
00357     TagID  tagid;                 
00358     ProbT  wprob;                 
00359     class ViterbiNode *nodes;     
00360     class ViterbiRow  *row_next;  
00361   };
00362 #else
00363   typedef ViterbiNode ViterbiRow; 
00364 #endif
00365 
00366 
00373   class ViterbiColumn {
00374   public:
00375     ViterbiRow    *rows;     
00376     ViterbiColumn *col_prev; 
00377     ProbT          bbestpr;  
00378     ProbT          bpprmin;  
00379   };
00380 
00397   struct ViterbiPathNode {
00398   public:
00399     ViterbiNode      *node;      
00400     ViterbiPathNode  *path_next; 
00401   };
00403 
00404 
00405 public:
00406   /*---------------------------------------------------------------------*/
00413   int verbose;  // verbosity setting; presumably a VerbosityLevel value -- TODO confirm
00414 
00419   size_t ndots;  // tag_sentence() prints '.' to stderr whenever ntokens % ndots == 0; 0 disables
00420 
00424   bool save_ambiguities;
00425 
00429   bool save_flavors;
00430 
00434   bool save_mark_unknown;
00435 
00439   bool save_dump_trellis;  // tag_io() dumps the trellis per sentence if set (MOOT_DEBUG_ENABLED builds only)
00441 
00442   /*---------------------------------------------------------------------*/
00445 
00453   bool      use_lex_classes;  // selects the class-based branch in viterbi_step(TokID,LexClass,...)
00454 
00461   TagID     start_tagid;  // tag used by viterbi_finish(void); also handed to suftrie.build()
00462 
00471   ProbT     unknown_lex_threshhold;
00472 
00481   ProbT     unknown_class_threshhold;
00482 
00488   LexClass   uclass;  // class substituted for tokens with no analyses when use_lex_classes is set
00490 
00491   /*---------------------------------------------------------------------*/
// NOTE(review): the lambdas below are presumably smoothing coefficients set by the
// estimate_*lambdas() methods; they are not used in the visible code -- confirm.
00494   ProbT             nglambda1;    
00495   ProbT             nglambda2;    
00496 #ifdef MOOT_USE_TRIGRAMS
00497   ProbT             nglambda3;    
00498 #endif
00499   ProbT             wlambda0;     
00500   ProbT             wlambda1;     
00502   ProbT             clambda0;     
00503   ProbT             clambda1;     
00510   ProbT             beamwd;  // beam width: 0 disables pruning in viterbi_populate_row()
00512 
00513   /*---------------------------------------------------------------------*/
00516   TokIDTable        tokids;     // token string -> TokID lookup
00517   TagIDTable        tagids;     // tag string -> TagID lookup
00518   ClassIDTable      classids;   // LexClass -> ClassID lookup
00520   /* TokenFlavor to TokenID lookup table for non-alphabetics */
00521   TokID             flavids[NTokFlavors];
00523 
00524   /*---------------------------------------------------------------------*/
00527   size_t            n_tags;     
00528   size_t            n_toks;     
00529   size_t            n_classes;  // set to cid+1 by class2id() when lcprobs has to grow
00531   LexProbTable      lexprobs;   // indexed by TokID, see wordp()
00532   LexClassProbTable lcprobs;    // indexed by ClassID, see classp()
00533 #ifdef MOOT_USE_TRIGRAMS
00534   TrigramProbTable  ngprobs3;   
00535 #else
00536   BigramProbTable   ngprobs2;   
00537 #endif
00538 
00539   SuffixTrie        suftrie;    // built by build_suffix_trie()
00541 
00542   /*---------------------------------------------------------------------*/
00545   ViterbiColumn     *vtable;    // column whose rows serve as predecessors in viterbi_populate_row()
00547 
00548   /*---------------------------------------------------------------------*/
00551   size_t             nsents;      // incremented once per tag_sentence() call
00552   size_t             ntokens;     // incremented per vanilla token in viterbi_step(mootToken)
00553   size_t             nnewtokens;  
00554   size_t             nunclassed;  // incremented per token whose lexical class is empty
00555   size_t             nnewclasses; // incremented by class2id() per lookup of an unknown class
00556   size_t             nunknown;    
00557   size_t             nfallbacks;  
00559 
00560 protected:
00561   /*---------------------------------------------------------------------*/
// Recycle lists: the viterbi_get_*() helpers pop from these before allocating.
00564   ViterbiNode     *trash_nodes;     // linked through nod_next
00565 #ifdef MOOT_USE_TRIGRAMS
00566   ViterbiRow      *trash_rows;      // linked through row_next
00567 #endif
00568   ViterbiColumn   *trash_columns;   // linked through col_prev
00569   ViterbiPathNode *trash_pathnodes; // linked through path_next
00571 
00572   /*---------------------------------------------------------------------*/
// Scratch members used by the inline Viterbi routines.
00575   TagID             vtagid;     
00576   ProbT             vbestpr;    // best score found so far in the current search
00577   ProbT             vtagpr;     // candidate score being evaluated in viterbi_populate_row()
00578   ProbT             vwordpr;    
00579   ViterbiNode      *vbestpn;    // node that achieved vbestpr, or NULL
00581   ViterbiPathNode  *vbestpath;  // path list built by viterbi_node_path()
00583   //ProbT           bbestpr;   /**< Best current (log-)probability for beam pruning */
00584   //ProbT           bpprmin;   /**< Minimum previous probability for beam pruning */
00586 
00587 public:
00588   /*---------------------------------------------------------------------*/
00592   mootHMM(void);
00593 
00595   ~mootHMM(void) { clear(true,false); };
00597 
00598   /*------------------------------------------------------------*/
00606   void clear(bool wipe_everything=true, bool unlogify=false);
00608 
00609   /*------------------------------------------------------------*/
00613   bool save(const char *filename, int compression_level=MOOT_DEFAULT_COMPRESSION);
00614 
00616   bool save(mootio::mostream *obs, const char *filename=NULL);
00617 
00619   bool _bindump(mootio::mostream *obs, const char *filename=NULL);
00620 
00622   bool load(const char *filename=NULL);
00623 
00625   bool load(mootio::mistream *ibs, const char *filename=NULL);
00626 
00628   bool _binload(mootio::mistream *ibs, const char *filename=NULL);
00630 
00631   /*------------------------------------------------------------*/
00635   inline void unknown_token_name(const mootTokString &name)
00636   {
00637     tokids.unknown_name(name);
00638   };
00639 
00641   inline void unknown_tag_name(const mootTokString &name)
00642   {
00643     tagids.unknown_name(name);
00644   };
00645 
00646   /*
00647    * Set lexical class to use for tokens without user-specified analyses.
00648    * Really just an alias for 'uclass' datum.
00649    */
00650   inline void unknown_class_name(const mootTagSet &tagset)
00651   {
00652     tagset2lexclass(tagset,&uclass,false);
00653   };
00655 
00656 
00657   //------------------------------------------------------------
00673   bool load_model(const string &modelname,
00674                   const mootTagString &start_tag_str="__$",
00675                   const char *myname="mootHMM::load_model()",
00676                   bool  do_estimate_nglambdas=true,
00677                   bool  do_estimate_wlambdas=true,
00678                   bool  do_estimate_clambdas=true,
00679                   bool  do_build_suffix_trie=true,
00680                   bool  do_compute_logprobs=true);
00681 
00687   bool compile(const mootLexfreqs &lexfreqs,
00688                const mootNgrams &ngrams,
00689                const mootClassfreqs &classfreqs,
00690                const mootTagString &start_tag_str="__$");
00691 
00693   void assign_ids_lf(const mootLexfreqs &lexfreqs);
00694 
00696   void assign_ids_ng(const mootNgrams   &ngrams);
00697 
00699   void assign_ids_cf(const mootClassfreqs &classfreqs);
00700 
00702   void compile_unknown_lexclass(const mootClassfreqs &classfreqs);
00703 
00705   bool estimate_lambdas(const mootNgrams &ngrams);
00706 
00708   bool estimate_wlambdas(const mootLexfreqs &lf);
00709 
00711   bool estimate_clambdas(const mootClassfreqs &cf);
00712 
00714   bool build_suffix_trie(const mootLexfreqs &lf,
00715                          const mootNgrams   &ng,
00716                          bool  verbose=false)
00717   { return suftrie.build(lf,ng,tagids,start_tagid,verbose); };
00718 
00720   bool compute_logprobs(void);
00722 
00723   //------------------------------------------------------------
00724   // Tagging: Top-level
00727 
00729   void tag_io(TokenReader *reader, TokenWriter *writer)
00730   {
00731     int rtok;
00732     mootSentence *sent;
00733     while (reader && (rtok = reader->get_sentence()) != TokTypeEOF) {
00734       sent = reader->sentence();
00735       if (!sent) continue;
00736       tag_sentence(*sent);
00737 
00738 #ifdef MOOT_DEBUG_ENABLED
00739       if (save_dump_trellis) viterbi_txtdump(writer, sent->size()+1);
00740 #endif
00741 
00742       if (writer) writer->put_sentence(*sent);
00743     }
00744   };
00745 
00747   //void tag_strings(int argc, char **argv, FILE *out=stdout);
00748 
00754   inline void tag_sentence(mootSentence &sentence) {
00755     viterbi_clear();
00756     for (mootSentence::const_iterator si = sentence.begin();
00757          si != sentence.end();
00758          si++)
00759       {
00760         viterbi_step(*si);
00761         if (ndots && (ntokens % ndots)==0) fputc('.', stderr);
00762       }
00763     viterbi_finish();
00764     tag_mark_best(sentence);
00765     nsents++;
00766   };
00768 
00769   /*====================================================================
00770    * VITERBI: Mid-level
00771    *====================================================================*/
00774 
00775   //------------------------------------------------------------
00776   // Viterbi: Mid-level: clear
00778   void viterbi_clear(void);
00779 
00780   //------------------------------------------------------------
00781   // Viterbi: single iteration: (mootToken)
00786   inline void viterbi_step(const mootToken &token) {
00787     if (token.toktype() != TokTypeVanilla) return; //-- ignore non-vanilla tokens
00788     ntokens++;
00789     LexClass tok_class;
00790     for (mootToken::Analyses::const_iterator ani = token.analyses().begin();
00791          ani != token.analyses().end();
00792          ani++)
00793       {
00794         tok_class.insert(tagids.name2id(ani->tag));
00795       }
00796     viterbi_step(token2id(token.text()), tok_class, token.text());
00797   };
00798 
00799   //------------------------------------------------------------
00800   // Viterbi: single iteration: (TokID,LexClass=set<ClassID>)
00806   inline void viterbi_step(TokID tokid,
00807                            const LexClass &lexclass,
00808                            const mootTokString &toktext="")
00809   {
00810     if (use_lex_classes) {
00811       if (lexclass.empty()) {
00812         nunclassed++;
00813         viterbi_step(tokid, 0, uclass, toktext);
00814       } else {
00815         //-- non-empty class : get ID (assign empty distribution if unknown)
00816         ClassID classid = class2id(lexclass,0,1);
00817         viterbi_step(tokid, classid, lexclass, toktext);
00818       }
00819     } else {
00820       //-- !use_lex_classes
00821       if (lexclass.empty()) {
00822         nunclassed++;
00823         viterbi_step(tokid, toktext);
00824       } else {
00825         viterbi_step(tokid, 0, lexclass, toktext);
00826       }
00827     }
00828   };
00829 
00830   //------------------------------------------------------------
00831   // Viterbi: single iteration: (TokID,ClassID,LexClass)
00836   void viterbi_step(TokID tokid,
00837                     ClassID classid,
00838                     const LexClass &lclass,
00839                     const mootTokString &toktext="");
00840 
00841   //------------------------------------------------------------
00842   // Viterbi: single iteration: (TokID)
00849   void viterbi_step(TokID tokid, const mootTokString &toktext="");
00850 
00851   //------------------------------------------------------------
00852   // Viterbi: single iteration: (TokString)
00859   inline void viterbi_step(const mootTokString &token_text) {
00860     return viterbi_step(token2id(token_text), token_text);
00861   };
00862 
00863   //------------------------------------------------------------
00864   // Viterbi: single iteration: (TokString,set<TagString>)
00870   inline void viterbi_step(const mootTokString &token_text, const set<mootTagString> &tags)
00871   {
00872     LexClass lclass;
00873     tagset2lexclass(tags,&lclass);
00874     viterbi_step(token2id(token_text), lclass, token_text);
00875   };
00876 
00877   //------------------------------------------------------------
00878   // Viterbi: single iteration: (TokID,TagID,col=NULL)
00882   void viterbi_step(TokID tokid, TagID tagid, ViterbiColumn *col=NULL);
00883 
00884   //------------------------------------------------------------
00885   // Viterbi: single iteration: (TokString,TagString)
00891   inline void viterbi_step(const mootTokString &toktext, const mootTagString &tag)
00892   {
00893     return viterbi_step(token2id(toktext), tagids.name2id(tag));
00894   };
00895 
00896 
00897   //------------------------------------------------------------
00898   // Viterbi: finish
00902   inline void viterbi_finish(const TagID final_tagid)
00903   {
00904     viterbi_step(0, final_tagid);
00905   };
00906 
00910   inline void viterbi_finish(void)
00911   {
00912     viterbi_step(0, start_tagid);
00913   };
00914 
00925   void tag_mark_best(mootSentence &sentence);
00927 
00928 
00929   //------------------------------------------------------------
00930   // Viterbi: Low/Mid-level: path utilities
00933 
00935   inline ViterbiPathNode *viterbi_best_path(void)
00936   {
00937     return viterbi_node_path(viterbi_best_node());
00938   };
00939 
00941   inline ViterbiPathNode *viterbi_best_path(TagID tagid)
00942   {
00943     return viterbi_node_path(viterbi_best_node(tagid));
00944   };
00945 
00947   inline ViterbiPathNode *viterbi_best_path(const mootTagString &tagstr)
00948   {
00949     return viterbi_best_path(tagids.name2id(tagstr));
00950   };
00951 
00958   inline ViterbiNode *viterbi_best_node(void)
00959   {
00960     ViterbiNode *pnod;
00961     vbestpr = MOOT_PROB_NEG;
00962     vbestpn = NULL;
00963 
00964 #ifdef MOOT_USE_TRIGRAMS
00965     ViterbiRow  *prow;
00966     for (prow = vtable->rows; prow != NULL; prow = prow->row_next) {
00967       for (pnod = prow->nodes; pnod != NULL; pnod = pnod->nod_next) {
00968         if (pnod->lprob > vbestpr) {
00969           vbestpr = pnod->lprob;
00970           vbestpn = pnod;
00971         }
00972       }
00973     }
00974 #else // !MOOT_USE_TRIGRAMS
00975     for (pnod = vtable->rows; pnod != NULL; pnod = pnod->nod_next) {
00976       if (pnod->lprob > vbestpr) {
00977         vbestpr = pnod->lprob;
00978         vbestpn = pnod;
00979       }
00980     }
00981 #endif // MOOT_USE_TRIGRAMS
00982     return vbestpn;
00983   };
00984 
00991   inline ViterbiNode *viterbi_best_node(TagID tagid)
00992   {
00993     ViterbiNode *pnod;
00994     vbestpr = MOOT_PROB_NEG;
00995 #ifdef MOOT_USE_TRIGRAMS
00996     ViterbiRow  *prow;
00997     vbestpn = NULL;
00998     for (prow = vtable->rows; prow != NULL; prow = prow->row_next) {
00999       if (prow->tagid == tagid) {
01000         for (pnod = prow->nodes; pnod != NULL; pnod = pnod->nod_next) {
01001           if (pnod->lprob > vbestpr) {
01002             vbestpr = pnod->lprob;
01003             vbestpn = pnod;
01004           }
01005         }
01006         return vbestpn;
01007       }
01008     }
01009 #else // !MOOT_USE_TRIGRAMS
01010     for (pnod = vtable->rows; pnod != NULL; pnod = pnod->nod_next) {
01011       if (pnod->tagid == tagid) return pnod;
01012     }
01013 #endif // MOOT_USE_TRIGRAMS
01014     return NULL;
01015   };
01016  
01017   //------------------------------------------------------------
01018   // Viterbi: Low/Mid: node-to-path conversion
01026   inline ViterbiPathNode *viterbi_node_path(ViterbiNode *node)
01027   {
01028     viterbi_clear_bestpath();
01029     ViterbiPathNode *pnod; 
01030     for ( ; node != NULL; node = node->pth_prev) {
01031       pnod            = viterbi_get_pathnode();
01032       pnod->node      = node;
01033       pnod->path_next = vbestpath;
01034       vbestpath       = pnod;
01035     }
01036     return vbestpath;
01037   };
01039 
01040   //------------------------------------------------------------
01041   // public methods: low-level: Viterbi
01042 
01044   //{@
01045 
01047   inline bool viterbi_column_ok(const ViterbiColumn *col) const {
01048     return (col
01049             && col->rows 
01050 #ifdef MOOT_USE_TRIGRAMS
01051             && col->rows->nodes
01052 #endif
01053             );
01054   };
01055 
01065   inline ViterbiColumn *viterbi_populate_row(TagID curtagid,
01066                                              ProbT wordpr=MOOT_PROB_ONE,
01067                                              ViterbiColumn *col=NULL,
01068                                              ProbT probmin=MOOT_PROB_NONE)
01069   {
01070 #ifdef MOOT_USE_TRIGRAMS
01071     ViterbiRow  *prow, *row = viterbi_get_row();
01072     ViterbiNode *pnod, *nod = NULL;
01073 
01074     if (!col) {
01075       col           = viterbi_get_column();
01076       col->rows     = NULL;
01077       col->bbestpr  = MOOT_PROB_NEG;
01078       if (vtable) col->bpprmin = vtable->bbestpr - beamwd;
01079       else        col->bpprmin = MOOT_PROB_NEG;
01080     }
01081     if (probmin != MOOT_PROB_NONE) col->bpprmin = probmin;
01082     col->col_prev = vtable;
01083     row->nodes = NULL;
01084     row->wprob = wordpr;
01085 
01086     for (prow = vtable->rows; prow != NULL; prow = prow->row_next) {
01087       vbestpr = MOOT_PROB_NEG;
01088       vbestpn = NULL;
01089 
01090       for (pnod = prow->nodes; pnod != NULL; pnod = pnod->nod_next) {
01091         //-- beam pruning
01092         if (beamwd && pnod->lprob < col->bpprmin) continue;
01093 
01094         //-- probability lookup
01095         vtagpr = pnod->lprob + tagp(pnod->ptagid, prow->tagid, curtagid);
01096         if (vtagpr > vbestpr
01097 # ifdef MOOT_LEX_IS_TIEBREAKER
01098             || (vtagpr == vbestpr && wordpr > prow->wprob)
01099 # endif
01100             ) 
01101           {
01102             vbestpr = vtagpr;
01103             vbestpn = pnod;
01104           }
01105       }
01106 
01107       //-- set node information
01108       if (vbestpn != NULL) {
01109         nod = viterbi_get_node();
01110         nod->tagid    = curtagid;
01111         nod->ptagid   = prow->tagid;
01112         nod->lprob    = vbestpr + wordpr;
01113         nod->pth_prev = vbestpn;
01114         nod->nod_next = row->nodes;
01115 
01116         row->nodes    = nod;
01117 
01118         //-- save beam information
01119         if (nod->lprob > col->bbestpr) col->bbestpr = nod->lprob;
01120       }
01121     }
01122 
01123     //-- set row information
01124     row->tagid    = curtagid;
01125     row->row_next = col->rows;
01126     col->rows     = row;
01127 
01128 #else 
01129 
01130     ViterbiNode *pnod, *nod = NULL;
01131 
01132     if (!col) {
01133       col           = viterbi_get_column();
01134       col->rows     = NULL;
01135       col->bbestpr  = MOOT_PROB_NEG;
01136       if (vtable) col->bpprmin = vtable->bbestpr - beamwd;
01137       else        col->bpprmin = MOOT_PROB_NEG;
01138     }
01139     if (probmin != MOOT_PROB_NONE) col->bpprmin = probmin;
01140     col->col_prev = vtable;
01141 
01142     vbestpr = MOOT_PROB_NEG;
01143     vbestpn = NULL;
01144 
01145     for (pnod = vtable->rows; pnod != NULL; pnod = pnod->nod_next) {
01146       //-- beam pruning
01147       if (beamwd && pnod->lprob < col->bpprmin) continue;
01148 
01149       //-- probability lookup
01150       vtagpr = pnod->lprob + tagp(pnod->tagid, curtagid);
01151       if (vtagpr > vbestpr
01152 # ifdef MOOT_LEX_IS_TIEBREAKER
01153           || (vtagpr == vbestpr && wordpr > pnod->wprob)
01154 # endif
01155           )
01156         {
01157           vbestpr = vtagpr;
01158           vbestpn = pnod;
01159         }
01160     }
01161 
01162     //-- set node/row information
01163     nod           = viterbi_get_node();
01164     nod->tagid    = curtagid;
01165     nod->wprob    = wordpr;
01166     nod->lprob    = vbestpr + wordpr;
01167     nod->pth_prev = vbestpn;
01168     nod->nod_next = col->rows;
01169 
01170     //-- set row/col information
01171     nod->nod_next = col->rows;
01172     col->rows     = nod;
01173 
01174     //-- save beam information
01175     if (nod->lprob > col->bbestpr) col->bbestpr = nod->lprob;
01176 
01177 #endif // MOOT_USE_TRIGRAMS
01178 
01179     return col;
01180   };
01181 
01182 
01183   //------------------------------------------------------------
01184   // Viterbi: Low-level: clear best-path
01186   inline void viterbi_clear_bestpath(void)
01187   {
01188     //-- move to trash: path-nodes
01189     ViterbiPathNode *pnod, *pnod_next;
01190     for (pnod = vbestpath; pnod != NULL; pnod = pnod_next) {
01191       pnod_next       = pnod->path_next;
01192       pnod->path_next = trash_pathnodes;
01193       trash_pathnodes = pnod;
01194     }
01195     vbestpath = NULL;
01196   };
01197 
01198   //------------------------------------------------------------
01199   // Viterbi: fallback
01205   void _viterbi_step_fallback(TokID tokid, ViterbiColumn *col);
01207 
01208 
01209   //------------------------------------------------------------
01214   inline ViterbiNode *viterbi_get_node(void) {
01215     ViterbiNode *nod;
01216     if (trash_nodes != NULL) {
01217       nod         = trash_nodes;
01218       trash_nodes = nod->nod_next;
01219     } else {
01220       nod = new ViterbiNode();
01221     }
01222     return nod;
01223   };
01224 
01225   //------------------------------------------------------------
01226   // Viterbi: trash utilities: Rows
01228   inline ViterbiRow *viterbi_get_row(void) {
01229 #ifdef MOOT_USE_TRIGRAMS
01230     ViterbiRow *row;
01231     if (trash_rows != NULL) {
01232       row        = trash_rows;
01233       trash_rows = row->row_next;
01234     } else {
01235       row = new ViterbiRow();
01236     }
01237     return row;
01238 #else
01239     return viterbi_get_node();
01240 #endif //MOOT_USE_TRIGRAMS
01241   };
01242 
01243   //------------------------------------------------------------
01244   // Viterbi: trash utilities: columns
01246   inline ViterbiColumn *viterbi_get_column(void) {
01247     ViterbiColumn *col;
01248     if (trash_columns != NULL) {
01249       col           = trash_columns;
01250       trash_columns = col->col_prev;
01251     } else {
01252       col = new ViterbiColumn();
01253     }
01254     return col;
01255   };
01256 
01257   //------------------------------------------------------------
01258   // Viterbi: trash utilities: path-nodes
01260   inline ViterbiPathNode *viterbi_get_pathnode(void) {
01261     ViterbiPathNode *pnod;
01262     if (trash_pathnodes != NULL) {
01263       pnod            = trash_pathnodes;
01264       trash_pathnodes = pnod->path_next;
01265     } else {
01266       pnod = new ViterbiPathNode();
01267     }
01268     return pnod;
01269   };
01271 
01272 
01273 
01274   //------------------------------------------------------------
01275   // Low-level: ID Lookup
01279   inline TokID token2id(const mootTokString &token) const
01280   {
01281 #ifdef MOOT_LEX_NONALPHA
01282     TokID tokid = tokids.name2id(token);
01283     return tokid ? tokid : flavids[tokenFlavor(token)];
01284 #else
01285     mootTokenFlavor flav = tokenFlavor(token);
01286     return flavids[flav]==0 ? tokids.name2id(token) : flavids[flav];
01287 #endif
01288   };
01289 
01300   inline LexClass *tagset2lexclass(const mootTagSet &tagset,
01301                                    LexClass *lclass=NULL,
01302                                    bool add_tagids=false)
01303   {
01304     if (!lclass) lclass = new LexClass();
01305     //-- ... for all tags in the class (utsi)
01306     for (mootTagSet::const_iterator tsi = tagset.begin();
01307          tsi != tagset.end();
01308          tsi++)
01309       {
01310         //-- lookup or assign a tag id
01311         TagID tagid = tagids.name2id(*tsi);
01312         if (add_tagids && tagid==0) tagid = tagids.insert(*tsi);
01313 
01314         //-- insert tagid into lexical class
01315         lclass->insert(tagid);
01316       }
01317     return lclass;
01318   };
01319 
01320 
01326   inline ClassID class2id(const LexClass &lclass,
01327                           bool autopopulate=true,
01328                           bool autocreate=true)
01329   {
01330     ClassID cid = classids.name2id(lclass);
01331     if (cid == 0) {
01332       nnewclasses++;
01333       if (!autopopulate && !autocreate) return cid;  //-- map unknown classes to zero
01334 
01335       //-- previously unknown class: fill 'er up with default values
01336       cid = classids.insert(lclass);
01337       if (cid >= lcprobs.size()) {
01338         n_classes = cid+1;
01339 
01340         //-- resize() should really happen 2 lines down,
01341         //   but that might break something : test this at some point!
01342         lcprobs.resize(n_classes);
01343       }
01344       if (autopopulate) {
01345         LexClassProbSubTable &lcps = lcprobs[cid];
01346         if (!lclass.empty()) {
01347           //-- non-empty class: restrict population to class-members
01348           ProbT lcprob = log(1.0/((ProbT)lclass.size()));
01349 
01350           for (LexClass::const_iterator lci = lclass.begin(); lci != lclass.end(); lci++) {
01351             lcps[*lci] = lcprob;
01352           }
01353         } else {
01354           //-- empty class: use class for "unknown" token instead [HACK!]
01355           const LexProbSubTable &lps = lexprobs[0];
01356           ProbT lpprob = log(1.0/((ProbT)lps.size()));
01357 
01358           for (LexProbSubTable::const_iterator lpsi = lps.begin(); lpsi != lps.end(); lpsi++) {
01359             lcps[lpsi->key()] = lpprob;
01360           }
01361         }
01362       }
01363     }
01364     return cid;
01365   };
01367 
01368 
01369   //------------------------------------------------------------
01372 
01373   /*------------------------------------------------------------
01374    * Lexical Probability Lookup
01375    */
01377   /*
01378   inline void get_wordp_maps(TokID   tokid,
01379                              ClassID classid,
01380                              const mootTokString &toktext,
01381                              LexProbSubTable* const* primary,
01382                              LexProbSubTable* const* secondary)
01383     const
01384   {
01385     if (tokid != 0 || !use_lex_classes) {
01386       if (primary)   *primary   = &lexprobs[tokid];
01387       if (secondary) *secondary = NULL;
01388     }
01389     else if (use_lex_classes) {
01390       if (primary)   *primary   = &lcprobs[classid];
01391       if (secondary) {
01392         size_t matchlen;
01393         *secondary = &(suftrie.sufprobs(toktext,&matchlen));
01394         if (matchlen == 0) *secondary = NULL;
01395       }
01396     }
01397     else { //-- tokid==0 && !use_lex_classes
01398       if (primary) {
01399         size_t matchlen;
01400         *primary = &(suftrie.sufprobs(toktext,&matchlen));
01401         if (matchlen == 0) {
01402           *primary = &lexprobs[tokid];
01403           *secondary = NULL;
01404         }
01405         else if (secondary) {
01406           *secondary = &lexprobs[tokid];
01407         }
01408       }
01409     }
01410   };
01411   */
01412 
01413 
01418   inline const ProbT wordp(const TokID tokid, const TagID tagid) const
01419   {
01420     if (tokid >= lexprobs.size()) return MOOT_PROB_ZERO;
01421     const LexProbSubTable &lps = lexprobs[tokid];
01422     LexProbSubTable::const_iterator lpsi = lps.find(tagid);
01423     return lpsi != lps.end() ? lpsi->value() : MOOT_PROB_ZERO;
01424   };
01425 
01432   inline const ProbT wordp(const mootTokString token, const mootTagString tag) const
01433   {
01434     return wordp(token2id(token), tagids.name2id(tag));
01435   };
01436 
01437   /*------------------------------------------------------------
01438    * Lexical-Class Probability Lookup
01439    */
01443   inline const ProbT classp(const ClassID classid, const TagID tagid) const
01444   {
01445     if (classid >= lcprobs.size()) return MOOT_PROB_ZERO;
01446     const LexClassProbSubTable &lps = lcprobs[classid];
01447     LexClassProbSubTable::const_iterator lpsi = lps.find(tagid);
01448     return lpsi != lps.end() ? lpsi->value() : MOOT_PROB_ZERO;
01449   };
01450 
01457   inline const ProbT classp(const LexClass &lclass, const mootTagString tag) const
01458   {
01459     return classp(classids.name2id(lclass), tagids.name2id(tag));
01460   };
01461 
01462   /*------------------------------------------------------------
01463    * Unigram Probability Lookup
01464    */
01468   inline const ProbT tagp(const TagID tagid) const
01469   {
01470     return
01471 #ifdef MOOT_USE_TRIGRAMS
01472       tagp(0,0,tagid);
01473 #else
01474       ngprobs2 && tagid < n_tags
01475       ? ngprobs2[tagid]
01476       : MOOT_PROB_ZERO;
01477 #endif // MOOT_USE_TRIGRAMS
01478   };
01479 
01485   inline const ProbT tagp(const mootTagString &tag) const
01486   {
01487     return tagp(tagids.name2id(tag));
01488   };
01489 
01490   /*------------------------------------------------------------
01491    * Bigram Probability Lookup
01492    */
01497   inline const ProbT tagp(const TagID prevtagid, const TagID tagid) const
01498   {
01499     return
01500 #ifdef MOOT_USE_TRIGRAMS
01501       tagp(0,prevtagid,tagid);
01502 #else
01503       ngprobs2 && prevtagid < n_tags && tagid < n_tags
01504       ? ngprobs2[(n_tags*prevtagid)+tagid]
01505       : MOOT_PROB_ZERO;
01506 #endif
01507   };
01508 
01514   inline const ProbT tagp(const mootTagString &prevtag, const mootTagString &tag) const
01515   {
01516     return tagp(tagids.name2id(prevtag), tagids.name2id(tag));
01517   };
01518 
01519   /*------------------------------------------------------------
01520    * Trigram probability lookup
01521    */
01522 #ifdef MOOT_USE_TRIGRAMS
01523 
01529 #ifdef MOOT_HASH_TRIGRAMS
01530   inline const ProbT tagp(const Trigram &trigram, ProbT ProbZero=MOOT_PROB_ZERO) const
01531   {
01532     TrigramProbTable::const_iterator tgti = ngprobs3.find(trigram);
01533     return tgti != ngprobs3.end() ? tgti->second : ProbZero;
01534   };
01535 #endif //MOOT_HASH_TRIGRAMS
01536 
01543   inline const ProbT tagp(const TagID prevtagid2, const TagID prevtagid1, const TagID tagid) const
01544   {
01545     return
01546 #ifdef MOOT_HASH_TRIGRAMS
01547       tagp(Trigram(prevtagid2,prevtagid1,tagid))
01548 #else
01549       ngprobs3 && prevtagid2 < n_tags && prevtagid1 < n_tags && tagid < n_tags
01550       ? ngprobs3[(n_tags*((n_tags*prevtagid2)+prevtagid1))+tagid]
01551       : MOOT_PROB_ZERO;
01552 #endif
01553       ;
01554   };
01555 
01563   inline const ProbT tagp(const mootTagString &prevtag2,
01564                           const mootTagString &prevtag1,
01565                           const mootTagString &tag)
01566     const
01567   {
01568     return tagp(tagids.name2id(prevtag2), tagids.name2id(prevtag1), tagids.name2id(tag));
01569   };
01570 #endif // MOOT_USE_TRIGRAMS
01571 
01572 
01573 
01574   //------------------------------------------------------------
01575   // Error Reporting
01576 
01580   void carp(char *fmt, ...); //-- error reporting: takes a format string plus variadic arguments (presumably printf-style -- confirm against the definition)
01582 
01583   //------------------------------------------------------------
01584   // public methods: low-level: debugging
01585 
01589   void txtdump(FILE *file); //-- debugging: dump to the stdio stream 'file' (presumably a text dump of the model -- confirm against the definition)
01590 
01592   void viterbi_txtdump(TokenWriter *w, int ncols=0); //-- debugging: dump via writer 'w'; 'ncols' defaults to 0 (presumably dumps the Viterbi state -- meaning of 'ncols' to be confirmed against the definition)
01593 
01595   void viterbi_txtdump_col(TokenWriter *w, ViterbiColumn *col, int colnum=0); //-- debugging: dump the single column 'col' via writer 'w'; 'colnum' defaults to 0 (presumably a label/index for the column -- confirm against the definition)
01597 };
01598 
01599 moot_END_NAMESPACE
01600 
01601 #endif /* _MOOT_HMM_H */

Generated on Mon Jun 27 13:05:25 2005 for libmoot by  doxygen 1.3.8-20040913