00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022
00023
00024
00025
00026
00027
00028
00029 #ifndef _MOOT_HMM_H
00030 #define _MOOT_HMM_H
00031
00032 #ifdef HAVE_FLOAT_H
00033 # include <float.h>
00034 #endif
00035
00036 #include <string.h>
00037 #include <ctype.h>
00038 #include <math.h>
00039
00040 #include <mootTypes.h>
00041 #include <mootIO.h>
00042 #include <mootZIO.h>
00043 #include <mootToken.h>
00044 #include <mootTokenIO.h>
00045 #include <mootLexfreqs.h>
00046 #include <mootClassfreqs.h>
00047 #include <mootNgrams.h>
00048 #include <mootEnum.h>
00049 #include <mootAssocVector.h>
00050 #include <mootSuffixTrie.h>
00051
00083 #define MOOT_LEX_UNKNOWN_TOKENS
00084
00085
00092 #define MOOT_LEX_UNKNOWN_CLASSES
00093
00094
00101 #define MOOT_LEX_NONALPHA
00102
00103
00108 #undef MOOT_LEX_IS_TIEBREAKER
00109
00110 moot_BEGIN_NAMESPACE
00111
00112
00113
00114
00115
00122 class mootHMM {
00123 public:
00124
00127
00129 typedef enum {
00130 vlSilent,
00131 vlErrors,
00132 vlWarnings,
00133 vlProgress,
00134 vlEverything
00135 } VerbosityLevel;
00136
00137
00139 typedef mootEnumID TagID;
00140
00142 typedef mootEnumID TokID;
00143
00148 typedef mootEnumID ClassID;
00150
00151
00152
00153
00155
00156
00161 typedef set<TagID> LexClass;
00162
00164 struct LexClassHash {
00165 public:
00166 inline size_t operator()(const LexClass &x) const {
00167 size_t hv = 0;
00168 for (LexClass::const_iterator xi = x.begin(); xi != x.end(); xi++) {
00169 hv = 5*hv + *xi;
00170 }
00171 return hv;
00172 };
00173 };
00175 struct LexClassEqual {
00176 public:
00177 inline size_t operator()(const LexClass &x, const LexClass &y) const {
00178 return x==y;
00179 };
00180 };
00182
00183
00186
00188 typedef mootEnum<mootTagString,
00189 hash<mootTagString>,
00190 equal_to<mootTagString> >
00191 TagIDTable;
00192
00194 typedef mootEnum<mootTokString,
00195 hash<mootTokString>,
00196 equal_to<mootTokString> >
00197 TokIDTable;
00198
00200 typedef mootEnum<LexClass,
00201 LexClassHash,
00202 LexClassEqual>
00203 ClassIDTable;
00204
00206
00207 typedef AssocVector<TagID,ProbT> LexProbSubTable;
00208
00213 typedef LexProbSubTable LexClassProbSubTable;
00214
00218 typedef vector<LexProbSubTable> LexProbTable;
00219
00234 typedef LexProbTable LexClassProbTable;
00235
00247 typedef ProbT *BigramProbTable;
00248
00249 #if defined(MOOT_USE_TRIGRAMS)
00250 # if defined(MOOT_HASH_TRIGRAMS)
00251
00252 class Trigram {
00253 public:
00254
00256 struct HashFcn {
00257 public:
00258 inline size_t operator()(const Trigram &x) const
00259 {
00260 return
00261 (0xdeece66d * ((0xdeece66d * x.tag1) + x.tag2)) + x.tag3;
00262 };
00263 };
00264
00266 struct EqualFcn {
00267 public:
00268 inline size_t operator()(const Trigram &x, const Trigram &y) const
00269 {
00270 return
00271 x.tag1==y.tag1 && x.tag2==y.tag2 && x.tag3==y.tag3;
00272
00273 };
00274 };
00275
00276 public:
00277 TagID tag1;
00278 TagID tag2;
00279 TagID tag3;
00280
00281 public:
00283 Trigram(TagID t1=0, TagID t2=0, TagID t3=0)
00284 : tag1(t1), tag2(t2), tag3(t3)
00285 {};
00286
00288 ~Trigram(void) {};
00289 };
00290
00293 typedef
00294 hash_map<Trigram,ProbT,
00295 Trigram::HashFcn,
00296 Trigram::EqualFcn>
00297 TrigramProbTable;
00298
00299 # else
00300
00301
00319 typedef ProbT* TrigramProbTable;
00320 # endif // MOOT_HASH_TRIGRAMS
00321
00322 #endif // MOOT_USE_TRIGRAMS
00323
00324
00325
00328
00336 class ViterbiNode {
00337 public:
00338 TagID tagid;
00339 #ifdef MOOT_USE_TRIGRAMS
00340 TagID ptagid;
00341 #else
00342 ProbT wprob;
00343 #endif
00344 ProbT lprob;
00345
00346 class ViterbiNode *pth_prev;
00347 class ViterbiNode *nod_next;
00348 };
00349
00350 #ifdef MOOT_USE_TRIGRAMS
00351
00356 class ViterbiRow {
00357 public:
00358 TagID tagid;
00359 ProbT wprob;
00360 class ViterbiNode *nodes;
00361 class ViterbiRow *row_next;
00362 };
00363 #else
00364 typedef ViterbiNode ViterbiRow;
00365 #endif
00366
00367
00374 class ViterbiColumn {
00375 public:
00376 ViterbiRow *rows;
00377 ViterbiColumn *col_prev;
00378 ProbT bbestpr;
00379 ProbT bpprmin;
00380 };
00381
00398 struct ViterbiPathNode {
00399 public:
00400 ViterbiNode *node;
00401 ViterbiPathNode *path_next;
00402 };
00404
00405
00406 public:
00407
00414 int verbose;
00415
00420 size_t ndots;
00421
00425 bool save_ambiguities;
00426
00430 bool save_flavors;
00431
00435 bool save_mark_unknown;
00436
00440 bool save_dump_trellis;
00442
00443
00446
00454 bool use_lex_classes;
00455
00462 TagID start_tagid;
00463
00472 ProbT unknown_lex_threshhold;
00473
00482 ProbT unknown_class_threshhold;
00483
00489 LexClass uclass;
00491
00492
00495 ProbT nglambda1;
00496 ProbT nglambda2;
00497 #ifdef MOOT_USE_TRIGRAMS
00498 ProbT nglambda3;
00499 #endif
00500 ProbT wlambda0;
00501 ProbT wlambda1;
00503 ProbT clambda0;
00504 ProbT clambda1;
00511 ProbT beamwd;
00513
00514
00517 TokIDTable tokids;
00518 TagIDTable tagids;
00519 ClassIDTable classids;
00521
00522 TokID flavids[NTokFlavors];
00524
00525
00528 size_t n_tags;
00529 size_t n_toks;
00530 size_t n_classes;
00532 LexProbTable lexprobs;
00533 LexClassProbTable lcprobs;
00534 #ifdef MOOT_USE_TRIGRAMS
00535 TrigramProbTable ngprobs3;
00536 #else
00537 BigramProbTable ngprobs2;
00538 #endif
00539
00540 SuffixTrie suftrie;
00542
00543
00546 ViterbiColumn *vtable;
00548
00549
00552 size_t nsents;
00553 size_t ntokens;
00554 size_t nnewtokens;
00555 size_t nunclassed;
00556 size_t nnewclasses;
00557 size_t nunknown;
00558 size_t nfallbacks;
00560
00561 protected:
00562
00565 ViterbiNode *trash_nodes;
00566 #ifdef MOOT_USE_TRIGRAMS
00567 ViterbiRow *trash_rows;
00568 #endif
00569 ViterbiColumn *trash_columns;
00570 ViterbiPathNode *trash_pathnodes;
00572
00573
00576 TagID vtagid;
00577 ProbT vbestpr;
00578 ProbT vtagpr;
00579 ProbT vwordpr;
00580 ViterbiNode *vbestpn;
00582 ViterbiPathNode *vbestpath;
00584
00585
00587
00588 public:
00589
00593 mootHMM(void);
00594
00596 ~mootHMM(void) { clear(true,false); };
00598
00599
00607 void clear(bool wipe_everything=true, bool unlogify=false);
00609
00610
00614 bool save(const char *filename, int compression_level=MOOT_DEFAULT_COMPRESSION);
00615
00617 bool save(mootio::mostream *obs, const char *filename=NULL);
00618
00620 bool _bindump(mootio::mostream *obs, const char *filename=NULL);
00621
00623 bool load(const char *filename=NULL);
00624
00626 bool load(mootio::mistream *ibs, const char *filename=NULL);
00627
00629 bool _binload(mootio::mistream *ibs, const char *filename=NULL);
00631
00632
00636 inline void unknown_token_name(const mootTokString &name)
00637 {
00638 tokids.unknown_name(name);
00639 };
00640
00642 inline void unknown_tag_name(const mootTokString &name)
00643 {
00644 tagids.unknown_name(name);
00645 };
00646
00647
00648
00649
00650
00651 inline void unknown_class_name(const mootTagSet &tagset)
00652 {
00653 tagset2lexclass(tagset,&uclass,false);
00654 };
00656
00657
00658
00674 bool load_model(const string &modelname,
00675 const mootTagString &start_tag_str="__$",
00676 const char *myname="mootHMM::load_model()",
00677 bool do_estimate_nglambdas=true,
00678 bool do_estimate_wlambdas=true,
00679 bool do_estimate_clambdas=true,
00680 bool do_build_suffix_trie=true,
00681 bool do_compute_logprobs=true);
00682
00688 bool compile(const mootLexfreqs &lexfreqs,
00689 const mootNgrams &ngrams,
00690 const mootClassfreqs &classfreqs,
00691 const mootTagString &start_tag_str="__$");
00692
00694 void assign_ids_lf(const mootLexfreqs &lexfreqs);
00695
00697 void assign_ids_ng(const mootNgrams &ngrams);
00698
00700 void assign_ids_cf(const mootClassfreqs &classfreqs);
00701
00703 void compile_unknown_lexclass(const mootClassfreqs &classfreqs);
00704
00706 bool estimate_lambdas(const mootNgrams &ngrams);
00707
00709 bool estimate_wlambdas(const mootLexfreqs &lf);
00710
00712 bool estimate_clambdas(const mootClassfreqs &cf);
00713
00715 bool build_suffix_trie(const mootLexfreqs &lf,
00716 const mootNgrams &ng,
00717 bool verbose=false)
00718 { return suftrie.build(lf,ng,tagids,start_tagid,verbose); };
00719
00721 bool compute_logprobs(void);
00723
00724
00725
00728
00730 void tag_io(TokenReader *reader, TokenWriter *writer)
00731 {
00732 int rtok;
00733 mootSentence *sent;
00734 while (reader && (rtok = reader->get_sentence()) != TokTypeEOF) {
00735 sent = reader->sentence();
00736 if (!sent) continue;
00737 tag_sentence(*sent);
00738
00739 #ifdef MOOT_DEBUG_ENABLED
00740 if (save_dump_trellis) viterbi_txtdump(writer, sent->size()+1);
00741 #endif
00742
00743 if (writer) writer->put_sentence(*sent);
00744 }
00745 };
00746
00748
00749
00755 inline void tag_sentence(mootSentence &sentence) {
00756 viterbi_clear();
00757 for (mootSentence::const_iterator si = sentence.begin();
00758 si != sentence.end();
00759 si++)
00760 {
00761 viterbi_step(*si);
00762 if (ndots && (ntokens % ndots)==0) fputc('.', stderr);
00763 }
00764 viterbi_finish();
00765 tag_mark_best(sentence);
00766 nsents++;
00767 };
00769
00770
00771
00772
00775
00776
00777
00779 void viterbi_clear(void);
00780
00781
00782
00787 inline void viterbi_step(const mootToken &token) {
00788 if (token.toktype() != TokTypeVanilla) return;
00789 ntokens++;
00790 LexClass tok_class;
00791 for (mootToken::Analyses::const_iterator ani = token.analyses().begin();
00792 ani != token.analyses().end();
00793 ani++)
00794 {
00795 tok_class.insert(tagids.name2id(ani->tag));
00796 }
00797 viterbi_step(token2id(token.text()), tok_class, token.text());
00798 };
00799
00800
00801
00807 inline void viterbi_step(TokID tokid,
00808 const LexClass &lexclass,
00809 const mootTokString &toktext="")
00810 {
00811 if (use_lex_classes) {
00812 if (lexclass.empty()) {
00813 nunclassed++;
00814 viterbi_step(tokid, 0, uclass, toktext);
00815 } else {
00816
00817 ClassID classid = class2id(lexclass,0,1);
00818 viterbi_step(tokid, classid, lexclass, toktext);
00819 }
00820 } else {
00821
00822 if (lexclass.empty()) {
00823 nunclassed++;
00824 viterbi_step(tokid, toktext);
00825 } else {
00826 viterbi_step(tokid, 0, lexclass, toktext);
00827 }
00828 }
00829 };
00830
00831
00832
00837 void viterbi_step(TokID tokid,
00838 ClassID classid,
00839 const LexClass &lclass,
00840 const mootTokString &toktext="");
00841
00842
00843
00850 void viterbi_step(TokID tokid, const mootTokString &toktext="");
00851
00852
00853
00860 inline void viterbi_step(const mootTokString &token_text) {
00861 return viterbi_step(token2id(token_text), token_text);
00862 };
00863
00864
00865
00871 inline void viterbi_step(const mootTokString &token_text, const set<mootTagString> &tags)
00872 {
00873 LexClass lclass;
00874 tagset2lexclass(tags,&lclass);
00875 viterbi_step(token2id(token_text), lclass, token_text);
00876 };
00877
00878
00879
00883 void viterbi_step(TokID tokid, TagID tagid, ViterbiColumn *col=NULL);
00884
00885
00886
00892 inline void viterbi_step(const mootTokString &toktext, const mootTagString &tag)
00893 {
00894 return viterbi_step(token2id(toktext), tagids.name2id(tag));
00895 };
00896
00897
00898
00899
00903 inline void viterbi_finish(const TagID final_tagid)
00904 {
00905 viterbi_step(0, final_tagid);
00906 };
00907
00911 inline void viterbi_finish(void)
00912 {
00913 viterbi_step(0, start_tagid);
00914 };
00915
00926 void tag_mark_best(mootSentence &sentence);
00928
00929
00930
00931
00934
00936 inline ViterbiPathNode *viterbi_best_path(void)
00937 {
00938 return viterbi_node_path(viterbi_best_node());
00939 };
00940
00942 inline ViterbiPathNode *viterbi_best_path(TagID tagid)
00943 {
00944 return viterbi_node_path(viterbi_best_node(tagid));
00945 };
00946
00948 inline ViterbiPathNode *viterbi_best_path(const mootTagString &tagstr)
00949 {
00950 return viterbi_best_path(tagids.name2id(tagstr));
00951 };
00952
00959 inline ViterbiNode *viterbi_best_node(void)
00960 {
00961 ViterbiNode *pnod;
00962 vbestpr = MOOT_PROB_NEG;
00963 vbestpn = NULL;
00964
00965 #ifdef MOOT_USE_TRIGRAMS
00966 ViterbiRow *prow;
00967 for (prow = vtable->rows; prow != NULL; prow = prow->row_next) {
00968 for (pnod = prow->nodes; pnod != NULL; pnod = pnod->nod_next) {
00969 if (pnod->lprob > vbestpr) {
00970 vbestpr = pnod->lprob;
00971 vbestpn = pnod;
00972 }
00973 }
00974 }
00975 #else // !MOOT_USE_TRIGRAMS
00976 for (pnod = vtable->rows; pnod != NULL; pnod = pnod->nod_next) {
00977 if (pnod->lprob > vbestpr) {
00978 vbestpr = pnod->lprob;
00979 vbestpn = pnod;
00980 }
00981 }
00982 #endif // MOOT_USE_TRIGRAMS
00983 return vbestpn;
00984 };
00985
00992 inline ViterbiNode *viterbi_best_node(TagID tagid)
00993 {
00994 ViterbiNode *pnod;
00995 vbestpr = MOOT_PROB_NEG;
00996 #ifdef MOOT_USE_TRIGRAMS
00997 ViterbiRow *prow;
00998 vbestpn = NULL;
00999 for (prow = vtable->rows; prow != NULL; prow = prow->row_next) {
01000 if (prow->tagid == tagid) {
01001 for (pnod = prow->nodes; pnod != NULL; pnod = pnod->nod_next) {
01002 if (pnod->lprob > vbestpr) {
01003 vbestpr = pnod->lprob;
01004 vbestpn = pnod;
01005 }
01006 }
01007 return vbestpn;
01008 }
01009 }
01010 #else // !MOOT_USE_TRIGRAMS
01011 for (pnod = vtable->rows; pnod != NULL; pnod = pnod->nod_next) {
01012 if (pnod->tagid == tagid) return pnod;
01013 }
01014 #endif // MOOT_USE_TRIGRAMS
01015 return NULL;
01016 };
01017
01018
01019
01027 inline ViterbiPathNode *viterbi_node_path(ViterbiNode *node)
01028 {
01029 viterbi_clear_bestpath();
01030 ViterbiPathNode *pnod;
01031 for ( ; node != NULL; node = node->pth_prev) {
01032 pnod = viterbi_get_pathnode();
01033 pnod->node = node;
01034 pnod->path_next = vbestpath;
01035 vbestpath = pnod;
01036 }
01037 return vbestpath;
01038 };
01040
01041
01042
01043
01045
01046
01048 inline bool viterbi_column_ok(const ViterbiColumn *col) const {
01049 return (col
01050 && col->rows
01051 #ifdef MOOT_USE_TRIGRAMS
01052 && col->rows->nodes
01053 #endif
01054 );
01055 };
01056
01066 inline ViterbiColumn *viterbi_populate_row(TagID curtagid,
01067 ProbT wordpr=MOOT_PROB_ONE,
01068 ViterbiColumn *col=NULL,
01069 ProbT probmin=MOOT_PROB_NONE)
01070 {
01071 #ifdef MOOT_USE_TRIGRAMS
01072 ViterbiRow *prow, *row = viterbi_get_row();
01073 ViterbiNode *pnod, *nod = NULL;
01074
01075 if (!col) {
01076 col = viterbi_get_column();
01077 col->rows = NULL;
01078 col->bbestpr = MOOT_PROB_NEG;
01079 if (vtable) col->bpprmin = vtable->bbestpr - beamwd;
01080 else col->bpprmin = MOOT_PROB_NEG;
01081 }
01082 if (probmin != MOOT_PROB_NONE) col->bpprmin = probmin;
01083 col->col_prev = vtable;
01084 row->nodes = NULL;
01085 row->wprob = wordpr;
01086
01087 for (prow = vtable->rows; prow != NULL; prow = prow->row_next) {
01088 vbestpr = MOOT_PROB_NEG;
01089 vbestpn = NULL;
01090
01091 for (pnod = prow->nodes; pnod != NULL; pnod = pnod->nod_next) {
01092
01093 if (beamwd && pnod->lprob < col->bpprmin) continue;
01094
01095
01096 vtagpr = pnod->lprob + tagp(pnod->ptagid, prow->tagid, curtagid);
01097 if (vtagpr > vbestpr
01098 # ifdef MOOT_LEX_IS_TIEBREAKER
01099 || (vtagpr == vbestpr && wordpr > prow->wprob)
01100 # endif
01101 )
01102 {
01103 vbestpr = vtagpr;
01104 vbestpn = pnod;
01105 }
01106 }
01107
01108
01109 if (vbestpn != NULL) {
01110 nod = viterbi_get_node();
01111 nod->tagid = curtagid;
01112 nod->ptagid = prow->tagid;
01113 nod->lprob = vbestpr + wordpr;
01114 nod->pth_prev = vbestpn;
01115 nod->nod_next = row->nodes;
01116
01117 row->nodes = nod;
01118
01119
01120 if (nod->lprob > col->bbestpr) col->bbestpr = nod->lprob;
01121 }
01122 }
01123
01124
01125 row->tagid = curtagid;
01126 row->row_next = col->rows;
01127 col->rows = row;
01128
01129 #else
01130
01131 ViterbiNode *pnod, *nod = NULL;
01132
01133 if (!col) {
01134 col = viterbi_get_column();
01135 col->rows = NULL;
01136 col->bbestpr = MOOT_PROB_NEG;
01137 if (vtable) col->bpprmin = vtable->bbestpr - beamwd;
01138 else col->bpprmin = MOOT_PROB_NEG;
01139 }
01140 if (probmin != MOOT_PROB_NONE) col->bpprmin = probmin;
01141 col->col_prev = vtable;
01142
01143 vbestpr = MOOT_PROB_NEG;
01144 vbestpn = NULL;
01145
01146 for (pnod = vtable->rows; pnod != NULL; pnod = pnod->nod_next) {
01147
01148 if (beamwd && pnod->lprob < col->bpprmin) continue;
01149
01150
01151 vtagpr = pnod->lprob + tagp(pnod->tagid, curtagid);
01152 if (vtagpr > vbestpr
01153 # ifdef MOOT_LEX_IS_TIEBREAKER
01154 || (vtagpr == vbestpr && wordpr > pnod->wprob)
01155 # endif
01156 )
01157 {
01158 vbestpr = vtagpr;
01159 vbestpn = pnod;
01160 }
01161 }
01162
01163
01164 nod = viterbi_get_node();
01165 nod->tagid = curtagid;
01166 nod->wprob = wordpr;
01167 nod->lprob = vbestpr + wordpr;
01168 nod->pth_prev = vbestpn;
01169 nod->nod_next = col->rows;
01170
01171
01172 nod->nod_next = col->rows;
01173 col->rows = nod;
01174
01175
01176 if (nod->lprob > col->bbestpr) col->bbestpr = nod->lprob;
01177
01178 #endif // MOOT_USE_TRIGRAMS
01179
01180 return col;
01181 };
01182
01183
01184
01185
01187 inline void viterbi_clear_bestpath(void)
01188 {
01189
01190 ViterbiPathNode *pnod, *pnod_next;
01191 for (pnod = vbestpath; pnod != NULL; pnod = pnod_next) {
01192 pnod_next = pnod->path_next;
01193 pnod->path_next = trash_pathnodes;
01194 trash_pathnodes = pnod;
01195 }
01196 vbestpath = NULL;
01197 };
01198
01199
01200
01206 void _viterbi_step_fallback(TokID tokid, ViterbiColumn *col);
01208
01209
01210
01215 inline ViterbiNode *viterbi_get_node(void) {
01216 ViterbiNode *nod;
01217 if (trash_nodes != NULL) {
01218 nod = trash_nodes;
01219 trash_nodes = nod->nod_next;
01220 } else {
01221 nod = new ViterbiNode();
01222 }
01223 return nod;
01224 };
01225
01226
01227
01229 inline ViterbiRow *viterbi_get_row(void) {
01230 #ifdef MOOT_USE_TRIGRAMS
01231 ViterbiRow *row;
01232 if (trash_rows != NULL) {
01233 row = trash_rows;
01234 trash_rows = row->row_next;
01235 } else {
01236 row = new ViterbiRow();
01237 }
01238 return row;
01239 #else
01240 return viterbi_get_node();
01241 #endif //MOOT_USE_TRIGRAMS
01242 };
01243
01244
01245
01247 inline ViterbiColumn *viterbi_get_column(void) {
01248 ViterbiColumn *col;
01249 if (trash_columns != NULL) {
01250 col = trash_columns;
01251 trash_columns = col->col_prev;
01252 } else {
01253 col = new ViterbiColumn();
01254 }
01255 return col;
01256 };
01257
01258
01259
01261 inline ViterbiPathNode *viterbi_get_pathnode(void) {
01262 ViterbiPathNode *pnod;
01263 if (trash_pathnodes != NULL) {
01264 pnod = trash_pathnodes;
01265 trash_pathnodes = pnod->path_next;
01266 } else {
01267 pnod = new ViterbiPathNode();
01268 }
01269 return pnod;
01270 };
01272
01273
01274
01275
01276
01280 inline TokID token2id(const mootTokString &token) const
01281 {
01282 #ifdef MOOT_LEX_NONALPHA
01283 TokID tokid = tokids.name2id(token);
01284 return tokid ? tokid : flavids[tokenFlavor(token)];
01285 #else
01286 mootTokenFlavor flav = tokenFlavor(token);
01287 return flavids[flav]==0 ? tokids.name2id(token) : flavids[flav];
01288 #endif
01289 };
01290
01301 inline LexClass *tagset2lexclass(const mootTagSet &tagset,
01302 LexClass *lclass=NULL,
01303 bool add_tagids=false)
01304 {
01305 if (!lclass) lclass = new LexClass();
01306
01307 for (mootTagSet::const_iterator tsi = tagset.begin();
01308 tsi != tagset.end();
01309 tsi++)
01310 {
01311
01312 TagID tagid = tagids.name2id(*tsi);
01313 if (add_tagids && tagid==0) tagid = tagids.insert(*tsi);
01314
01315
01316 lclass->insert(tagid);
01317 }
01318 return lclass;
01319 };
01320
01321
01327 inline ClassID class2id(const LexClass &lclass,
01328 bool autopopulate=true,
01329 bool autocreate=true)
01330 {
01331 ClassID cid = classids.name2id(lclass);
01332 if (cid == 0) {
01333 nnewclasses++;
01334 if (!autopopulate && !autocreate) return cid;
01335
01336
01337 cid = classids.insert(lclass);
01338 if (cid >= lcprobs.size()) {
01339 n_classes = cid+1;
01340
01341
01342
01343 lcprobs.resize(n_classes);
01344 }
01345 if (autopopulate) {
01346 LexClassProbSubTable &lcps = lcprobs[cid];
01347 if (!lclass.empty()) {
01348
01349 ProbT lcprob = log(1.0/((ProbT)lclass.size()));
01350
01351 for (LexClass::const_iterator lci = lclass.begin(); lci != lclass.end(); lci++) {
01352 lcps[*lci] = lcprob;
01353 }
01354 } else {
01355
01356 const LexProbSubTable &lps = lexprobs[0];
01357 ProbT lpprob = log(1.0/((ProbT)lps.size()));
01358
01359 for (LexProbSubTable::const_iterator lpsi = lps.begin(); lpsi != lps.end(); lpsi++) {
01360 lcps[lpsi->key()] = lpprob;
01361 }
01362 }
01363 }
01364 }
01365 return cid;
01366 };
01368
01369
01370
01373
01374
01375
01376
01378
01379
01380
01381
01382
01383
01384
01385
01386
01387
01388
01389
01390
01391
01392
01393
01394
01395
01396
01397
01398
01399
01400
01401
01402
01403
01404
01405
01406
01407
01408
01409
01410
01411
01412
01413
01414
01419 inline const ProbT wordp(const TokID tokid, const TagID tagid) const
01420 {
01421 if (tokid >= lexprobs.size()) return MOOT_PROB_ZERO;
01422 const LexProbSubTable &lps = lexprobs[tokid];
01423 LexProbSubTable::const_iterator lpsi = lps.find(tagid);
01424 return lpsi != lps.end() ? lpsi->value() : MOOT_PROB_ZERO;
01425 };
01426
01433 inline const ProbT wordp(const mootTokString token, const mootTagString tag) const
01434 {
01435 return wordp(token2id(token), tagids.name2id(tag));
01436 };
01437
01438
01439
01440
01444 inline const ProbT classp(const ClassID classid, const TagID tagid) const
01445 {
01446 if (classid >= lcprobs.size()) return MOOT_PROB_ZERO;
01447 const LexClassProbSubTable &lps = lcprobs[classid];
01448 LexClassProbSubTable::const_iterator lpsi = lps.find(tagid);
01449 return lpsi != lps.end() ? lpsi->value() : MOOT_PROB_ZERO;
01450 };
01451
01458 inline const ProbT classp(const LexClass &lclass, const mootTagString tag) const
01459 {
01460 return classp(classids.name2id(lclass), tagids.name2id(tag));
01461 };
01462
01463
01464
01465
01469 inline const ProbT tagp(const TagID tagid) const
01470 {
01471 return
01472 #ifdef MOOT_USE_TRIGRAMS
01473 tagp(0,0,tagid);
01474 #else
01475 ngprobs2 && tagid < n_tags
01476 ? ngprobs2[tagid]
01477 : MOOT_PROB_ZERO;
01478 #endif // MOOT_USE_TRIGRAMS
01479 };
01480
01486 inline const ProbT tagp(const mootTagString &tag) const
01487 {
01488 return tagp(tagids.name2id(tag));
01489 };
01490
01491
01492
01493
01498 inline const ProbT tagp(const TagID prevtagid, const TagID tagid) const
01499 {
01500 return
01501 #ifdef MOOT_USE_TRIGRAMS
01502 tagp(0,prevtagid,tagid);
01503 #else
01504 ngprobs2 && prevtagid < n_tags && tagid < n_tags
01505 ? ngprobs2[(n_tags*prevtagid)+tagid]
01506 : MOOT_PROB_ZERO;
01507 #endif
01508 };
01509
01515 inline const ProbT tagp(const mootTagString &prevtag, const mootTagString &tag) const
01516 {
01517 return tagp(tagids.name2id(prevtag), tagids.name2id(tag));
01518 };
01519
01520
01521
01522
01523 #ifdef MOOT_USE_TRIGRAMS
01524
01530 #ifdef MOOT_HASH_TRIGRAMS
01531 inline const ProbT tagp(const Trigram &trigram, ProbT ProbZero=MOOT_PROB_ZERO) const
01532 {
01533 TrigramProbTable::const_iterator tgti = ngprobs3.find(trigram);
01534 return tgti != ngprobs3.end() ? tgti->second : ProbZero;
01535 };
01536 #endif //MOOT_HASH_TRIGRAMS
01537
01544 inline const ProbT tagp(const TagID prevtagid2, const TagID prevtagid1, const TagID tagid) const
01545 {
01546 return
01547 #ifdef MOOT_HASH_TRIGRAMS
01548 tagp(Trigram(prevtagid2,prevtagid1,tagid))
01549 #else
01550 ngprobs3 && prevtagid2 < n_tags && prevtagid1 < n_tags && tagid < n_tags
01551 ? ngprobs3[(n_tags*((n_tags*prevtagid2)+prevtagid1))+tagid]
01552 : MOOT_PROB_ZERO;
01553 #endif
01554 ;
01555 };
01556
01564 inline const ProbT tagp(const mootTagString &prevtag2,
01565 const mootTagString &prevtag1,
01566 const mootTagString &tag)
01567 const
01568 {
01569 return tagp(tagids.name2id(prevtag2), tagids.name2id(prevtag1), tagids.name2id(tag));
01570 };
01571 #endif // MOOT_USE_TRIGRAMS
01572
01573
01574
01575
01576
01577
01581 void carp(char *fmt, ...);
01583
01584
01585
01586
01590 void txtdump(FILE *file);
01591
01593 void viterbi_txtdump(TokenWriter *w, int ncols=0);
01594
01596 void viterbi_txtdump_col(TokenWriter *w, ViterbiColumn *col, int colnum=0);
01598 };
01599
01600 moot_END_NAMESPACE
01601
01602 #endif