00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022
00023
00024
00025
00026
00027
00028
00029 #ifndef _MOOT_HMM_H
00030 #define _MOOT_HMM_H
00031
00032 #ifdef __GNUC__
00033 # include <float.h>
00034 #endif // __GNUC__
00035
00036 #include <string.h>
00037 #include <ctype.h>
00038
00039 #include <mootTypes.h>
00040 #include <mootIO.h>
00041 #include <mootZIO.h>
00042 #include <mootToken.h>
00043 #include <mootTokenIO.h>
00044 #include <mootLexfreqs.h>
00045 #include <mootClassfreqs.h>
00046 #include <mootNgrams.h>
00047 #include <mootEnum.h>
00048 #include <mootAssocVector.h>
00049 #include <mootSuffixTrie.h>
00050
00082 #define MOOT_LEX_UNKNOWN_TOKENS
00083
00084
00091 #define MOOT_LEX_UNKNOWN_CLASSES
00092
00093
00100 #define MOOT_LEX_NONALPHA
00101
00102
00107 #undef MOOT_LEX_IS_TIEBREAKER
00108
00109 moot_BEGIN_NAMESPACE
00110
00111
00112
00113
00114
00121 class mootHMM {
00122 public:
00123
00126
/// Verbosity levels, declared in increasing order from vlSilent to vlEverything.
/// NOTE(review): presumably compared against the `verbose` member -- confirm in the implementation file.
00128 typedef enum {
00129 vlSilent,
00130 vlErrors,
00131 vlWarnings,
00132 vlProgress,
00133 vlEverything
00134 } VerbosityLevel;
00135
00136
/// Numeric ID of a tag, as handed out by `tagids`.
00138 typedef mootEnumID TagID;
00139
/// Numeric ID of a token (word form), as handed out by `tokids`.
00141 typedef mootEnumID TokID;
00142
/// Numeric ID of a lexical class, as handed out by `classids`.
00147 typedef mootEnumID ClassID;
00149
00150
00151
00152
00154
00155
/// A lexical class: the set of tag IDs associated with a token.
00160 typedef set<TagID> LexClass;
00161
00163 struct LexClassHash {
00164 public:
00165 inline size_t operator()(const LexClass &x) const {
00166 size_t hv = 0;
00167 for (LexClass::const_iterator xi = x.begin(); xi != x.end(); xi++) {
00168 hv = 5*hv + *xi;
00169 }
00170 return hv;
00171 };
00172 };
00174 struct LexClassEqual {
00175 public:
00176 inline size_t operator()(const LexClass &x, const LexClass &y) const {
00177 return x==y;
00178 };
00179 };
00181
00182
00185
/// Enumeration table mapping tag strings to TagIDs and back (type of `tagids`).
00187 typedef mootEnum<mootTagString,
00188 hash<mootTagString>,
00189 equal_to<mootTagString> >
00190 TagIDTable;
00191
/// Enumeration table mapping token strings to TokIDs and back (type of `tokids`).
00193 typedef mootEnum<mootTokString,
00194 hash<mootTokString>,
00195 equal_to<mootTokString> >
00196 TokIDTable;
00197
/// Enumeration table mapping lexical classes (tag-ID sets) to ClassIDs (type of `classids`).
00199 typedef mootEnum<LexClass,
00200 LexClassHash,
00201 LexClassEqual>
00202 ClassIDTable;
00203
00205
/// Per-token table: TagID -> probability value (looked up by wordp()).
00206 typedef AssocVector<TagID,ProbT> LexProbSubTable;
00207
/// Per-class table: TagID -> probability value (looked up by classp()); same layout as LexProbSubTable.
00212 typedef LexProbSubTable LexClassProbSubTable;
00213
/// Lexical probability table: a vector of sub-tables indexed by TokID.
00217 typedef vector<LexProbSubTable> LexProbTable;
00218
/// Lexical-class probability table: a vector of sub-tables indexed by ClassID.
00233 typedef LexProbTable LexClassProbTable;
00234
/// Bigram probability table: a flat ProbT array; tagp() reads entry
/// (n_tags*prevtagid)+tagid, and entry [tagid] for unigrams.
00246 typedef ProbT *BigramProbTable;
00247
00248 #if defined(MOOT_USE_TRIGRAMS)
00249 # if defined(MOOT_HASH_TRIGRAMS)
00250
00251 class Trigram {
00252 public:
00253
00255 struct HashFcn {
00256 public:
00257 inline size_t operator()(const Trigram &x) const
00258 {
00259 return
00260 (0xdeece66d * ((0xdeece66d * x.tag1) + x.tag2)) + x.tag3;
00261 };
00262 };
00263
00265 struct EqualFcn {
00266 public:
00267 inline size_t operator()(const Trigram &x, const Trigram &y) const
00268 {
00269 return
00270 x.tag1==y.tag1 && x.tag2==y.tag2 && x.tag3==y.tag3;
00271
00272 };
00273 };
00274
00275 public:
00276 TagID tag1;
00277 TagID tag2;
00278 TagID tag3;
00279
00280 public:
00282 Trigram(TagID t1=0, TagID t2=0, TagID t3=0)
00283 : tag1(t1), tag2(t2), tag3(t3)
00284 {};
00285
00287 ~Trigram(void) {};
00288 };
00289
/// Trigram probability table (hashed variant): maps a Trigram key to its probability value.
00292 typedef
00293 hash_map<Trigram,ProbT,
00294 Trigram::HashFcn,
00295 Trigram::EqualFcn>
00296 TrigramProbTable;
00297
00298 # else
00299
00300
/// Trigram probability table (array variant): a flat ProbT array; tagp() reads
/// entry (n_tags*((n_tags*prevtagid2)+prevtagid1))+tagid.
00318 typedef ProbT* TrigramProbTable;
00319 # endif // MOOT_HASH_TRIGRAMS
00320
00321 #endif // MOOT_USE_TRIGRAMS
00322
00323
00324
00327
/// One cell of the Viterbi trellis, as filled in by viterbi_populate_row().
00335 class ViterbiNode {
00336 public:
// Tag ID this node stands for (the current tag).
00337 TagID tagid;
00338 #ifdef MOOT_USE_TRIGRAMS
// Trigram builds: tag ID of the previous-column row this node was derived from.
00339 TagID ptagid;
00340 #else
// Bigram builds: the word probability value (wordpr) used for this node.
00341 ProbT wprob;
00342 #endif
// Score of the best path ending here: best predecessor score + transition value + wordpr.
00343 ProbT lprob;
00344
// Best predecessor node in the previous column; NULL when none was found.
00345 class ViterbiNode *pth_prev;
// Next node in the same row/column list; also links recycled nodes in trash_nodes.
00346 class ViterbiNode *nod_next;
00347 };
00348
00349 #ifdef MOOT_USE_TRIGRAMS
00350
/// Trigram builds: a row groups all nodes of one column that share the same current tag.
00355 class ViterbiRow {
00356 public:
// Current tag ID shared by every node in `nodes`.
00357 TagID tagid;
// Word probability value (wordpr) used when the row was populated.
00358 ProbT wprob;
// Head of this row's node list (linked via ViterbiNode::nod_next).
00359 class ViterbiNode *nodes;
// Next row in the same column; also links recycled rows in trash_rows.
00360 class ViterbiRow *row_next;
00361 };
00362 #else
/// Bigram builds: a "row" is simply a single node.
00363 typedef ViterbiNode ViterbiRow;
00364 #endif
00365
00366
/// One column of the Viterbi trellis (one per processed token).
00373 class ViterbiColumn {
00374 public:
// Head of the column's row list (in bigram builds: its node list).
00375 ViterbiRow *rows;
// Previous column (set from `vtable` on population); also links recycled columns in trash_columns.
00376 ViterbiColumn *col_prev;
// Highest lprob of any node added to this column so far.
00377 ProbT bbestpr;
// Beam threshold: predecessor nodes with lprob below this are skipped when beamwd is nonzero.
00378 ProbT bpprmin;
00379 };
00380
/// Element of a linked list describing one path through the trellis, as built by viterbi_node_path().
00397 struct ViterbiPathNode {
00398 public:
// Trellis node at this position of the path.
00399 ViterbiNode *node;
// Next path element; also links recycled elements in trash_pathnodes.
00400 ViterbiPathNode *path_next;
00401 };
00403
00404
00405 public:
00406
// Verbosity setting. NOTE(review): presumably one of the VerbosityLevel values -- confirm.
00413 int verbose;
00414
// Progress-dot interval: when nonzero, tag_sentence() writes '.' to stderr
// whenever ntokens is a multiple of this value.
00419 size_t ndots;
00420
// Output option. NOTE(review): not referenced in this header; semantics live in the implementation -- confirm.
00424 bool save_ambiguities;
00425
// Output option. NOTE(review): not referenced in this header -- confirm semantics.
00429 bool save_flavors;
00430
// Output option. NOTE(review): not referenced in this header -- confirm semantics.
00434 bool save_mark_unknown;
00435
// When true and MOOT_DEBUG_ENABLED is defined, tag_io() dumps the trellis after each sentence.
00439 bool save_dump_trellis;
00441
00442
00445
// When true, viterbi_step() maps each token's tag set to a ClassID and uses class-based stepping.
00453 bool use_lex_classes;
00454
// Tag ID used by viterbi_finish() to close a sentence and handed to the suffix-trie builder.
00461 TagID start_tagid;
00462
// NOTE(review): not referenced in this header; presumably a threshold for treating tokens as unknown -- confirm.
00471 ProbT unknown_lex_threshhold;
00472
// NOTE(review): not referenced in this header; presumably a threshold for treating classes as unknown -- confirm.
00481 ProbT unknown_class_threshhold;
00482
// Lexical class substituted for tokens with an empty class when use_lex_classes is set;
// filled by unknown_class_name().
00488 LexClass uclass;
00490
00491
// Smoothing coefficients. NOTE(review): not used in this header; presumably set by the
// estimate_*lambdas() methods (n-gram, word and class interpolation weights) -- confirm.
00494 ProbT nglambda1;
00495 ProbT nglambda2;
00496 #ifdef MOOT_USE_TRIGRAMS
00497 ProbT nglambda3;
00498 #endif
00499 ProbT wlambda0;
00500 ProbT wlambda1;
00502 ProbT clambda0;
00503 ProbT clambda1;
// Beam width: a new column's pruning threshold is (best score of previous column - beamwd);
// a value of zero disables the pruning test in viterbi_populate_row().
00510 ProbT beamwd;
00512
00513
// Name <-> ID tables for tokens, tags and lexical classes.
00516 TokIDTable tokids;
00517 TagIDTable tagids;
00518 ClassIDTable classids;
00520
// Fallback token IDs indexed by token flavor (see token2id()).
00521 TokID flavids[NTokFlavors];
00523
00524
// Number of tags; also the row stride of the flat n-gram arrays (see tagp()).
00527 size_t n_tags;
// NOTE(review): presumably the number of known tokens -- not used in this header.
00528 size_t n_toks;
// Number of lexical classes; class2id() raises it when it grows lcprobs.
00529 size_t n_classes;
// Lexical probabilities, indexed by TokID then TagID (see wordp()).
00531 LexProbTable lexprobs;
// Lexical-class probabilities, indexed by ClassID then TagID (see classp()).
00532 LexClassProbTable lcprobs;
00533 #ifdef MOOT_USE_TRIGRAMS
// Trigram transition table (see the three-ID tagp()).
00534 TrigramProbTable ngprobs3;
00535 #else
// Bigram transition table (see the one- and two-ID tagp()).
00536 BigramProbTable ngprobs2;
00537 #endif
00538
// Suffix trie built by build_suffix_trie().
00539 SuffixTrie suftrie;
00541
00542
// Most recent trellis column; each new column records it as col_prev.
00545 ViterbiColumn *vtable;
00547
00548
// Running statistics.
// nsents: incremented once per tag_sentence() call.
00551 size_t nsents;
// ntokens: incremented by viterbi_step(const mootToken&) for each vanilla token.
00552 size_t ntokens;
// NOTE(review): not updated in this header; presumably counts previously unseen tokens -- confirm.
00553 size_t nnewtokens;
// nunclassed: incremented when a token arrives with an empty lexical class.
00554 size_t nunclassed;
// nnewclasses: incremented by class2id() when a class has no known ID.
00555 size_t nnewclasses;
// NOTE(review): not updated in this header; presumably counts unknown tokens -- confirm.
00556 size_t nunknown;
// NOTE(review): not updated in this header; presumably counts uses of _viterbi_step_fallback() -- confirm.
00557 size_t nfallbacks;
00559
00560 protected:
00561
// Free lists of recycled trellis objects; the viterbi_get_*() methods pop from
// these before allocating with new.
00564 ViterbiNode *trash_nodes;
00565 #ifdef MOOT_USE_TRIGRAMS
00566 ViterbiRow *trash_rows;
00567 #endif
00568 ViterbiColumn *trash_columns;
00569 ViterbiPathNode *trash_pathnodes;
00571
00572
// Scratch variables shared by the Viterbi methods.
// NOTE(review): vtagid and vwordpr are not used in this header -- presumably used by the implementation file.
00575 TagID vtagid;
// Best score found so far in the current search.
00576 ProbT vbestpr;
// Candidate score currently being evaluated.
00577 ProbT vtagpr;
00578 ProbT vwordpr;
// Node that achieved vbestpr, or NULL.
00579 ViterbiNode *vbestpn;
// Head of the most recently built path list (see viterbi_node_path()).
00581 ViterbiPathNode *vbestpath;
00583
00584
00586
00587 public:
00588
/// Default constructor (defined in the implementation file).
/// NOTE(review): the class holds raw pointers and has a user-defined destructor, but no
/// copy constructor or assignment operator is declared in this header -- copying an
/// instance looks unsafe; confirm.
00592 mootHMM(void);
00593
/// Destructor: releases everything via clear(true,false).
00595 ~mootHMM(void) { clear(true,false); };
00597
00598
/// Resets the object (defined elsewhere).
/// NOTE(review): the parameter names suggest wipe_everything also drops model data and
/// unlogify converts stored log-values back -- confirm against the implementation.
00606 void clear(bool wipe_everything=true, bool unlogify=false);
00608
00609
/// Saves the model to the named file. NOTE(review): presumably a compressed binary format -- confirm.
/// @return success flag.
00613 bool save(const char *filename, int compression_level=MOOT_DEFAULT_COMPRESSION);
00614
/// Saves the model to an output stream; `filename` is optional. NOTE(review): presumably used only for messages -- confirm.
00616 bool save(mootio::mostream *obs, const char *filename=NULL);
00617
/// Low-level binary dump. NOTE(review): presumably called by save() -- confirm.
00619 bool _bindump(mootio::mostream *obs, const char *filename=NULL);
00620
/// Loads a model from the named file.
/// @return success flag.
00622 bool load(const char *filename=NULL);
00623
/// Loads a model from an input stream; `filename` is optional.
00625 bool load(mootio::mistream *ibs, const char *filename=NULL);
00626
/// Low-level binary load. NOTE(review): presumably called by load() -- confirm.
00628 bool _binload(mootio::mistream *ibs, const char *filename=NULL);
00630
00631
00635 inline void unknown_token_name(const mootTokString &name)
00636 {
00637 tokids.unknown_name(name);
00638 };
00639
00641 inline void unknown_tag_name(const mootTokString &name)
00642 {
00643 tagids.unknown_name(name);
00644 };
00645
00646
00647
00648
00649
00650 inline void unknown_class_name(const mootTagSet &tagset)
00651 {
00652 tagset2lexclass(tagset,&uclass,false);
00653 };
00655
00656
00657
/// Loads a model by name (defined elsewhere). The do_* flags select which
/// post-processing steps run.
/// NOTE(review): presumably they map to estimate_lambdas(), estimate_wlambdas(),
/// estimate_clambdas(), build_suffix_trie() and compute_logprobs() -- confirm.
/// @param start_tag_str  name of the boundary tag (default "__$")
/// @param myname         NOTE(review): presumably the caller name used in diagnostics -- confirm
/// @return success flag.
00673 bool load_model(const string &modelname,
00674 const mootTagString &start_tag_str="__$",
00675 const char *myname="mootHMM::load_model()",
00676 bool do_estimate_nglambdas=true,
00677 bool do_estimate_wlambdas=true,
00678 bool do_estimate_clambdas=true,
00679 bool do_build_suffix_trie=true,
00680 bool do_compute_logprobs=true);
00681
/// Compiles raw lexical, n-gram and class frequency data into this model (defined elsewhere).
/// @return success flag.
00687 bool compile(const mootLexfreqs &lexfreqs,
00688 const mootNgrams &ngrams,
00689 const mootClassfreqs &classfreqs,
00690 const mootTagString &start_tag_str="__$");
00691
/// NOTE(review): presumably assigns token/tag IDs from lexical frequencies -- confirm.
00693 void assign_ids_lf(const mootLexfreqs &lexfreqs);
00694
/// NOTE(review): presumably assigns tag IDs from n-gram data -- confirm.
00696 void assign_ids_ng(const mootNgrams &ngrams);
00697
/// NOTE(review): presumably assigns class (and tag) IDs from class frequencies -- confirm.
00699 void assign_ids_cf(const mootClassfreqs &classfreqs);
00700
/// NOTE(review): presumably builds the entry for the unknown lexical class -- confirm.
00702 void compile_unknown_lexclass(const mootClassfreqs &classfreqs);
00703
/// NOTE(review): presumably estimates the nglambda* coefficients -- confirm.
00705 bool estimate_lambdas(const mootNgrams &ngrams);
00706
/// NOTE(review): presumably estimates wlambda0/wlambda1 -- confirm.
00708 bool estimate_wlambdas(const mootLexfreqs &lf);
00709
/// NOTE(review): presumably estimates clambda0/clambda1 -- confirm.
00711 bool estimate_clambdas(const mootClassfreqs &cf);
00712
00714 bool build_suffix_trie(const mootLexfreqs &lf,
00715 const mootNgrams &ng,
00716 bool verbose=false)
00717 { return suftrie.build(lf,ng,tagids,start_tagid,verbose); };
00718
/// NOTE(review): presumably converts the stored probabilities to log-probabilities -- confirm (defined elsewhere).
/// @return success flag.
00720 bool compute_logprobs(void);
00722
00723
00724
00727
00729 void tag_io(TokenReader *reader, TokenWriter *writer)
00730 {
00731 int rtok;
00732 mootSentence *sent;
00733 while (reader && (rtok = reader->get_sentence()) != TokTypeEOF) {
00734 sent = reader->sentence();
00735 if (!sent) continue;
00736 tag_sentence(*sent);
00737
00738 #ifdef MOOT_DEBUG_ENABLED
00739 if (save_dump_trellis) viterbi_txtdump(writer, sent->size()+1);
00740 #endif
00741
00742 if (writer) writer->put_sentence(*sent);
00743 }
00744 };
00745
00747
00748
00754 inline void tag_sentence(mootSentence &sentence) {
00755 viterbi_clear();
00756 for (mootSentence::const_iterator si = sentence.begin();
00757 si != sentence.end();
00758 si++)
00759 {
00760 viterbi_step(*si);
00761 if (ndots && (ntokens % ndots)==0) fputc('.', stderr);
00762 }
00763 viterbi_finish();
00764 tag_mark_best(sentence);
00765 nsents++;
00766 };
00768
00769
00770
00771
00774
00775
00776
/// Resets the Viterbi state (defined elsewhere); tag_sentence() calls it before the first token.
00778 void viterbi_clear(void);
00779
00780
00781
00786 inline void viterbi_step(const mootToken &token) {
00787 if (token.toktype() != TokTypeVanilla) return;
00788 ntokens++;
00789 LexClass tok_class;
00790 for (mootToken::Analyses::const_iterator ani = token.analyses().begin();
00791 ani != token.analyses().end();
00792 ani++)
00793 {
00794 tok_class.insert(tagids.name2id(ani->tag));
00795 }
00796 viterbi_step(token2id(token.text()), tok_class, token.text());
00797 };
00798
00799
00800
00806 inline void viterbi_step(TokID tokid,
00807 const LexClass &lexclass,
00808 const mootTokString &toktext="")
00809 {
00810 if (use_lex_classes) {
00811 if (lexclass.empty()) {
00812 nunclassed++;
00813 viterbi_step(tokid, 0, uclass, toktext);
00814 } else {
00815
00816 ClassID classid = class2id(lexclass,0,1);
00817 viterbi_step(tokid, classid, lexclass, toktext);
00818 }
00819 } else {
00820
00821 if (lexclass.empty()) {
00822 nunclassed++;
00823 viterbi_step(tokid, toktext);
00824 } else {
00825 viterbi_step(tokid, 0, lexclass, toktext);
00826 }
00827 }
00828 };
00829
00830
00831
/// Viterbi step for a token ID with an explicit class ID and class contents (defined elsewhere).
/// The inline (TokID, LexClass, text) overload dispatches here.
00836 void viterbi_step(TokID tokid,
00837 ClassID classid,
00838 const LexClass &lclass,
00839 const mootTokString &toktext="");
00840
00841
00842
/// Viterbi step for a token ID without any lexical class (defined elsewhere).
/// Used when a token has an empty class and use_lex_classes is off.
00849 void viterbi_step(TokID tokid, const mootTokString &toktext="");
00850
00851
00852
00859 inline void viterbi_step(const mootTokString &token_text) {
00860 return viterbi_step(token2id(token_text), token_text);
00861 };
00862
00863
00864
00870 inline void viterbi_step(const mootTokString &token_text, const set<mootTagString> &tags)
00871 {
00872 LexClass lclass;
00873 tagset2lexclass(tags,&lclass);
00874 viterbi_step(token2id(token_text), lclass, token_text);
00875 };
00876
00877
00878
/// Viterbi step for a single (token ID, tag ID) pair (defined elsewhere).
/// viterbi_finish() calls it with token ID 0 to close the trellis.
/// NOTE(review): `col` presumably selects the column to extend, with NULL meaning a new one -- confirm.
00882 void viterbi_step(TokID tokid, TagID tagid, ViterbiColumn *col=NULL);
00883
00884
00885
00891 inline void viterbi_step(const mootTokString &toktext, const mootTagString &tag)
00892 {
00893 return viterbi_step(token2id(toktext), tagids.name2id(tag));
00894 };
00895
00896
00897
00898
00902 inline void viterbi_finish(const TagID final_tagid)
00903 {
00904 viterbi_step(0, final_tagid);
00905 };
00906
00910 inline void viterbi_finish(void)
00911 {
00912 viterbi_step(0, start_tagid);
00913 };
00914
/// Writes the best path's tags back into `sentence` (defined elsewhere);
/// tag_sentence() calls it right after viterbi_finish().
00925 void tag_mark_best(mootSentence &sentence);
00927
00928
00929
00930
00933
00935 inline ViterbiPathNode *viterbi_best_path(void)
00936 {
00937 return viterbi_node_path(viterbi_best_node());
00938 };
00939
00941 inline ViterbiPathNode *viterbi_best_path(TagID tagid)
00942 {
00943 return viterbi_node_path(viterbi_best_node(tagid));
00944 };
00945
00947 inline ViterbiPathNode *viterbi_best_path(const mootTagString &tagstr)
00948 {
00949 return viterbi_best_path(tagids.name2id(tagstr));
00950 };
00951
00958 inline ViterbiNode *viterbi_best_node(void)
00959 {
00960 ViterbiNode *pnod;
00961 vbestpr = MOOT_PROB_NEG;
00962 vbestpn = NULL;
00963
00964 #ifdef MOOT_USE_TRIGRAMS
00965 ViterbiRow *prow;
00966 for (prow = vtable->rows; prow != NULL; prow = prow->row_next) {
00967 for (pnod = prow->nodes; pnod != NULL; pnod = pnod->nod_next) {
00968 if (pnod->lprob > vbestpr) {
00969 vbestpr = pnod->lprob;
00970 vbestpn = pnod;
00971 }
00972 }
00973 }
00974 #else // !MOOT_USE_TRIGRAMS
00975 for (pnod = vtable->rows; pnod != NULL; pnod = pnod->nod_next) {
00976 if (pnod->lprob > vbestpr) {
00977 vbestpr = pnod->lprob;
00978 vbestpn = pnod;
00979 }
00980 }
00981 #endif // MOOT_USE_TRIGRAMS
00982 return vbestpn;
00983 };
00984
00991 inline ViterbiNode *viterbi_best_node(TagID tagid)
00992 {
00993 ViterbiNode *pnod;
00994 vbestpr = MOOT_PROB_NEG;
00995 #ifdef MOOT_USE_TRIGRAMS
00996 ViterbiRow *prow;
00997 vbestpn = NULL;
00998 for (prow = vtable->rows; prow != NULL; prow = prow->row_next) {
00999 if (prow->tagid == tagid) {
01000 for (pnod = prow->nodes; pnod != NULL; pnod = pnod->nod_next) {
01001 if (pnod->lprob > vbestpr) {
01002 vbestpr = pnod->lprob;
01003 vbestpn = pnod;
01004 }
01005 }
01006 return vbestpn;
01007 }
01008 }
01009 #else // !MOOT_USE_TRIGRAMS
01010 for (pnod = vtable->rows; pnod != NULL; pnod = pnod->nod_next) {
01011 if (pnod->tagid == tagid) return pnod;
01012 }
01013 #endif // MOOT_USE_TRIGRAMS
01014 return NULL;
01015 };
01016
01017
01018
01026 inline ViterbiPathNode *viterbi_node_path(ViterbiNode *node)
01027 {
01028 viterbi_clear_bestpath();
01029 ViterbiPathNode *pnod;
01030 for ( ; node != NULL; node = node->pth_prev) {
01031 pnod = viterbi_get_pathnode();
01032 pnod->node = node;
01033 pnod->path_next = vbestpath;
01034 vbestpath = pnod;
01035 }
01036 return vbestpath;
01037 };
01039
01040
01041
01042
01044
01045
01047 inline bool viterbi_column_ok(const ViterbiColumn *col) const {
01048 return (col
01049 && col->rows
01050 #ifdef MOOT_USE_TRIGRAMS
01051 && col->rows->nodes
01052 #endif
01053 );
01054 };
01055
01065 inline ViterbiColumn *viterbi_populate_row(TagID curtagid,
01066 ProbT wordpr=MOOT_PROB_ONE,
01067 ViterbiColumn *col=NULL,
01068 ProbT probmin=MOOT_PROB_NONE)
01069 {
01070 #ifdef MOOT_USE_TRIGRAMS
01071 ViterbiRow *prow, *row = viterbi_get_row();
01072 ViterbiNode *pnod, *nod = NULL;
01073
01074 if (!col) {
01075 col = viterbi_get_column();
01076 col->rows = NULL;
01077 col->bbestpr = MOOT_PROB_NEG;
01078 if (vtable) col->bpprmin = vtable->bbestpr - beamwd;
01079 else col->bpprmin = MOOT_PROB_NEG;
01080 }
01081 if (probmin != MOOT_PROB_NONE) col->bpprmin = probmin;
01082 col->col_prev = vtable;
01083 row->nodes = NULL;
01084 row->wprob = wordpr;
01085
01086 for (prow = vtable->rows; prow != NULL; prow = prow->row_next) {
01087 vbestpr = MOOT_PROB_NEG;
01088 vbestpn = NULL;
01089
01090 for (pnod = prow->nodes; pnod != NULL; pnod = pnod->nod_next) {
01091
01092 if (beamwd && pnod->lprob < col->bpprmin) continue;
01093
01094
01095 vtagpr = pnod->lprob + tagp(pnod->ptagid, prow->tagid, curtagid);
01096 if (vtagpr > vbestpr
01097 # ifdef MOOT_LEX_IS_TIEBREAKER
01098 || (vtagpr == vbestpr && wordpr > prow->wprob)
01099 # endif
01100 )
01101 {
01102 vbestpr = vtagpr;
01103 vbestpn = pnod;
01104 }
01105 }
01106
01107
01108 if (vbestpn != NULL) {
01109 nod = viterbi_get_node();
01110 nod->tagid = curtagid;
01111 nod->ptagid = prow->tagid;
01112 nod->lprob = vbestpr + wordpr;
01113 nod->pth_prev = vbestpn;
01114 nod->nod_next = row->nodes;
01115
01116 row->nodes = nod;
01117
01118
01119 if (nod->lprob > col->bbestpr) col->bbestpr = nod->lprob;
01120 }
01121 }
01122
01123
01124 row->tagid = curtagid;
01125 row->row_next = col->rows;
01126 col->rows = row;
01127
01128 #else
01129
01130 ViterbiNode *pnod, *nod = NULL;
01131
01132 if (!col) {
01133 col = viterbi_get_column();
01134 col->rows = NULL;
01135 col->bbestpr = MOOT_PROB_NEG;
01136 if (vtable) col->bpprmin = vtable->bbestpr - beamwd;
01137 else col->bpprmin = MOOT_PROB_NEG;
01138 }
01139 if (probmin != MOOT_PROB_NONE) col->bpprmin = probmin;
01140 col->col_prev = vtable;
01141
01142 vbestpr = MOOT_PROB_NEG;
01143 vbestpn = NULL;
01144
01145 for (pnod = vtable->rows; pnod != NULL; pnod = pnod->nod_next) {
01146
01147 if (beamwd && pnod->lprob < col->bpprmin) continue;
01148
01149
01150 vtagpr = pnod->lprob + tagp(pnod->tagid, curtagid);
01151 if (vtagpr > vbestpr
01152 # ifdef MOOT_LEX_IS_TIEBREAKER
01153 || (vtagpr == vbestpr && wordpr > pnod->wprob)
01154 # endif
01155 )
01156 {
01157 vbestpr = vtagpr;
01158 vbestpn = pnod;
01159 }
01160 }
01161
01162
01163 nod = viterbi_get_node();
01164 nod->tagid = curtagid;
01165 nod->wprob = wordpr;
01166 nod->lprob = vbestpr + wordpr;
01167 nod->pth_prev = vbestpn;
01168 nod->nod_next = col->rows;
01169
01170
01171 nod->nod_next = col->rows;
01172 col->rows = nod;
01173
01174
01175 if (nod->lprob > col->bbestpr) col->bbestpr = nod->lprob;
01176
01177 #endif // MOOT_USE_TRIGRAMS
01178
01179 return col;
01180 };
01181
01182
01183
01184
01186 inline void viterbi_clear_bestpath(void)
01187 {
01188
01189 ViterbiPathNode *pnod, *pnod_next;
01190 for (pnod = vbestpath; pnod != NULL; pnod = pnod_next) {
01191 pnod_next = pnod->path_next;
01192 pnod->path_next = trash_pathnodes;
01193 trash_pathnodes = pnod;
01194 }
01195 vbestpath = NULL;
01196 };
01197
01198
01199
/// Fallback Viterbi step (defined elsewhere).
/// NOTE(review): presumably invoked when a regular step yields no usable column entries -- confirm.
01205 void _viterbi_step_fallback(TokID tokid, ViterbiColumn *col);
01207
01208
01209
01214 inline ViterbiNode *viterbi_get_node(void) {
01215 ViterbiNode *nod;
01216 if (trash_nodes != NULL) {
01217 nod = trash_nodes;
01218 trash_nodes = nod->nod_next;
01219 } else {
01220 nod = new ViterbiNode();
01221 }
01222 return nod;
01223 };
01224
01225
01226
01228 inline ViterbiRow *viterbi_get_row(void) {
01229 #ifdef MOOT_USE_TRIGRAMS
01230 ViterbiRow *row;
01231 if (trash_rows != NULL) {
01232 row = trash_rows;
01233 trash_rows = row->row_next;
01234 } else {
01235 row = new ViterbiRow();
01236 }
01237 return row;
01238 #else
01239 return viterbi_get_node();
01240 #endif //MOOT_USE_TRIGRAMS
01241 };
01242
01243
01244
01246 inline ViterbiColumn *viterbi_get_column(void) {
01247 ViterbiColumn *col;
01248 if (trash_columns != NULL) {
01249 col = trash_columns;
01250 trash_columns = col->col_prev;
01251 } else {
01252 col = new ViterbiColumn();
01253 }
01254 return col;
01255 };
01256
01257
01258
01260 inline ViterbiPathNode *viterbi_get_pathnode(void) {
01261 ViterbiPathNode *pnod;
01262 if (trash_pathnodes != NULL) {
01263 pnod = trash_pathnodes;
01264 trash_pathnodes = pnod->path_next;
01265 } else {
01266 pnod = new ViterbiPathNode();
01267 }
01268 return pnod;
01269 };
01271
01272
01273
01274
01275
01279 inline TokID token2id(const mootTokString &token) const
01280 {
01281 #ifdef MOOT_LEX_NONALPHA
01282 TokID tokid = tokids.name2id(token);
01283 return tokid ? tokid : flavids[tokenFlavor(token)];
01284 #else
01285 mootTokenFlavor flav = tokenFlavor(token);
01286 return flavids[flav]==0 ? tokids.name2id(token) : flavids[flav];
01287 #endif
01288 };
01289
01300 inline LexClass *tagset2lexclass(const mootTagSet &tagset,
01301 LexClass *lclass=NULL,
01302 bool add_tagids=false)
01303 {
01304 if (!lclass) lclass = new LexClass();
01305
01306 for (mootTagSet::const_iterator tsi = tagset.begin();
01307 tsi != tagset.end();
01308 tsi++)
01309 {
01310
01311 TagID tagid = tagids.name2id(*tsi);
01312 if (add_tagids && tagid==0) tagid = tagids.insert(*tsi);
01313
01314
01315 lclass->insert(tagid);
01316 }
01317 return lclass;
01318 };
01319
01320
01326 inline ClassID class2id(const LexClass &lclass,
01327 bool autopopulate=true,
01328 bool autocreate=true)
01329 {
01330 ClassID cid = classids.name2id(lclass);
01331 if (cid == 0) {
01332 nnewclasses++;
01333 if (!autopopulate && !autocreate) return cid;
01334
01335
01336 cid = classids.insert(lclass);
01337 if (cid >= lcprobs.size()) {
01338 n_classes = cid+1;
01339
01340
01341
01342 lcprobs.resize(n_classes);
01343 }
01344 if (autopopulate) {
01345 LexClassProbSubTable &lcps = lcprobs[cid];
01346 if (!lclass.empty()) {
01347
01348 ProbT lcprob = log(1.0/((ProbT)lclass.size()));
01349
01350 for (LexClass::const_iterator lci = lclass.begin(); lci != lclass.end(); lci++) {
01351 lcps[*lci] = lcprob;
01352 }
01353 } else {
01354
01355 const LexProbSubTable &lps = lexprobs[0];
01356 ProbT lpprob = log(1.0/((ProbT)lps.size()));
01357
01358 for (LexProbSubTable::const_iterator lpsi = lps.begin(); lpsi != lps.end(); lpsi++) {
01359 lcps[lpsi->key()] = lpprob;
01360 }
01361 }
01362 }
01363 }
01364 return cid;
01365 };
01367
01368
01369
01372
01373
01374
01375
01377
01378
01379
01380
01381
01382
01383
01384
01385
01386
01387
01388
01389
01390
01391
01392
01393
01394
01395
01396
01397
01398
01399
01400
01401
01402
01403
01404
01405
01406
01407
01408
01409
01410
01411
01412
01413
01418 inline const ProbT wordp(const TokID tokid, const TagID tagid) const
01419 {
01420 if (tokid >= lexprobs.size()) return MOOT_PROB_ZERO;
01421 const LexProbSubTable &lps = lexprobs[tokid];
01422 LexProbSubTable::const_iterator lpsi = lps.find(tagid);
01423 return lpsi != lps.end() ? lpsi->value() : MOOT_PROB_ZERO;
01424 };
01425
01432 inline const ProbT wordp(const mootTokString token, const mootTagString tag) const
01433 {
01434 return wordp(token2id(token), tagids.name2id(tag));
01435 };
01436
01437
01438
01439
01443 inline const ProbT classp(const ClassID classid, const TagID tagid) const
01444 {
01445 if (classid >= lcprobs.size()) return MOOT_PROB_ZERO;
01446 const LexClassProbSubTable &lps = lcprobs[classid];
01447 LexClassProbSubTable::const_iterator lpsi = lps.find(tagid);
01448 return lpsi != lps.end() ? lpsi->value() : MOOT_PROB_ZERO;
01449 };
01450
01457 inline const ProbT classp(const LexClass &lclass, const mootTagString tag) const
01458 {
01459 return classp(classids.name2id(lclass), tagids.name2id(tag));
01460 };
01461
01462
01463
01464
01468 inline const ProbT tagp(const TagID tagid) const
01469 {
01470 return
01471 #ifdef MOOT_USE_TRIGRAMS
01472 tagp(0,0,tagid);
01473 #else
01474 ngprobs2 && tagid < n_tags
01475 ? ngprobs2[tagid]
01476 : MOOT_PROB_ZERO;
01477 #endif // MOOT_USE_TRIGRAMS
01478 };
01479
01485 inline const ProbT tagp(const mootTagString &tag) const
01486 {
01487 return tagp(tagids.name2id(tag));
01488 };
01489
01490
01491
01492
01497 inline const ProbT tagp(const TagID prevtagid, const TagID tagid) const
01498 {
01499 return
01500 #ifdef MOOT_USE_TRIGRAMS
01501 tagp(0,prevtagid,tagid);
01502 #else
01503 ngprobs2 && prevtagid < n_tags && tagid < n_tags
01504 ? ngprobs2[(n_tags*prevtagid)+tagid]
01505 : MOOT_PROB_ZERO;
01506 #endif
01507 };
01508
01514 inline const ProbT tagp(const mootTagString &prevtag, const mootTagString &tag) const
01515 {
01516 return tagp(tagids.name2id(prevtag), tagids.name2id(tag));
01517 };
01518
01519
01520
01521
01522 #ifdef MOOT_USE_TRIGRAMS
01523
01529 #ifdef MOOT_HASH_TRIGRAMS
01530 inline const ProbT tagp(const Trigram &trigram, ProbT ProbZero=MOOT_PROB_ZERO) const
01531 {
01532 TrigramProbTable::const_iterator tgti = ngprobs3.find(trigram);
01533 return tgti != ngprobs3.end() ? tgti->second : ProbZero;
01534 };
01535 #endif //MOOT_HASH_TRIGRAMS
01536
01543 inline const ProbT tagp(const TagID prevtagid2, const TagID prevtagid1, const TagID tagid) const
01544 {
01545 return
01546 #ifdef MOOT_HASH_TRIGRAMS
01547 tagp(Trigram(prevtagid2,prevtagid1,tagid))
01548 #else
01549 ngprobs3 && prevtagid2 < n_tags && prevtagid1 < n_tags && tagid < n_tags
01550 ? ngprobs3[(n_tags*((n_tags*prevtagid2)+prevtagid1))+tagid]
01551 : MOOT_PROB_ZERO;
01552 #endif
01553 ;
01554 };
01555
01563 inline const ProbT tagp(const mootTagString &prevtag2,
01564 const mootTagString &prevtag1,
01565 const mootTagString &tag)
01566 const
01567 {
01568 return tagp(tagids.name2id(prevtag2), tagids.name2id(prevtag1), tagids.name2id(tag));
01569 };
01570 #endif // MOOT_USE_TRIGRAMS
01571
01572
01573
01574
01575
01576
/// printf-style diagnostic helper (defined elsewhere).
/// NOTE(review): `fmt` is a non-const char*, so passing a string literal is ill-formed in
/// C++11 and later; changing it to const char* needs a matching change in the definition.
01580 void carp(char *fmt, ...);
01582
01583
01584
01585
/// Dumps the model to `file`. NOTE(review): presumably as human-readable text, going by the name -- confirm.
01589 void txtdump(FILE *file);
01590
/// Dumps the Viterbi trellis through `w`; tag_io() calls it in debug builds with
/// ncols = sentence size + 1.
/// NOTE(review): tag_io() may pass a NULL writer -- confirm the implementation tolerates it.
01592 void viterbi_txtdump(TokenWriter *w, int ncols=0);
01593
/// Dumps a single trellis column through `w`. NOTE(review): `colnum` is presumably just a label for the output -- confirm.
01595 void viterbi_txtdump_col(TokenWriter *w, ViterbiColumn *col, int colnum=0);
01597 };
01598
01599 moot_END_NAMESPACE
01600
01601 #endif