mootHMM.h
Go to the documentation of this file.
1 /* -*- Mode: C++ -*- */
2 /*
3  libmoot : moocow's part-of-speech tagging library
4  Copyright (C) 2003-2014 by Bryan Jurish <moocow@cpan.org>
5 
6  This library is free software; you can redistribute it and/or
7  modify it under the terms of the GNU Lesser General Public
8  License as published by the Free Software Foundation; either
9  version 3 of the License, or (at your option) any later version.
10 
11  This library is distributed in the hope that it will be useful,
12  but WITHOUT ANY WARRANTY; without even the implied warranty of
13  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14  Lesser General Public License for more details.
15 
16  You should have received a copy of the GNU Lesser General Public
17  License along with this library; if not, write to the Free Software
18  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
19 */
20 
21 /*--------------------------------------------------------------------------
22  * File: mootHMM.h
23  * Author: Bryan Jurish <moocow@cpan.org>
24  * Description:
25  * + moot PoS tagger : Hidden Markov Model (Disambiguator): headers
26  *--------------------------------------------------------------------------*/
27 
33 #ifndef _MOOT_HMM_H
34 #define _MOOT_HMM_H
35 
36 #include <math.h>
37 
38 #include <mootFlavor.h>
39 #include <mootTokenIO.h>
40 #include <mootZIO.h>
41 #include <mootBinHeader.h>
42 #include <mootUtils.h>
43 
44 #include <mootClassfreqs.h>
45 //#include <mootLexfreqs.h> //-- included by mootClassfreqs.h
46 
47 #include <mootSuffixTrie.h>
48 /*
49 #ifdef MOOT_ENABLE_SUFFIX_TRIE
50 # include <mootSuffixTrie.h>
51 #else
52 # include <mootEnum.h>
53 # include <mootAssocVector.h>
54 # include <mootTrieVector.h>
55 # include <mootNgrams.h>
56 #endif //--MOOT_ENABLE_SUFFIX_TRIE
57 */
58 
88 #define MOOT_LEX_UNKNOWN_TOKENS
89 //#undef MOOT_LEX_UNKNOWN_TOKENS
90 
97 #define MOOT_LEX_UNKNOWN_CLASSES
98 //#undef MOOT_LEX_UNKNOWN_CLASSES
99 
106 #define MOOT_LEX_NONALPHA
107 //#undef MOOT_LEX_NONALPHA
108 
113 #undef MOOT_LEX_IS_TIEBREAKER
114 
115 moot_BEGIN_NAMESPACE
116 
117 /*--------------------------------------------------------------------------
118  * mootHMM : HMM class
119  *--------------------------------------------------------------------------*/
127 class mootHMM {
128 public:
129  /*---------------------------------------------------------------------*/
132 
/** ID type used for tags; an alias of mootEnumID. */
134  typedef mootEnumID TagID;
135 
/** ID type used for tokens; an alias of mootEnumID. */
137  typedef mootEnumID TokID;
138 
/** ID type used for token flavors; an alias of mootEnumID. */
140  typedef mootEnumID FlavorID;
141 
/** ID type used for lexical classes; an alias of mootEnumID. */
146  typedef mootEnumID ClassID;
147 
151 
152  /*------------------------------------------------------------
153  * public typedefs : lexical classes
154  */
156 
157 
/** A lexical class, represented as a set of tag IDs. */
162  typedef set<TagID> LexClass;
163 
165  struct LexClassHash {
166  public:
167  inline size_t operator()(const LexClass &x) const {
168  size_t hv = 0;
169  for (LexClass::const_iterator xi = x.begin(); xi != x.end(); ++xi) {
170  hv = 5*hv + *xi;
171  }
172  return hv;
173  };
174  };
175 
177  struct LexClassEqual {
178  public:
179  inline size_t operator()(const LexClass &x, const LexClass &y) const {
180  return x==y;
181  };
182  };
184 
185  /*---------------------------------------------------------------------*/
188 
/** Enumeration table keyed by tag strings (hashed with moot_hash, compared with equal_to). */
190  typedef mootEnum<mootTagString,
191  moot_hash<mootTagString>,
192  equal_to<mootTagString> >
193  TagIDTable;
194 
/** Enumeration table keyed by token strings (hashed with moot_hash, compared with equal_to). */
196  typedef mootEnum<mootTokString,
197  moot_hash<mootTokString>,
198  equal_to<mootTokString> >
199  TokIDTable;
200 
202  typedef mootEnum<LexClass,
203  LexClassHash,
205  ClassIDTable;
206 
208  //typedef map<TagID,ProbT> LexProbSubTable;
210 
215  typedef LexProbSubTable LexClassProbSubTable;
216 
220  typedef vector<LexProbSubTable> LexProbTable;
221 
236  typedef LexProbTable LexClassProbTable;
237 
246 
248  class Trigram {
249  public:
250  TagID tag1;
251  TagID tag2;
252  TagID tag3;
253  //
254  public:
256  struct HashFcn {
257  public:
258  inline size_t operator()(const Trigram &x) const {
259  return (0xdeece66d * ((0xdeece66d * x.tag1) + x.tag2)) + x.tag3;
260  };
261  };
262  //
264  struct EqualFcn {
265  public:
266  inline size_t operator()(const Trigram &x, const Trigram &y) const {
267  return
268  x.tag1==y.tag1 && x.tag2==y.tag2 && x.tag3==y.tag3;
269  };
270  };
271  //
272  public:
274  Trigram(TagID t1=0, TagID t2=0, TagID t3=0)
275  : tag1(t1), tag2(t2), tag3(t3)
276  {};
277  //
279  ~Trigram(void) {};
280  };
281 
285  typedef
286  hash_map<Trigram,ProbT,
290 
/** Dense lookup table for uni-, bi- and trigram probabilities (raw ProbT array). */
309  typedef ProbT* TrigramProbArray;
310 
/** Generic n-gram key type: a Trigram. */
311  typedef Trigram NgramProbKey;
/** Generic n-gram probabilities: trigrams, hashed. */
312  typedef TrigramProbHash NgramProbHash;
/** Generic n-gram probabilities: trigrams, dense. */
313  typedef TrigramProbArray NgramProbArray;
314 
315 
316  /*---------------------------------------------------------------------*/
319 
/**
 * Viterbi trellis entry ("pillar") node.
 * Nodes are linked through nod_next; viterbi_get_node() reuses that
 * link to chain recycled nodes.
 */
327  class ViterbiNode {
328  public:
329  TagID tagid;   /**< NOTE(review): presumably the current tag of this node -- confirm */
330  TagID ptagid;  /**< NOTE(review): presumably the previous tag -- confirm */
331  ProbT lprob;   /**< NOTE(review): presumably the (log-)probability of the path ending here -- confirm */
332 
333  class ViterbiNode *pth_prev;  /**< NOTE(review): presumably the previous node on the path -- confirm */
334  class ViterbiNode *nod_next;  /**< next previous-tag node in the current pillar */
335  };
336 
/** Viterbi trellis row ("current tag") node. */
342  class ViterbiRow {
343  public:
344  TagID tagid;  /**< NOTE(review): presumably the current tag of this row -- confirm */
345  ProbT wprob;  /**< NOTE(review): presumably the word probability for this row's tag -- confirm */
346  class ViterbiNode *nodes;  /**< trellis "pillar" node(s) for this row */
347  class ViterbiRow *row_next;  /**< next row */
348  };
/** Viterbi trellis column. */
356  class ViterbiColumn {
357  public:
358  ViterbiRow *rows;  /**< column rows */
359  ViterbiColumn *col_prev;  /**< previous column */
360  ProbT bbestpr;  /**< NOTE(review): presumably the best current (log-)probability for beam pruning -- confirm */
361  ProbT bpprmin;  /**< NOTE(review): presumably the minimum previous probability for beam pruning -- confirm */
362  };
363 
/**
 * Singly linked list cell wrapping a ViterbiNode.
 * Returned by the viterbi_best_path() / viterbi_node_path() family.
 */
380  struct ViterbiPathNode {
381  public:
382  ViterbiNode *node;  /**< wrapped trellis node */
383  ViterbiPathNode *path_next;  /**< next cell in the list */
384  };
386 
387 
388 public:
389  /*---------------------------------------------------------------------*/
/** Verbosity level; the constructor sets it to 1. */
396  int verbose;
397 
/** Counter; the constructor sets it to 0. NOTE(review): presumably counts progress dots -- confirm. */
402  size_t ndots;
403 
/** Output flag; constructor default is false. */
407  bool save_ambiguities;
408 
/** Output flag; constructor default is false. */
412  bool save_flavors;
413 
/** Output flag; constructor default is false. */
417  bool save_mark_unknown;
419 
420  /*---------------------------------------------------------------------*/
/** If true, n-gram probabilities live in ngprobsh; otherwise in the dense array ngprobsa. Default false. */
430  bool hash_ngrams;
431 
/** Constructor default is true. Not read by any code visible in this header. */
437  bool relax;
438 
/** If true, viterbi_step() resolves lexical classes to ClassIDs via class2id(). Default true. */
446  bool use_lex_classes;
447 
/** If true, token2id() may answer with taster.flavor_id(). Default true. */
451  bool use_flavors;
/** Tag ID used by the argument-less viterbi_finish(). Default 0. */
459  TagID start_tagid;
460 
/** Constructor default is 1.0. */
469  ProbT unknown_lex_threshhold;
470 
/** Constructor default is 1.0. */
479  ProbT unknown_class_threshhold;
480 
/** Lexical class substituted for tokens with an empty class when use_lex_classes is true. */
486  LexClass uclass;
489  /*---------------------------------------------------------------------*/
//-- Smoothing constants. NOTE(review): the names suggest n-gram (ng*), word (w*) and class (c*) interpolation weights -- confirm.
492  ProbT nglambda1;
493  ProbT nglambda2;
494  ProbT nglambda3;
495  ProbT wlambda0;
496  ProbT wlambda1;
498  ProbT clambda0;
499  ProbT clambda1;
/** Constructor default is 1000. NOTE(review): presumably the beam-search width -- confirm. */
506  ProbT beamwd;
509  /*---------------------------------------------------------------------*/
//-- ID lookup tables
512  TokIDTable tokids;      //-- token string <-> TokID
513  TagIDTable tagids;      //-- tag string <-> TagID
514  ClassIDTable classids;  //-- LexClass <-> ClassID
515  mootTaster taster;      //-- supplies flavor_id() for token2id()
517 
518  /*---------------------------------------------------------------------*/
//-- Sizes; n_tags is also the stride of the dense n-gram array (see tagp(), set_ngram_prob()).
521  size_t n_tags;
522  size_t n_toks;
523  size_t n_classes;
//-- Probability lookup tables
525  LexProbTable lexprobs;      //-- indexed by TokID in wordp()
526  LexClassProbTable lcprobs;  //-- indexed by ClassID in classp()
528  NgramProbHash ngprobsh;     //-- used when hash_ngrams is true
529  NgramProbArray ngprobsa;    //-- used when hash_ngrams is false; NULL after construction
531 #ifdef MOOT_ENABLE_SUFFIX_TRIE
532  SuffixTrie suftrie;         //-- built by build_suffix_trie()
533 #endif
534 
535 
536  /*---------------------------------------------------------------------*/
/** Trellis state pointer; NULL after construction. */
539  ViterbiColumn *vtable;
542  /*---------------------------------------------------------------------*/
//-- Counters, all zeroed by the constructor. In this header, ntokens is bumped per
//-- vanilla token and nunclassed per empty lexical class; the rest are maintained elsewhere.
545  size_t nsents;
546  size_t ntokens;
547  size_t nnewtokens;
548  size_t nunclassed;
549  size_t nnewclasses;
550  size_t nunknown;
551  size_t nfallbacks;
554 protected:
555  /*---------------------------------------------------------------------*/
//-- Free lists of recycled trellis objects, consumed by the viterbi_get_*() helpers.
558  ViterbiNode *trash_nodes;          //-- chained via nod_next
559  ViterbiRow *trash_rows;            //-- chained via row_next
560  ViterbiColumn *trash_columns;      //-- chained via col_prev
561  ViterbiPathNode *trash_pathnodes;  //-- chained via path_next
564  /*---------------------------------------------------------------------*/
//-- Scratch members. NOTE(review): presumably temporaries for the out-of-line Viterbi code -- confirm.
567  TagID vtagid;
568  ProbT vbestpr;
569  ProbT vtagpr;
570  ProbT vwordpr;
571  ViterbiNode *vbestpn;
573  ViterbiPathNode *vbestpath;
575  //ProbT bbestpr; /**< Best current (log-)probability for beam pruning */
576  //ProbT bpprmin; /**< Minimum previous probability for beam pruning */
578 
579 public:
580  /*---------------------------------------------------------------------*/
584  mootHMM(void)
585  : verbose(1),
586  ndots(0),
587  save_ambiguities(false),
588  save_flavors(false),
589  save_mark_unknown(false),
590  hash_ngrams(false),
591  relax(true),
592  use_lex_classes(true),
593  use_flavors(true),
594  start_tagid(0),
595  unknown_lex_threshhold(1.0),
596  unknown_class_threshhold(1.0),
597  nglambda1(mootProbEpsilon),
598  nglambda2(1.0 - mootProbEpsilon),
599  wlambda0(mootProbEpsilon),
600  wlambda1(1.0 - mootProbEpsilon),
601  clambda0(mootProbEpsilon),
602  clambda1(1.0 - mootProbEpsilon),
603  beamwd(1000),
604  n_tags(0),
605  n_toks(0),
606  n_classes(0),
607  ngprobsa(NULL),
608  vtable(NULL),
609  nsents(0),
610  ntokens(0),
611  nnewtokens(0),
612  nunclassed(0),
613  nnewclasses(0),
614  nunknown(0),
615  nfallbacks(0),
616  trash_nodes(NULL),
617  trash_rows(NULL),
618  trash_columns(NULL),
619  trash_pathnodes(NULL),
620  vbestpn(NULL),
621  vbestpath(NULL)
622  {
623  //-- create special token entries
624  unknown_token_name("@UNKNOWN");
625  unknown_tag_name("UNKNOWN");
626  uclass = LexClass();
627  };
628 
629 
631  //~mootHMM(void) { clear(true,false); };
632  virtual ~mootHMM(void) { clear(false,false); };
634 
635  /*------------------------------------------------------------*/
/**
 * Clear the object (defined out of line; the destructor calls it with (false,false)).
 * NOTE(review): exact meaning of wipe_everything / unlogify is not visible here -- confirm.
 */
643  void clear(bool wipe_everything=true, bool unlogify=false);
645 
646  /*------------------------------------------------------------*/
/** Save the model to the named file. @return a bool status (presumably true on success -- confirm). */
650  bool save(const char *filename, int compression_level=MOOT_DEFAULT_COMPRESSION);
651 
/** Save the model to an output stream wrapper; filename is optional. */
653  bool save(mootio::mostream *obs, const char *filename=NULL);
654 
/** Low-level binary dump to a stream, given header information for binary HMM model files. */
656  bool _bindump(mootio::mostream *obs, const mootBinIO::HeaderInfo &hdr, const char *filename=NULL);
657 
/** Load a model from the named file. @return a bool status (presumably true on success -- confirm). */
659  bool load(const char *filename=NULL);
660 
/** Load a model from an input stream wrapper; filename is optional. */
662  bool load(mootio::mistream *ibs, const char *filename=NULL);
663 
/** Low-level binary load from a stream, given header information for binary HMM model files. */
665  bool _binload(mootio::mistream *ibs, const mootBinIO::HeaderInfo &hdr, const char *filename=NULL);
667 
668  /*------------------------------------------------------------*/
672  inline void unknown_token_name(const mootTokString &name)
673  {
674  tokids.unknown_name(name);
675  };
676 
678  inline void unknown_tag_name(const mootTokString &name)
679  {
680  tagids.unknown_name(name);
681  };
682 
687  inline void unknown_class_name(const mootTagSet &tagset)
688  {
689  tagset2lexclass(tagset,&uclass,false);
690  };
692 
694  //------------------------------------------------------------
697 
/**
 * Top-level model loader (defined out of line).
 * NOTE(review): judging by the flag names it optionally estimates the smoothing
 * lambdas, builds the suffix trie and computes log-probabilities -- confirm.
 */
716  virtual bool load_model(const string &modelname,
717  const mootTagString &start_tag_str="__$",
718  const char *myname="mootHMM::load_model()",
719  bool do_estimate_nglambdas=true,
720  bool do_estimate_wlambdas=true,
721  bool do_estimate_clambdas=true,
722  bool do_build_suffix_trie=true,
723  bool do_compute_logprobs=true);
724 
/** Compile the model from raw lexical, n-gram and lexical-class frequencies (defined out of line). */
730  virtual bool compile(const mootLexfreqs &lexfreqs,
731  const mootNgrams &ngrams,
732  const mootClassfreqs &classfreqs,
733  const mootTagString &start_tag_str="__$",
734  const mootTaster &mtaster=builtinTaster);
735 
/** ID assignment helper. NOTE(review): "fl" presumably means flavors -- confirm. */
739  void assign_ids_fl(void);
740 
/** ID assignment helper working from lexical frequencies. */
742  void assign_ids_lf(const mootLexfreqs &lexfreqs);
743 
/** ID assignment helper working from n-gram frequencies. */
745  void assign_ids_ng(const mootNgrams &ngrams);
746 
/** ID assignment helper working from lexical-class frequencies. */
748  void assign_ids_cf(const mootClassfreqs &classfreqs);
749 
/** Compile the unknown lexical class from lexical-class frequencies (defined out of line). */
751  void compile_unknown_lexclass(const mootClassfreqs &classfreqs);
752 
/** Estimate smoothing constants from n-gram frequencies. NOTE(review): presumably sets nglambda* -- confirm. */
754  bool estimate_lambdas(const mootNgrams &ngrams);
755 
/** Estimate smoothing constants from lexical frequencies. NOTE(review): presumably sets wlambda* -- confirm. */
757  bool estimate_wlambdas(const mootLexfreqs &lf);
758 
/** Estimate smoothing constants from lexical-class frequencies. NOTE(review): presumably sets clambda* -- confirm. */
760  bool estimate_clambdas(const mootClassfreqs &cf);
761 
763  bool build_suffix_trie(const mootLexfreqs &lf,
764  const mootNgrams &ng,
765  bool verbose=false)
766  {
767 #ifdef MOOT_ENABLE_SUFFIX_TRIE
768  return suftrie.build(lf,ng,tagids,start_tagid,verbose);
769 #else
770  return false;
771 #endif
772  };
773 
/** Defined out of line. NOTE(review): presumably converts the stored probabilities to log-probabilities -- confirm. */
775  bool compute_logprobs(void);
776 
778  inline void set_ngram_prob(ProbT p, TagID t1=0, TagID t2=0, TagID t3=0)
779  {
780  if (hash_ngrams) { // +hash
781  ngprobsh[Trigram(t1,t2,t3)] = p;
782  } else { // -hash
783  ngprobsa[(n_tags*((n_tags*t1)+t2))+t3] = p;
784  }
785  };
787 
788  //------------------------------------------------------------
789  // Tagging: Top-level
792 
/** Top-level: tag a whole sentence (a list of mootToken); defined out of line. */
798  void tag_sentence(mootSentence &sentence);
/** Top-level: tag from a TokenReader to a TokenWriter; defined out of line. */
801  virtual void tag_io(TokenReader *reader, TokenWriter *writer);
802 
/** Top-level streaming variant of tag_io(); defined out of line. */
804  virtual void tag_stream(TokenReader *reader, TokenWriter *writer);
806 
807  /*====================================================================
808  * VITERBI: Mid-level
809  *====================================================================*/
812 
813  //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
/** Defined out of line. NOTE(review): presumably resets the Viterbi trellis state -- confirm. */
815  void viterbi_clear(void);
816 
817  //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
818  // Viterbi: single iteration: (mootToken)
823  inline void viterbi_step(const mootToken &token) {
824  if (token.toktype() != TokTypeVanilla) return; //-- ignore non-vanilla tokens
825  ++ntokens;
826  LexClass tok_class;
827  token2lexclass(token, tok_class);
828  viterbi_step(token2id(token.text()), tok_class, token.text());
829  };
830 
831  //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
832  // Viterbi: single iteration: (TokID,LexClass=set<ClassID>)
838  inline void viterbi_step(TokID tokid,
839  const LexClass &lexclass,
840  const mootTokString &toktext="")
841  {
842  if (use_lex_classes) {
843  if (lexclass.empty()) {
844  ++nunclassed;
845  viterbi_step(tokid, 0, uclass, toktext);
846  } else {
847  //-- non-empty class : get ID (assign empty distribution if unknown)
848  ClassID classid = class2id(lexclass,0,1);
849  viterbi_step(tokid, classid, lexclass, toktext);
850  }
851  } else {
852  //-- !use_lex_classes
853  if (lexclass.empty()) {
854  ++nunclassed;
855  viterbi_step(tokid, toktext);
856  } else {
857  viterbi_step(tokid, 0, lexclass, toktext);
858  }
859  }
860  };
861 
862  //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
863  // Viterbi: single iteration: (TokID,ClassID,LexClass)
/** Viterbi single iteration for (TokID, ClassID, LexClass); defined out of line. */
868  void viterbi_step(TokID tokid,
869  ClassID classid,
870  const LexClass &lclass,
871  const mootTokString &toktext="");
872 
873  //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
874  // Viterbi: single iteration: (TokID)
/** Viterbi single iteration for a bare TokID (no lexical class); defined out of line. */
881  void viterbi_step(TokID tokid, const mootTokString &toktext="");
882 
883  //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
884  // Viterbi: single iteration: (TokString)
891  inline void viterbi_step(const mootTokString &token_text) {
892  return viterbi_step(token2id(token_text), token_text);
893  };
894 
895  //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
896  // Viterbi: single iteration: (TokString,set<TagString>)
902  inline void viterbi_step(const mootTokString &token_text, const set<mootTagString> &tags)
903  {
904  LexClass lclass;
905  tagset2lexclass(tags,&lclass);
906  viterbi_step(token2id(token_text), lclass, token_text);
907  };
908 
909  //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
910  // Viterbi: single iteration: (TokID,TagID,col=NULL)
/** Viterbi single iteration for a (TokID, TagID) pair, optionally on a given column; defined out of line. */
916  void viterbi_step(TokID tokid, TagID tagid, ViterbiColumn *col=NULL);
917 
918  //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
919  // Viterbi: single iteration: (TokString,TagString)
925  inline void viterbi_step(const mootTokString &toktext, const mootTagString &tag)
926  {
927  return viterbi_step(token2id(toktext), tagids.name2id(tag));
928  };
929 
930 
931  //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
932  // Viterbi: finish
936  inline void viterbi_finish(const TagID final_tagid)
937  {
938  viterbi_step(0, final_tagid);
939  };
940 
941  //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
945  inline void viterbi_finish(void)
946  {
947  viterbi_step(0, start_tagid);
948  };
949 
950  //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
951  // Viterbi: finish
/** Defined out of line. NOTE(review): presumably writes out the tokens decided up to node `nod` -- confirm. */
955  void viterbi_flush(TokenWriter *writer, mootSentence &toks, ViterbiNode *nod);
956 
957  //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
971  inline void tag_mark_best(mootSentence &sentence)
972  {
973  tag_mark_best(viterbi_best_path(), sentence);
974  };
975 
989  //-- NOTE(review): the old comment here documented a `skip_first` parameter; the declaration below has none.
/** Mark the sentence from the given path; defined out of line. */
990  void tag_mark_best(ViterbiPathNode *pnod, mootSentence &sentence);
991 
992  //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
/** Debugging aid; defined out of line. NOTE(review): presumably dumps a trellis trace into the sentence -- confirm. */
998  void tag_dump_trace(mootSentence &sentence, bool dumpPredict=false);
999 
1001 
1002 
1003  //------------------------------------------------------------
1004  // Viterbi: Low-Level: path utilities
1005 
1008 
1009  //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
1011  inline ViterbiPathNode *viterbi_best_path(void)
1012  {
1013  return viterbi_node_path(viterbi_best_node());
1014  };
1015 
1016  //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
1018  inline ViterbiPathNode *viterbi_best_path(TagID tagid)
1019  {
1020  return viterbi_node_path(viterbi_best_node(tagid));
1021  };
1022 
1023  //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
1025  inline ViterbiPathNode *viterbi_best_path(const mootTagString &tagstr)
1026  {
1027  return viterbi_best_path(tagids.name2id(tagstr));
1028  };
1029 
1030  //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
/** Defined out of line; its result feeds viterbi_best_path(void). */
1037  ViterbiNode *viterbi_best_node(void);
1038 
1039  //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
/** Defined out of line; its result feeds viterbi_best_path(TagID). */
1046  ViterbiNode *viterbi_best_node(TagID tagid);
1047 
1048  //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
/** Defined out of line. NOTE(review): presumably finds a node up to which output can be flushed -- confirm. */
1057  ViterbiNode* viterbi_flushable_node(void);
1058 
1059  //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
/** Build a ViterbiPathNode list for the given node; defined out of line. */
1067  ViterbiPathNode *viterbi_node_path(ViterbiNode *node);
1069 
1070  //------------------------------------------------------------
1071  // Viterbi: low-level: iteration
1072 
1074  //{@
1075 
1076  //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
1078  inline bool viterbi_column_ok(const ViterbiColumn *col) const
1079  {
1080  return (col && col->rows && col->rows->nodes);
1081  };
1082 
1083  //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
/**
 * Defined out of line.
 * NOTE(review): presumably adds a row for curtagid with word probability wordpr to col
 * (a new column if col is NULL) -- confirm.
 */
1093  ViterbiColumn *viterbi_populate_row(TagID curtagid,
1094  ProbT wordpr =MOOT_PROB_ONE,
1095  ViterbiColumn *col =NULL,
1096  ProbT probmin =MOOT_PROB_NONE);
1097 
1098 
1099  //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
/** Defined out of line. NOTE(review): presumably releases vbestpath -- confirm. */
1101  void viterbi_clear_bestpath(void);
1102 
1103  //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
/** Fallback step for a token on the given column; defined out of line. */
1109  void _viterbi_step_fallback(TokID tokid, ViterbiColumn *col);
1111 
1112 
1113  //------------------------------------------------------------
1117 
1118  //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
1120  inline ViterbiNode *viterbi_get_node(void)
1121  {
1122  ViterbiNode *nod;
1123  if (trash_nodes != NULL) {
1124  nod = trash_nodes;
1125  trash_nodes = nod->nod_next;
1126  } else {
1127  nod = new ViterbiNode();
1128  }
1129  return nod;
1130  };
1131 
1132  //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
1134  inline ViterbiRow *viterbi_get_row(void)
1135  {
1136  ViterbiRow *row;
1137  if (trash_rows != NULL) {
1138  row = trash_rows;
1139  trash_rows = row->row_next;
1140  } else {
1141  row = new ViterbiRow();
1142  }
1143  return row;
1144  };
1146  //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
1148  inline ViterbiColumn *viterbi_get_column(void)
1149  {
1150  ViterbiColumn *col;
1151  if (trash_columns != NULL) {
1152  col = trash_columns;
1153  trash_columns = col->col_prev;
1154  } else {
1155  col = new ViterbiColumn();
1156  }
1157  return col;
1158  };
1159 
1160  //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
1162  inline ViterbiPathNode *viterbi_get_pathnode(void)
1163  {
1164  ViterbiPathNode *pnod;
1165  if (trash_pathnodes != NULL) {
1166  pnod = trash_pathnodes;
1167  trash_pathnodes = pnod->path_next;
1168  } else {
1169  pnod = new ViterbiPathNode();
1170  }
1171  return pnod;
1172  };
1174 
1175 
1176  //------------------------------------------------------------
1177  // Low-level: ID Lookup
1180 
1181  //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
1183  inline TokID token2id(const mootTokString &token) const
1184  {
1185 #ifdef MOOT_LEX_NONALPHA
1186  TokID tokid = tokids.name2id(token);
1187  return tokid || !use_flavors ? tokid : taster.flavor_id(token);
1188 #else
1189  TokID tokid = use_flavors ? taster.flavor_id(token) : 0;
1190  return tokid ? tokid : tokids.name2id(token);
1191 #endif
1192  };
1194  //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
/** Fill tok_class with the lexical class of token; defined out of line. */
1196  void token2lexclass(const mootToken &token, LexClass &tok_class) const;
1197 
1198  //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
/**
 * Convert a set of tag strings to a LexClass; defined out of line.
 * NOTE(review): presumably allocates the result when lclass is NULL and, if add_tagids
 * is set, registers unseen tags -- confirm.
 */
1209  LexClass *tagset2lexclass(const mootTagSet &tagset, LexClass *lclass=NULL, bool add_tagids=false);
1210 
1211 
/** Look up the ID of a lexical class; defined out of line. viterbi_step() calls it as class2id(lexclass,0,1). */
1218  ClassID class2id(const LexClass &lclass, bool autopopulate=true, bool autocreate=true);
1220 
1221 
1222  //------------------------------------------------------------
1225 
1226  /*------------------------------------------------------------
1227  * Lexical Probability Lookup
1228  */
1229 
1234  inline const ProbT wordp(const TokID tokid, const TagID tagid) const
1235  {
1236  if (tokid >= lexprobs.size()) return MOOT_PROB_ZERO;
1237  const LexProbSubTable &lps = lexprobs[tokid];
1238  LexProbSubTable::const_iterator lpsi = lps.find(tagid);
1239  return lpsi != lps.end() ? lpsi->value() : MOOT_PROB_ZERO;
1240  };
1241 
1248  inline const ProbT wordp(const mootTokString &tokstr, const mootTagString &tagstr) const
1249  {
1250  return wordp(token2id(tokstr), tagids.name2id(tagstr));
1251  };
1252 
1253  /*------------------------------------------------------------
1254  * Lexical-Class Probability Lookup
1255  */
1259  inline const ProbT classp(const ClassID classid, const TagID tagid) const
1260  {
1261  if (classid >= lcprobs.size()) return MOOT_PROB_ZERO;
1262  const LexClassProbSubTable &lps = lcprobs[classid];
1263  LexClassProbSubTable::const_iterator lpsi = lps.find(tagid);
1264  return lpsi != lps.end() ? lpsi->value() : MOOT_PROB_ZERO;
1265  };
1266 
1273  inline const ProbT classp(const LexClass &lclass, const mootTagString &tagstr) const
1274  {
1275  return classp(classids.name2id(lclass), tagids.name2id(tagstr));
1276  };
1277 
1278  /*------------------------------------------------------------
1279  * Unigram Probability Lookup
1280  */
1284  inline const ProbT tagp(const TagID tagid) const
1285  {
1286  return tagp(0,0,tagid);
1287  };
1288 
1292  inline const ProbT tagp(const mootTagString &tag) const
1293  {
1294  return tagp(tagids.name2id(tag));
1295  };
1296 
1297  /*------------------------------------------------------------
1298  * Bigram Probability Lookup
1299  */
1304  inline const ProbT tagp(const TagID prevtagid, const TagID tagid) const
1305  {
1306  return tagp(0,prevtagid,tagid);
1307  };
1312  inline const ProbT tagp(const mootTagString &prevtag, const mootTagString &tag) const
1313  {
1314  return tagp(tagids.name2id(prevtag), tagids.name2id(tag));
1315  };
1316 
1317  /*------------------------------------------------------------
1318  * Trigram probability lookup
1319  */
1324  inline const ProbT tagp(const Trigram &trigram, ProbT ProbZero=MOOT_PROB_ZERO) const
1325  {
1326  if (!hash_ngrams) return tagp(trigram.tag1, trigram.tag2, trigram.tag3);
1327  NgramProbHash::const_iterator ngpi = ngprobsh.find(trigram);
1328  return ngpi != ngprobsh.end() ? ngpi->second : ProbZero;
1329  };
1330 
1336  inline const ProbT tagp(const TagID prevtagid2, const TagID prevtagid1, const TagID tagid) const
1337  {
1338  if (!hash_ngrams) { //-- -hash
1339  return
1340  ngprobsa && prevtagid2 < n_tags && prevtagid1 < n_tags && tagid < n_tags
1341  ? ngprobsa[(n_tags*((n_tags*prevtagid2)+prevtagid1))+tagid]
1342  : MOOT_PROB_ZERO;
1343  } else { //-- +hash
1344  //-- trigram as stored (pre-smoothed)
1345  Trigram ng(prevtagid2,prevtagid1,tagid);
1346  ProbT p = tagp(ng, 1);
1347  if (p != 1) return p;
1348  //-- fallback: bigram as stored (pre-smoothed)
1349  ng.tag1 = 0;
1350  p = tagp(ng, 1);
1351  if (p != 1) return p;
1352  //-- fallback: unigram as stored (pre-smoothed)
1353  ng.tag2 = 0;
1354  p = tagp(ng, 1);
1355  if (p != 1) return p;
1356  //-- fallback: "unknown" unigram (pre-smoothed)
1357  ng.tag3 = 0;
1358  return tagp(ng, MOOT_PROB_ZERO);
1359  }
1360  };
1361 
1367  inline const ProbT tagp(const mootTagString &prevtag2,
1368  const mootTagString &prevtag1,
1369  const mootTagString &tag)
1370  const
1371  {
1372  return tagp(tagids.name2id(prevtag2), tagids.name2id(prevtag1), tagids.name2id(tag));
1373  };
1375 
1376 
1377  //------------------------------------------------------------
1378  // Error Reporting
1379 
/** Error reporting with a printf-style format string and variadic arguments; defined out of line. */
1383  void carp(const char *fmt, ...);
1385 
1386  //------------------------------------------------------------
1387  // public methods: low-level: debugging
1388 
/** Debugging: text dump of the model to file; each flag selects one section. Defined out of line. */
1392  void txtdump(FILE *file, bool dump_constants=true, bool dump_lexprobs=true, bool dump_classprobs=true, bool dump_suftrie=true, bool dump_ngprobs=true);
1393 
/** Debugging: text dump of the Viterbi trellis to a TokenWriter. NOTE(review): meaning of ncols=0 not visible here -- confirm. */
1397  void viterbi_txtdump(TokenWriter *w, int ncols=0);
1398 
/** Debugging: text dump of a single trellis column to a TokenWriter; defined out of line. */
1402  void viterbi_txtdump_col(TokenWriter *w, ViterbiColumn *col, int colnum=0);
1404 };
1405 
1406 moot_END_NAMESPACE
1407 
1408 #endif /* _MOOT_HMM_H */
ProbT * TrigramProbArray
Type for uni-, bi- and trigram probability lookup table.
Definition: mootHMM.h:302
iterator find(const KeyT &key)
Definition: mootAssocVector.h:194
mootTokenType toktype(void) const
Definition: mootToken.h:453
assoc_vector_type::const_iterator const_iterator
Definition: mootAssocVector.h:141
1st-order Hidden Markov Model Tagger/Disambiguator class.
Definition: mootHMM.h:120
useful utilities, especially for command-line programs
High-level heuristic token classifier .
Definition: mootFlavor.h:62
HMM training data: lexical-class frequencies: raw.
LexProbTable LexClassProbTable
Definition: mootHMM.h:229
Top-level class for suffix tries.
Definition: mootSuffixTrie.h:46
Definition: mootHMM.h:158
abstract binary header class
ViterbiRow * rows
Column rows.
Definition: mootHMM.h:351
mootEnumID FlavorID
Definition: mootHMM.h:133
mootEnumID TokID
Definition: mootHMM.h:130
Type for a Viterbi trellis row ("current tag") node.
Definition: mootHMM.h:335
Definition: mootHMM.h:170
vector< LexProbSubTable > LexProbTable
Definition: mootHMM.h:213
Abstract class for token input.
Definition: mootTokenIO.h:208
mootio I/O abstraction layer for zlib gzFile
TrigramProbHash NgramProbHash
Generic n-gram probabilities: trigrams, hashed.
Definition: mootHMM.h:305
classes and utilities for regex-based token "flavor" heuristics
mootEnumID name2id(const NameType &name) const
Definition: mootEnum.h:131
TagID tag1
previous-previous tag_{i-2} or 0
Definition: mootHMM.h:243
Utility struct for hash_map.
Definition: mootHMM.h:257
Abstract base class for output stream wrappers.
Definition: mootIO.h:194
Class for storage and retrieval of raw lexical-class frequencies.
Definition: mootClassfreqs.h:44
set< mootTagString > mootTagSet
Definition: mootToken.h:65
LISP-style assoc list using vector<>: map-like class with small memory footprint. Useful for small as...
Definition: mootAssocVector.h:130
Definition: mootEnum.h:67
Class for storage & retrieval of raw N-Gram frequencies.
Definition: mootNgrams.h:44
High-level token information object.
Definition: mootToken.h:96
TrigramProbArray NgramProbArray
Generic n-gram probabilities: trigrams, dense.
Definition: mootHMM.h:306
Tag-trigram key type for HMM probability lookup table (only used if hash_ngrams is true) ...
Definition: mootHMM.h:241
mootFlavorID flavor_id(const char *s) const
Definition: mootFlavor.h:240
void unknown_name(const NameType &name)
Definition: mootEnum.h:102
TagID tag3
current tag: tag_i
Definition: mootHMM.h:245
Class for storage and retrieval of raw lexical frequencies.
Definition: mootLexfreqs.h:44
hash_map< Trigram, ProbT, Trigram::HashFcn, Trigram::EqualFcn > TrigramProbHash
Definition: mootHMM.h:282
float ProbT
Definition: mootTypes.h:63
VerbosityLevel
Definition: mootUtils.h:357
Type for a Viterbi trellis column.
Definition: mootHMM.h:349
mootEnumID ClassID
Definition: mootHMM.h:139
const mootTaster builtinTaster
ViterbiPathNode * path_next
Definition: mootHMM.h:376
Header information structure, used for binary HMM model files.
Definition: mootBinHeader.h:43
Utility struct for hash_map.
Definition: mootHMM.h:249
class ViterbiNode * nodes
Trellis "pillar" node(s) for this row.
Definition: mootHMM.h:339
list< mootToken > mootSentence
Definition: mootToken.h:630
Abstract class for token output.
Definition: mootTokenIO.h:700
Abstract and native classes for I/O of moot::mootToken objects.
bool build(const mootLexfreqs &lf, const mootNgrams &ng, const TagIDTable &tagids, TagID eos_tagid, bool verbose=false)
string mootTagString
Definition: mootToken.h:59
suffix tries (experimental, optional)
ProbT * UnigramProbTable
Definition: mootHMM.h:238
class ViterbiNode * nod_next
Next previous-tag-node in current pillar.
Definition: mootHMM.h:327
set< TagID > LexClass
Definition: mootHMM.h:155
const mootTokString & text(void) const
Definition: mootToken.h:408
Definition: mootHMM.h:373
class ViterbiRow * row_next
Next row.
Definition: mootHMM.h:340
ViterbiColumn * col_prev
Previous column.
Definition: mootHMM.h:352
Definition: mootToken.h:74
TagID tag2
previous tag: tag_{i-1} or 0
Definition: mootHMM.h:244
moot::UInt mootEnumID
Definition: mootEnum.h:45
Abstract base class for input stream wrappers.
Definition: mootIO.h:129
string mootTokString
Definition: mootToken.h:62
mootEnumID TagID
Definition: mootHMM.h:127
Type for a Viterbi trellis entry ("pillar") node.
Definition: mootHMM.h:320