88 #define MOOT_LEX_UNKNOWN_TOKENS 97 #define MOOT_LEX_UNKNOWN_CLASSES 106 #define MOOT_LEX_NONALPHA 113 #undef MOOT_LEX_IS_TIEBREAKER 167 inline size_t operator()(
const LexClass &x)
const {
169 for (LexClass::const_iterator xi = x.begin(); xi != x.end(); ++xi) {
179 inline size_t operator()(
const LexClass &x,
const LexClass &y)
const {
191 moot_hash<mootTagString>,
192 equal_to<mootTagString> >
197 moot_hash<mootTokString>,
198 equal_to<mootTokString> >
258 inline size_t operator()(
const Trigram &x)
const {
266 inline size_t operator()(
const Trigram &x,
const Trigram &y)
const {
274 Trigram(TagID t1=0, TagID t2=0, TagID t3=0)
275 : tag1(t1), tag2(t2), tag3(t3)
407 bool save_ambiguities;
417 bool save_mark_unknown;
446 bool use_lex_classes;
469 ProbT unknown_lex_threshhold;
479 ProbT unknown_class_threshhold;
514 ClassIDTable classids;
525 LexProbTable lexprobs;
526 LexClassProbTable lcprobs;
528 NgramProbHash ngprobsh;
529 NgramProbArray ngprobsa;
531 #ifdef MOOT_ENABLE_SUFFIX_TRIE 587 save_ambiguities(false),
589 save_mark_unknown(false),
592 use_lex_classes(true),
595 unknown_lex_threshhold(1.0),
596 unknown_class_threshhold(1.0),
597 nglambda1(mootProbEpsilon),
598 nglambda2(1.0 - mootProbEpsilon),
599 wlambda0(mootProbEpsilon),
600 wlambda1(1.0 - mootProbEpsilon),
601 clambda0(mootProbEpsilon),
602 clambda1(1.0 - mootProbEpsilon),
619 trash_pathnodes(NULL),
624 unknown_token_name(
"@UNKNOWN");
625 unknown_tag_name(
"UNKNOWN");
632 virtual ~
mootHMM(
void) { clear(
false,
false); };
643 void clear(
bool wipe_everything=
true,
bool unlogify=
false);
650 bool save(
const char *filename,
int compression_level=MOOT_DEFAULT_COMPRESSION);
659 bool load(
const char *filename=NULL);
672 inline void unknown_token_name(
const mootTokString &name)
678 inline void unknown_tag_name(
const mootTokString &name)
687 inline void unknown_class_name(
const mootTagSet &tagset)
689 tagset2lexclass(tagset,&uclass,
false);
716 virtual bool load_model(
const string &modelname,
717 const mootTagString &start_tag_str=
"__$",
718 const char *myname=
"mootHMM::load_model()",
719 bool do_estimate_nglambdas=
true,
720 bool do_estimate_wlambdas=
true,
721 bool do_estimate_clambdas=
true,
722 bool do_build_suffix_trie=
true,
723 bool do_compute_logprobs=
true);
733 const mootTagString &start_tag_str=
"__$",
739 void assign_ids_fl(
void);
754 bool estimate_lambdas(
const mootNgrams &ngrams);
767 #ifdef MOOT_ENABLE_SUFFIX_TRIE 768 return suftrie.
build(lf,ng,tagids,start_tagid,verbose);
775 bool compute_logprobs(
void);
778 inline void set_ngram_prob(ProbT p, TagID t1=0, TagID t2=0, TagID t3=0)
781 ngprobsh[Trigram(t1,t2,t3)] = p;
783 ngprobsa[(n_tags*((n_tags*t1)+t2))+t3] = p;
815 void viterbi_clear(
void);
823 inline void viterbi_step(
const mootToken &token) {
827 token2lexclass(token, tok_class);
828 viterbi_step(token2id(token.
text()), tok_class, token.
text());
838 inline void viterbi_step(TokID tokid,
839 const LexClass &lexclass,
840 const mootTokString &toktext=
"")
842 if (use_lex_classes) {
843 if (lexclass.empty()) {
845 viterbi_step(tokid, 0, uclass, toktext);
848 ClassID classid = class2id(lexclass,0,1);
849 viterbi_step(tokid, classid, lexclass, toktext);
853 if (lexclass.empty()) {
855 viterbi_step(tokid, toktext);
857 viterbi_step(tokid, 0, lexclass, toktext);
868 void viterbi_step(TokID tokid,
870 const LexClass &lclass,
871 const mootTokString &toktext=
"");
881 void viterbi_step(TokID tokid,
const mootTokString &toktext=
"");
891 inline void viterbi_step(
const mootTokString &token_text) {
892 return viterbi_step(token2id(token_text), token_text);
902 inline void viterbi_step(
const mootTokString &token_text,
const set<mootTagString> &tags)
905 tagset2lexclass(tags,&lclass);
906 viterbi_step(token2id(token_text), lclass, token_text);
916 void viterbi_step(TokID tokid, TagID tagid,
ViterbiColumn *col=NULL);
925 inline void viterbi_step(
const mootTokString &toktext,
const mootTagString &tag)
927 return viterbi_step(token2id(toktext), tagids.
name2id(tag));
936 inline void viterbi_finish(
const TagID final_tagid)
938 viterbi_step(0, final_tagid);
945 inline void viterbi_finish(
void)
947 viterbi_step(0, start_tagid);
973 tag_mark_best(viterbi_best_path(), sentence);
998 void tag_dump_trace(
mootSentence &sentence,
bool dumpPredict=
false);
1013 return viterbi_node_path(viterbi_best_node());
1020 return viterbi_node_path(viterbi_best_node(tagid));
1025 inline ViterbiPathNode *viterbi_best_path(
const mootTagString &tagstr)
1027 return viterbi_best_path(tagids.
name2id(tagstr));
1078 inline bool viterbi_column_ok(
const ViterbiColumn *col)
const 1094 ProbT wordpr =MOOT_PROB_ONE,
1096 ProbT probmin =MOOT_PROB_NONE);
1101 void viterbi_clear_bestpath(
void);
1109 void _viterbi_step_fallback(TokID tokid,
ViterbiColumn *col);
1123 if (trash_nodes != NULL) {
1137 if (trash_rows != NULL) {
1151 if (trash_columns != NULL) {
1152 col = trash_columns;
1165 if (trash_pathnodes != NULL) {
1166 pnod = trash_pathnodes;
1183 inline TokID token2id(
const mootTokString &token)
const 1185 #ifdef MOOT_LEX_NONALPHA 1186 TokID tokid = tokids.
name2id(token);
1187 return tokid || !use_flavors ? tokid : taster.
flavor_id(token);
1189 TokID tokid = use_flavors ? taster.
flavor_id(token) : 0;
1190 return tokid ? tokid : tokids.
name2id(token);
1196 void token2lexclass(
const mootToken &token, LexClass &tok_class)
const;
1209 LexClass *tagset2lexclass(
const mootTagSet &tagset, LexClass *lclass=NULL,
bool add_tagids=
false);
1218 ClassID class2id(
const LexClass &lclass,
bool autopopulate=
true,
bool autocreate=
true);
1234 inline const ProbT wordp(
const TokID tokid,
const TagID tagid)
const 1236 if (tokid >= lexprobs.size())
return MOOT_PROB_ZERO;
1237 const LexProbSubTable &lps = lexprobs[tokid];
1239 return lpsi != lps.end() ? lpsi->value() : MOOT_PROB_ZERO;
1248 inline const ProbT wordp(
const mootTokString &tokstr,
const mootTagString &tagstr)
const 1250 return wordp(token2id(tokstr), tagids.
name2id(tagstr));
1259 inline const ProbT classp(
const ClassID classid,
const TagID tagid)
const 1261 if (classid >= lcprobs.size())
return MOOT_PROB_ZERO;
1262 const LexClassProbSubTable &lps = lcprobs[classid];
1264 return lpsi != lps.end() ? lpsi->value() : MOOT_PROB_ZERO;
1273 inline const ProbT classp(
const LexClass &lclass,
const mootTagString &tagstr)
const 1284 inline const ProbT tagp(
const TagID tagid)
const 1286 return tagp(0,0,tagid);
1292 inline const ProbT tagp(
const mootTagString &tag)
const 1304 inline const ProbT tagp(
const TagID prevtagid,
const TagID tagid)
const 1306 return tagp(0,prevtagid,tagid);
1312 inline const ProbT tagp(
const mootTagString &prevtag,
const mootTagString &tag)
const 1324 inline const ProbT tagp(
const Trigram &trigram, ProbT ProbZero=MOOT_PROB_ZERO)
const 1326 if (!hash_ngrams)
return tagp(trigram.
tag1, trigram.
tag2, trigram.
tag3);
1327 NgramProbHash::const_iterator ngpi = ngprobsh.find(trigram);
1328 return ngpi != ngprobsh.end() ? ngpi->second : ProbZero;
1336 inline const ProbT tagp(
const TagID prevtagid2,
const TagID prevtagid1,
const TagID tagid)
const 1340 ngprobsa && prevtagid2 < n_tags && prevtagid1 < n_tags && tagid < n_tags
1341 ? ngprobsa[(n_tags*((n_tags*prevtagid2)+prevtagid1))+tagid]
1345 Trigram ng(prevtagid2,prevtagid1,tagid);
1346 ProbT p = tagp(ng, 1);
1347 if (p != 1)
return p;
1351 if (p != 1)
return p;
1355 if (p != 1)
return p;
1358 return tagp(ng, MOOT_PROB_ZERO);
1367 inline const ProbT tagp(
const mootTagString &prevtag2,
1368 const mootTagString &prevtag1,
1369 const mootTagString &tag)
1383 void carp(
const char *fmt, ...);
1392 void txtdump(FILE *file,
bool dump_constants=
true,
bool dump_lexprobs=
true,
bool dump_classprobs=
true,
bool dump_suftrie=
true,
bool dump_ngprobs=
true);
1397 void viterbi_txtdump(
TokenWriter *w,
int ncols=0);
ProbT * TrigramProbArray
Type for uni-, bi- and trigram probability lookup table.
Definition: mootHMM.h:302
iterator find(const KeyT &key)
Definition: mootAssocVector.h:194
mootTokenType toktype(void) const
Definition: mootToken.h:453
assoc_vector_type::const_iterator const_iterator
Definition: mootAssocVector.h:141
1st-order Hidden Markov Model Tagger/Disambiguator class.
Definition: mootHMM.h:120
useful utilities, especially for command-line programs
High-level heuristic token classifier.
Definition: mootFlavor.h:62
HMM training data: lexical-class frequencies: raw.
LexProbTable LexClassProbTable
Definition: mootHMM.h:229
Top-level class for suffix tries.
Definition: mootSuffixTrie.h:46
Definition: mootHMM.h:158
ViterbiRow * rows
Column rows.
Definition: mootHMM.h:351
mootEnumID FlavorID
Definition: mootHMM.h:133
mootEnumID TokID
Definition: mootHMM.h:130
Type for a Viterbi trellis row ("current tag") node.
Definition: mootHMM.h:335
Definition: mootHMM.h:170
vector< LexProbSubTable > LexProbTable
Definition: mootHMM.h:213
Abstract class for token input.
Definition: mootTokenIO.h:208
mootio I/O abstraction layer for zlib gzFile
TrigramProbHash NgramProbHash
Generic n-gram probabilities: trigrams, hashed.
Definition: mootHMM.h:305
classes and utilities for regex-based token "flavor" heuristics
mootEnumID name2id(const NameType &name) const
Definition: mootEnum.h:131
TagID tag1
previous-previous tag_{i-2} or 0
Definition: mootHMM.h:243
Utility struct for hash_map.
Definition: mootHMM.h:257
Abstract base class for output stream wrappers.
Definition: mootIO.h:194
Class for storage and retrieval of raw lexical-class frequencies.
Definition: mootClassfreqs.h:44
set< mootTagString > mootTagSet
Definition: mootToken.h:65
LISP-style assoc list using vector<>: map-like class with small memory footprint. Useful for small as...
Definition: mootAssocVector.h:130
Definition: mootEnum.h:67
Class for storage & retrieval of raw N-Gram frequencies.
Definition: mootNgrams.h:44
High-level token information object.
Definition: mootToken.h:96
TrigramProbArray NgramProbArray
Generic n-gram probabilities: trigrams, dense.
Definition: mootHMM.h:306
Tag-trigram key type for HMM probability lookup table (only used if hash_ngrams is true) ...
Definition: mootHMM.h:241
mootFlavorID flavor_id(const char *s) const
Definition: mootFlavor.h:240
void unknown_name(const NameType &name)
Definition: mootEnum.h:102
TagID tag3
current tag: tag_i
Definition: mootHMM.h:245
Class for storage and retrieval of raw lexical frequencies.
Definition: mootLexfreqs.h:44
hash_map< Trigram, ProbT, Trigram::HashFcn, Trigram::EqualFcn > TrigramProbHash
Definition: mootHMM.h:282
float ProbT
Definition: mootTypes.h:63
VerbosityLevel
Definition: mootUtils.h:357
Type for a Viterbi trellis column.
Definition: mootHMM.h:349
mootEnumID ClassID
Definition: mootHMM.h:139
const mootTaster builtinTaster
ViterbiPathNode * path_next
Definition: mootHMM.h:376
Utility struct for hash_map.
Definition: mootHMM.h:249
class ViterbiNode * nodes
Trellis "pillar" node(s) for this row.
Definition: mootHMM.h:339
list< mootToken > mootSentence
Definition: mootToken.h:630
Abstract class for token output.
Definition: mootTokenIO.h:700
Abstract and native classes for I/O of moot::mootToken objects.
bool build(const mootLexfreqs &lf, const mootNgrams &ng, const TagIDTable &tagids, TagID eos_tagid, bool verbose=false)
string mootTagString
Definition: mootToken.h:59
suffix tries (experimental, optional)
ProbT * UnigramProbTable
Definition: mootHMM.h:238
class ViterbiNode * nod_next
Next previous-tag-node in current pillar.
Definition: mootHMM.h:327
set< TagID > LexClass
Definition: mootHMM.h:155
const mootTokString & text(void) const
Definition: mootToken.h:408
Definition: mootHMM.h:373
class ViterbiRow * row_next
Next row.
Definition: mootHMM.h:340
ViterbiColumn * col_prev
Previous column.
Definition: mootHMM.h:352
Definition: mootToken.h:74
TagID tag2
previous tag: tag_{i-1} or 0
Definition: mootHMM.h:244
moot::UInt mootEnumID
Definition: mootEnum.h:45
Abstract base class for input stream wrappers.
Definition: mootIO.h:129
string mootTokString
Definition: mootToken.h:62
mootEnumID TagID
Definition: mootHMM.h:127
Type for a Viterbi trellis entry ("pillar") node.
Definition: mootHMM.h:320