mootDynHMM.h
Go to the documentation of this file.
1 /* -*- Mode: C++ -*- */
2 /*
3  libmoot : moocow's part-of-speech tagging library
4  Copyright (C) 2009 by Bryan Jurish <moocow@cpan.org>
5 
6  This library is free software; you can redistribute it and/or
7  modify it under the terms of the GNU Lesser General Public
8  License as published by the Free Software Foundation; either
9  version 3 of the License, or (at your option) any later version.
10 
11  This library is distributed in the hope that it will be useful,
12  but WITHOUT ANY WARRANTY; without even the implied warranty of
13  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14  Lesser General Public License for more details.
15 
16  You should have received a copy of the GNU Lesser General Public
17  License along with this library; if not, write to the Free Software
18  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
19 */
20 
21 /*--------------------------------------------------------------------------
22  * File: mootDynHMM.h
23  * Author: Bryan Jurish <moocow@cpan.org>
24  * Description:
25  * + moot PoS tagger : HMM: for large "tag"-sets with dynamic lexical probabilities
26  *--------------------------------------------------------------------------*/
27 
33 #ifndef _MOOT_DYNHMM_H
34 #define _MOOT_DYNHMM_H
35 
36 #include <mootHMM.h>
37 #include <assert.h>
38 
39 moot_BEGIN_NAMESPACE
40 
41 
42 /*======================================================================
43  *Generic Utilities
44  */
45 
47 typedef enum {
54 
57 public:
58  //-- mootDynLexHMM
59  bool invert_lexp;
60  std::string newtag_str;
62  //
63  //-- mootDynLexHMM_Boltzmann
66  //
67  //-- mootMIParser
68  bool text_tags;
70 public:
72  : invert_lexp(true),
73  newtag_str("@NEW"),
74  Ftw_eps(0.5),
75  dynlex_base(2.0),
76  dynlex_beta(1.0),
77  text_tags(false)
78  {};
79 
81 };
82 
85 
87 class mootDynHMM *newDynHMM(const std::string &which="Freq", const mootDynHMMOptions &opts=mootDynHMMOptions());
88 
89 
90 /*======================================================================
91  * mootDynHMM : "dynamic" HMM class
92  */
93 
97 class mootDynHMM: public mootHMM {
98 public:
99  /*---------------------------------------------------------------------*/
100  /* Types */
101 
102 public:
103  /*---------------------------------------------------------------------*/
104  /* Data */
105 
106 public:
107  /*---------------------------------------------------------------------*/
112  { hash_ngrams=true; };
113 
115  virtual ~mootDynHMM(void) {};
116 
118  virtual void set_options(const mootDynHMMOptions &opts)
119  {};
121 
122  //------------------------------------------------------------
123  // Utilities
125 
126  inline void lex_clear(void) {
127  lexprobs.clear();
128  tokids.clear(); //-- leaves '@UNKNOWN' tok entry
129  lexprobs.resize(1); //-- re-insert empty entry for '@UNKNOWN' tok
130  n_toks = tokids.size(); //-- sanity check
131  };
132 
134  inline void lex_resize(TokID tokid_max) {
135  if (n_toks <= tokid_max) {
136  tokids.resize(tokid_max+1);
137  lexprobs.resize(tokid_max+1);
138  n_toks = tokids.size();
139  }
140  };
141 
144  inline TokID lex_get_tokid(const mootTokString &tok_text)
145  {
146  TokID tokid = tokids.get_id(tok_text);
147  lex_resize(tokid);
148  return tokid;
149  };
150 
154  inline void tagset_resize(TagID tagid_max) {
155  assert(hash_ngrams==true);
156  if (n_tags <= tagid_max) {
157  tagids.resize(tagid_max+1);
158  n_tags = tagids.size();
159  }
160  };
161 
165  inline TagID get_tagid(const mootTagString &tagstr)
166  {
167  if (tagids.nameExists(tagstr))
168  return tagids.name2id(tagstr);
169  TagID tagid = tagids.get_id(tagstr);
170  tagset_resize(tagid);
171  return tagid;
172  };
174 
175  //------------------------------------------------------------
178 
183  virtual void tag_hook_pre(mootSentence &sent)
184  {};
185 
190  virtual void tag_hook_post(mootSentence &sent)
191  {};
193 
194  /*---------------------------------------------------------------------*/
197 
199  virtual void tw_put_info(moot::TokenWriter *tw)
200  {
201  tw->printf_raw(" +DynHMM class : %s\n", "mootDynHMM (?)");
202  };
204 
205  //------------------------------------------------------------
206  // Tagging: Top-level
209 
213  virtual void tag_io(TokenReader *reader, TokenWriter *writer);
214 
219  virtual void tag_sentence(mootSentence &sentence);
221 };
222 
223 
224 /*======================================================================
225  * class mootDynLexHMM
226  */
228 class mootDynLexHMM : public mootDynHMM {
229 public:
230  //---------------------------------------------------------------------
231  // Types
232  typedef mootTokString TokStr;
233  typedef mootTagString TagStr;
235  typedef std::map<TokStr,ProbT> TokProbMap;
236  typedef std::map<TagStr,ProbT> TagProbMap;
237  typedef std::map<TagStr,TokProbMap> TagTokProbMap;
238  typedef std::map<TagStr,TagProbMap> TokTagProbMap;
240 public:
241  //---------------------------------------------------------------------
242  // Data
252  bool invert_lexp;
253 
254  TagStr newtag_str;
255  TagID newtag_id;
256  ProbT newtag_f;
258  TagTokProbMap Ftw;
259  TokProbMap Fw;
260  TokProbMap Ft;
261  ProbT Ftw_eps;
263  size_t tagids_size_orig;
265 public:
266  //---------------------------------------------------------------------
268 
270  : invert_lexp(true),
271  newtag_str("@NEW"),
272  newtag_id(0),
273  Ftw_eps(0.5),
274  tagids_size_orig(0)
275  {};
276 
277  virtual ~mootDynLexHMM(void) {};
278 
280  virtual void set_options(const mootDynHMMOptions &opts)
281  {
282  invert_lexp = opts.invert_lexp;
283  newtag_str = opts.newtag_str;
284  Ftw_eps = opts.Ftw_eps;
285  };
287 
288  //---------------------------------------------------------------------
290 
291 
292  virtual bool load_model(const string &modelname,
293  const mootTagString &start_tag_str="__$",
294  const char *myname="mootDynLexHMM::load_model()",
295  bool do_estimate_nglambdas=true,
296  bool do_estimate_wlambdas=true,
297  bool do_estimate_clambdas=true,
298  bool do_build_suffix_trie=true,
299  bool do_compute_logprobs=true);
300 
302  virtual bool compile(mootLexfreqs &lexfreqs,
303  mootNgrams &ngrams,
304  mootClassfreqs &classfreqs,
305  const mootTagString &start_tag_str="__$");
307 
308  //---------------------------------------------------------------------
310 
311  virtual void tag_hook_pre(mootSentence &sent);
312  virtual void tag_hook_post(mootSentence &sent);
314 
315  //---------------------------------------------------------------------
317 
318 
319  virtual void tw_put_info(moot::TokenWriter *tw);
321 
322  //---------------------------------------------------------------------
324 
325 
326  void dynlex_clear(void);
327 
337  virtual ProbT dynlex_analysis_freq(const mootToken &tok, const mootToken::Analysis &a)
338  {
339  return a.prob + Ftw_eps;
340  };
341 
346  virtual void dynlex_populate_lexprobs(void);
348 };
349 
350 /*======================================================================
351  * class mootDynLexHMM_Boltzmann
352  */
362 class mootDynLexHMM_Boltzmann : public mootDynLexHMM {
363 public:
368  ProbT dynlex_base;
369 
375  ProbT dynlex_beta;
376 
377 public:
380  : dynlex_base(2),
381  dynlex_beta(1)
382  {};
383 
385  virtual void set_options(const mootDynHMMOptions &opts)
386  {
387  mootDynLexHMM::set_options(opts);
388  dynlex_base = opts.dynlex_base;
389  dynlex_beta = opts.dynlex_beta;
390  };
396  virtual ProbT dynlex_analysis_freq(const mootToken &tok, const mootToken::Analysis &a)
397  {
398  return Ftw_eps + pow(dynlex_base, -dynlex_beta * a.prob);
399  };
400 
402  virtual void tw_put_info(moot::TokenWriter *tw)
403  {
404  mootDynLexHMM::tw_put_info(tw);
405  tw->printf_raw(" +DynHMM class : %s\n", "mootDynLexHMM_Boltzmann");
406  tw->printf_raw(" dynlex_base : %g\n", dynlex_base);
407  tw->printf_raw(" dynlex_beta : %g\n", dynlex_beta);
408  };
409 };
410 
411 
412 moot_END_NAMESPACE
413 
414 #endif /* _MOOT_DYNHMM_H */
Type for a single morphological analysis.
Definition: mootToken.h:106
~= "Boltzmann" ~= mootDynLexHMM_Boltzmann
Definition: mootDynHMM.h:50
1st-order Hidden Markov Model Tagger/Disambiguator class.
Definition: mootHMM.h:120
mootDynHMM(void)
Definition: mootDynHMM.h:111
Hidden Markov Model tagger/disambiguator.
virtual ~mootDynHMM(void)
Definition: mootDynHMM.h:115
bool text_tags
Definition: mootDynHMM.h:68
abstract HMM subclass for use with dynamic lexical probabilities.
Definition: mootDynHMM.h:97
mootEnumID TokID
Definition: mootHMM.h:130
~= "Freq" ~= mootDynLexHMM
Definition: mootDynHMM.h:49
virtual void printf_raw(const char *fmt,...)
Abstract class for token input.
Definition: mootTokenIO.h:208
class mootDynHMM * newDynHMM(const std::string &which="Freq", const mootDynHMMOptions &opts=mootDynHMMOptions())
unknown
Definition: mootDynHMM.h:48
Class for storage and retrieval of raw lexical-class frequencies.
Definition: mootClassfreqs.h:44
void tagset_resize(TagID tagid_max)
Definition: mootDynHMM.h:154
~= "MIParser" ~= mootMIParser
Definition: mootDynHMM.h:51
Class for storage & retrieval of raw N-Gram frequencies.
Definition: mootNgrams.h:44
ProbT dynlex_base
Definition: mootDynHMM.h:64
Generic user-level options structure for built-in mootDynHMM subclasses.
Definition: mootDynHMM.h:56
ProbT Ftw_eps
Definition: mootDynHMM.h:61
High-level token information object.
Definition: mootToken.h:96
mootDynHMM subclass using a Maxwell-Boltzmann distribution to estimate f(w,t)
Definition: mootDynHMM.h:368
Class for storage and retrieval of raw lexical frequencies.
Definition: mootLexfreqs.h:44
placeholder
Definition: mootDynHMM.h:52
float ProbT
Definition: mootTypes.h:63
DynHMMClassId
Enum for built-in mootDynHMM estimator modes (subclasses)
Definition: mootDynHMM.h:47
void lex_resize(TokID tokid_max)
Definition: mootDynHMM.h:134
void lex_clear(void)
Definition: mootDynHMM.h:126
~mootDynHMMOptions(void)
Definition: mootDynHMM.h:80
list< mootToken > mootSentence
Definition: mootToken.h:630
bool invert_lexp
Definition: mootDynHMM.h:59
Abstract class for token output.
Definition: mootTokenIO.h:700
ProbT prob
Definition: mootToken.h:118
std::map< TagStr, TagProbMap > TokTagProbMap
Definition: mootDynHMM.h:244
string mootTagString
Definition: mootToken.h:59
TokID lex_get_tokid(const mootTokString &tok_text)
Definition: mootDynHMM.h:144
virtual void set_options(const mootDynHMMOptions &opts)
Definition: mootDynHMM.h:118
ProbT dynlex_beta
Definition: mootDynHMM.h:65
std::string newtag_str
Definition: mootDynHMM.h:60
mootDynHMMOptions(void)
Definition: mootDynHMM.h:71
string mootTokString
Definition: mootToken.h:62
mootEnumID TagID
Definition: mootHMM.h:127
mootDynHMM subclass for dynamic lexical probabilities
Definition: mootDynHMM.h:234