mootHMMTrainer.h
Go to the documentation of this file.
1 /* -*- Mode: C++ -*- */
2 
3 /*
4  libmoot : moocow's part-of-speech tagging library
5  Copyright (C) 2003-2009 by Bryan Jurish <moocow@cpan.org>
6 
7  This library is free software; you can redistribute it and/or
8  modify it under the terms of the GNU Lesser General Public
9  License as published by the Free Software Foundation; either
10  version 3 of the License, or (at your option) any later version.
11 
12  This library is distributed in the hope that it will be useful,
13  but WITHOUT ANY WARRANTY; without even the implied warranty of
14  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15  Lesser General Public License for more details.
16 
17  You should have received a copy of the GNU Lesser General Public
18  License along with this library; if not, write to the Free Software
19  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
20 */
21 
22 /*--------------------------------------------------------------------------
23  * File: mootHMMTrainer.h
24  * Author: Bryan Jurish <moocow@cpan.org>
25  * Description:
26  * + Trainer for moocow's PoS-tagger: shared headers
27  *--------------------------------------------------------------------------*/
28 
34 #ifndef _moot_HMM_TRAINER_H_
35 #define _moot_HMM_TRAINER_H_
36 
37 #include <mootTokenIO.h>
38 #include <mootNgrams.h>
39 //#include <mootLexfreqs.h> //-- included by mootClassfreqs.h
40 #include <mootClassfreqs.h>
41 #include <mootFlavor.h>
42 
43 moot_BEGIN_NAMESPACE
44 
45 using namespace std;
46 
47 /*--------------------------------------------------------------------------
48  * mootHMMTrainer : HMM trainer class
49  *--------------------------------------------------------------------------*/
50 
/**
 * \brief High-level class to gather training data for a mootHMM or mootCHMM.
 *
 * Holds the raw frequency tables (n-grams, lexical frequencies,
 * lexical-class frequencies) and a mootTaster instance as public members,
 * together with flags and a small amount of protected per-run state.
 * Only the constructor, destructor and clear() are defined inline here;
 * all train_*() methods and carp() are declared only, so their exact
 * behaviour cannot be read from this header.
 *
 * NOTE(review): the unqualified names `set` and `string` used below rely on
 * the `using namespace std;` directive that precedes this class in the header.
 */
52 class mootHMMTrainer {
53 
54 public:
55  /*-------------------------------------------------------------*/
 /** \name Types */
57 
58 
60 
63 
 /** Set of tag strings (mootTagString). */
65  typedef set<mootTagString> TagSet;
66 
67 public:
68  /*-------------------------------------------------------------*/
 /** \name Training data (public) */
70 
71 
 /** Raw n-gram frequency storage. */
73  mootNgrams ngrams;
74 
 /** Raw lexical frequency storage. */
76  mootLexfreqs lexfreqs;
77 
 /** Raw lexical-class frequency storage. */
79  mootClassfreqs lcfreqs;
80 
 /**
  * Token classifier object; clear() resets it via set_default_rules().
  * Presumably the heuristic "flavor" classifier from mootFlavor.h -- confirm there.
  */
82  mootTaster taster;
84 
85  /*-------------------------------------------------------------*/
 /**
  * \name Flags
  * All four flags are initialised to true by the constructor.
  * NOTE(review): presumably each one enables gathering of the correspondingly
  * named data (ngrams / lexfreqs / lcfreqs / taster); the code that reads
  * them is not in this header -- confirm in the implementation.
  */
87 
88  bool want_ngrams;
89  bool want_lexfreqs;
90  bool want_classfreqs;
91  bool want_flavors;
93 
94  /*-------------------------------------------------------------*/
 /** \name Special tags */
96 
97 
 /**
  * Tag string initialised to "__$" by the constructor.
  * Presumably the pseudo-tag used at sentence boundaries (name suggests
  * "end of sentence") -- confirm in the implementation.
  */
98  mootTagString eos_tag;
100 
101 protected:
102  /*------------------------------------------------------------*/
 /** \name Internal training state */
104 
105 
 /**
  * N-gram working object; default-constructed, not touched by clear().
  * NOTE(review): presumably the n-gram window being built during training.
  */
106  Ngram ng;
107 
 /**
  * Initialised to false by the constructor; not touched by clear().
  * NOTE(review): presumably records whether the last event handled was an
  * end-of-sentence -- confirm in the implementation.
  */
109  bool last_was_eos;
111 
112 public:
113  /*------------------------------------------------------------*/
 /** \name Construction / destruction */
115 
116 
 /**
  * Default constructor: enables all four want_* flags, sets eos_tag to
  * "__$" and last_was_eos to false. The remaining members are
  * default-constructed.
  */
117  mootHMMTrainer(void)
118  : want_ngrams(true),
119  want_lexfreqs(true),
120  want_classfreqs(true),
121  want_flavors(true),
122  eos_tag("__$"),
123  last_was_eos(false)
124  {};
125 
 /** Destructor: empty body; members clean up through their own destructors. */
127  ~mootHMMTrainer(void) {};
129 
130 
131  /*------------------------------------------------------------*/
 /** \name Reset */
133 
134 
 /**
  * Clears the three frequency tables and restores the taster's default
  * rules. The want_* flags, eos_tag, ng and last_was_eos are left as they are.
  */
135  inline void clear(void)
136  {
137  lexfreqs.clear();
138  ngrams.clear();
139  lcfreqs.clear();
140  taster.set_default_rules();
141  };
143 
144  /*------------------------------------------------------------*/
 /**
  * \name Training entry points
  * Declared here, defined elsewhere. All return bool; presumably true
  * indicates success -- confirm in the implementation.
  */
146 
147 
 /** Train from a TokenReader (token input object) supplied by the caller. */
148  bool train_from_reader(TokenReader *reader);
149 
 /**
  * Train from a C stream.
  * \param in      input stream; defaults to stdin
  * \param srcname name for the source; defaults to "(unknown)"
  *                (presumably used in diagnostics -- confirm)
  */
151  bool train_from_stream(FILE *in=stdin, const string &srcname="(unknown)");
152 
 /** Train from the file named \a filename. */
154  bool train_from_file(const string &filename);
155 
 /**
  * Finish a training run.
  * NOTE(review): what finishing entails is not visible in this header.
  */
157  bool train_finish(void);
159 
160  /*------------------------------------------------------------*/
 /**
  * \name Per-event training hooks
  * Declared here, defined elsewhere.
  * NOTE(review): judging by the names, these handle training start-up,
  * beginning-of-sentence, a single token, and end-of-sentence respectively;
  * required call order is not documented here -- confirm in the implementation.
  */
162 
163 
164  void train_init(void);
165 
167  void train_bos(void);
168 
 /** \param curtok token to train on (a mootToken, passed by const reference) */
170  void train_token(const mootToken &curtok);
171 
173  void train_eos(void);
175 
176  /*------------------------------------------------------------*/
 /** \name Diagnostics */
178 
179 
 /**
  * Variadic reporting function taking a format string.
  * NOTE(review): presumably printf-style formatting; output destination is
  * not visible in this header.
  */
181  void carp(const char *fmt, ...);
182 
184 };
185 
186 moot_END_NAMESPACE
187 
188 #endif /* _moot_HMM_TRAINER_H_ */
High-level heuristic token classifier.
Definition: mootFlavor.h:62
HMM training data: lexical-class frequencies: raw.
set< mootTagString > TagSet
Definition: mootHMMTrainer.h:62
Abstract class for token input.
Definition: mootTokenIO.h:208
classes and utilities for regex-based token "flavor" heuristics
CountT NgramCount
Definition: mootNgrams.h:50
void clear(void)
Definition: mootNgrams.h:158
Class for storage and retrieval of raw lexical-class frequencies.
Definition: mootClassfreqs.h:44
Class for storage & retrieval of raw N-Gram frequencies.
Definition: mootNgrams.h:44
High-level token information object.
Definition: mootToken.h:96
Class for storage and retrieval of raw lexical frequencies.
Definition: mootLexfreqs.h:44
High-level class to gather training data for a mootHMM or mootCHMM.
Definition: mootHMMTrainer.h:49
void set_default_rules(void)
Abstract and native classes for I/O of moot::mootToken objects.
string mootTagString
Definition: mootToken.h:59
ProbT CountT
Definition: mootTypes.h:67
Definition: mootNgrams.h:80
HMM training data: n-gram frequencies: raw.