mootNgrams.h
Go to the documentation of this file.
1 /* -*- Mode: C++ -*- */
2 
3 /*
4  libmoot : moocow's part-of-speech tagging library
5  Copyright (C) 2003-2009 by Bryan Jurish <moocow@cpan.org>
6 
7  This library is free software; you can redistribute it and/or
8  modify it under the terms of the GNU Lesser General Public
9  License as published by the Free Software Foundation; either
10  version 3 of the License, or (at your option) any later version.
11 
12  This library is distributed in the hope that it will be useful,
13  but WITHOUT ANY WARRANTY; without even the implied warranty of
14  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15  Lesser General Public License for more details.
16 
17  You should have received a copy of the GNU Lesser General Public
18  License along with this library; if not, write to the Free Software
19  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
20 */
21 
22 /*============================================================================
23  * File: mootNgrams.h
24  * Author: Bryan Jurish <moocow@cpan.org>
25  * Description:
26  * Class for storage & retrieval of N-Gram counts
27  *============================================================================*/
28 
34 #ifndef _moot_NGRAMS_H
35 #define _moot_NGRAMS_H
36 
37 #include <mootToken.h>
38 
39 moot_BEGIN_NAMESPACE
40 
44 class mootNgrams {
45 
46 public:
47  //------ public typedefs
48 
/** Numeric type used for every n-gram count in this class (an alias for CountT). */
50  typedef CountT NgramCount;
51 
/** Innermost count table: maps a tag string to a count (used for the third tag of a trigram). */
53  typedef map<mootTagString,NgramCount> TrigramTable;
54 
56  class BigramEntry {
57  public:
59  TrigramTable freqs;
60  public:
61  BigramEntry(const CountT bg_count=0) : count(bg_count) {};
62  };
63 
/** Second-level table: maps a tag string to the BigramEntry stored under it. */
65  typedef map<mootTagString,BigramEntry> BigramTable;
66 
68  class UnigramEntry {
69  public:
71  BigramTable freqs;
72  public:
73  UnigramEntry(const CountT ug_count=0) : count(ug_count) {};
74  };
75 
/** Top-level table: maps a tag string to the UnigramEntry stored under it. */
77  typedef map<mootTagString,UnigramEntry> NgramTable;
78 
80  class Ngram : public deque<mootTagString> {
81  public:
83  Ngram(void) {};
85  Ngram(const mootTagString &tag1) {
86  push_back(tag1);
87  };
89  Ngram(const mootTagString &tag1, const mootTagString &tag2) {
90  push_back(tag1);
91  push_back(tag2);
92  };
94  Ngram(const mootTagString &tag1, const mootTagString &tag2, const mootTagString &tag3) {
95  push_back(tag1);
96  push_back(tag2);
97  push_back(tag3);
98  };
99 
101  ~Ngram(void) {
102  clear();
103  };
104 
105  /*----------------
106  * Accessors
107  */
109  const mootTagString &tag1(void) const { return (*this)[0]; } ;
111  const mootTagString &tag2(void) const { return (*this)[1]; };
113  const mootTagString &tag3(void) const { return (*this)[2]; };
114 
115  /*----------------
116  * Manipulators
117  */
119  void push(const mootTagString &tag_new=mootTagString("")) {
120  if (size() >= 3) pop_front();
121  push_back(tag_new);
122  };
123 
125  string as_string(void) const {
126  string s = "<";
127  for (const_iterator i = begin(); i != end(); i++) {
128  s.append(*i);
129  s.push_back(',');
130  }
131  if (s.size() > 1) {
132  s[s.size()-1] = '>';
133  } else {
134  s.push_back('>');
135  }
136  return s;
137  };
138  };
139 
140 public:
141  //------ public data
/** Nested count table: first tag -> UnigramEntry (unigram count plus the bigram table below it). */
142  NgramTable ngtable;
/** Running sum of all counts added at the unigram level; reset to 0 by clear(). */
143  NgramCount ugtotal;
145 public:
146  //------ public methods: constructors etc.
148  mootNgrams(void) : ugtotal(0) {};
149 
152  clear();
153  };
154 
155  //------ public methods: manipulation
156 
158  void clear(void) {
159  ngtable.clear();
160  ugtotal = 0;
161  };
162 
163  //------ public methods: information
164 
166  inline size_t n_unigrams(void) const
167  { return ngtable.size(); };
168 
/** Declared here; the definition is not part of this header. NOTE(review): presumably the number of distinct bigrams stored -- confirm against the implementation. */
170  size_t n_bigrams(void) const;
171 
/** Declared here; the definition is not part of this header. NOTE(review): presumably the number of distinct trigrams stored -- confirm against the implementation. */
173  size_t n_trigrams(void) const;
174 
175 
176  //------ public methods: counting
180  inline void add_count(const mootTagString &tag, const NgramCount count)
181  {
182  ngtable[tag].count += count;
183  ugtotal += count;
184  };
185 
190  inline void add_count(const mootTagString &tag1, const mootTagString &tag2, const NgramCount count)
191  {
192  ngtable[tag1].freqs[tag2].count += count;
193  };
194 
199  inline void add_count(const mootTagString &tag1, const mootTagString &tag2, const mootTagString &tag3, const NgramCount count)
200  {
201  ngtable[tag1].freqs[tag2].freqs[tag3] += count;
202  };
203 
208  inline void add_count(const Ngram &ngram, const NgramCount count)
209  {
210  switch (ngram.size()) {
211  case 0:
212  break;
213  case 1:
214  add_count(ngram[0],count);
215  break;
216  case 2:
217  add_count(ngram[0],ngram[1],count);
218  break;
219  case 3:
220  add_count(ngram[0],ngram[1],ngram[2],count);
221  break;
222  default:
223  //-- max: bash to trigrams
224  add_count(ngram[ngram.size()-3],ngram[ngram.size()-2],ngram[ngram.size()-1],count);
225  break;
226  }
227  };
228 
229 
237  inline void add_counts(const Ngram &ngram, const NgramCount count)
238  {
239  size_t ngsize = ngram.size();
240  if (ngsize < 1) return;
241 
242  NgramTable::iterator ngi1 = ngtable.find(ngram.tag1());
243  if (ngi1 == ngtable.end()) {
244  ngi1 = ngtable.insert(pair<mootTagString,UnigramEntry>(ngram.tag1(),UnigramEntry())).first;
245  }
246  ngi1->second.count += count;
247  ugtotal += count;
248 
249  if (ngsize < 2) return;
250  BigramTable::iterator ngi2 = ngi1->second.freqs.find(ngram.tag2());
251  if (ngi2 == ngi1->second.freqs.end()) {
252  ngi2 = ngi1->second.freqs.insert(pair<mootTagString,
253  BigramEntry> (ngram.tag2(),BigramEntry())).first;
254  }
255  ngi2->second.count += count;
256 
257  if (ngsize < 3) return;
258  TrigramTable::iterator ngi3 = ngi2->second.freqs.find(ngram.tag3());
259  if (ngi3 == ngi2->second.freqs.end()) {
260  ngi2->second.freqs[ngram.tag3()] = count;
261  } else {
262  ngi3->second += count;
263  }
264  };
265 
266  //------ public methods: lookup
267 
269  inline const NgramCount lookup(const mootTagString &tag) const
270  {
271  NgramTable::const_iterator ugi = ngtable.find(tag);
272  return ugi == ngtable.end() ? 0 : ugi->second.count;
273  };
274 
276  inline const NgramCount lookup(const mootTagString &tag1, const mootTagString &tag2) const
277  {
278  NgramTable::const_iterator ugi = ngtable.find(tag1);
279  if (ugi == ngtable.end()) return 0;
280  BigramTable::const_iterator bgi = ugi->second.freqs.find(tag2);
281  return bgi == ugi->second.freqs.end() ? 0 : bgi->second.count;
282  };
283 
285  inline const NgramCount lookup(const mootTagString &tag1, const mootTagString &tag2, const mootTagString &tag3) const
286  {
287  NgramTable::const_iterator ugi = ngtable.find(tag1);
288  if (ugi == ngtable.end()) return 0;
289  BigramTable::const_iterator bgi = ugi->second.freqs.find(tag2);
290  if (bgi == ugi->second.freqs.end()) return 0;
291  TrigramTable::const_iterator tgi = bgi->second.freqs.find(tag3);
292  return tgi == bgi->second.freqs.end() ? 0 : tgi->second;
293  };
294 
295  //------ public methods: i/o
296 
/** Declared here; the definition is not part of this header. NOTE(review): presumably reads n-gram counts from the file named \a filename and returns true on success -- confirm against the implementation. */
298  bool load(const char *filename);
299 
/** Declared here; the definition is not part of this header. \a filename defaults to NULL. NOTE(review): presumably reads counts from the open stream \a file, with \a filename used only for diagnostics -- confirm against the implementation. */
301  bool load(FILE *file, const char *filename = NULL);
302 
/** Declared here; the definition is not part of this header. \a compact defaults to false. NOTE(review): presumably writes the counts to the file named \a filename and returns true on success; the effect of \a compact is not visible here -- confirm against the implementation. */
304  bool save(const char *filename, bool compact=false);
305 
/** Declared here; the definition is not part of this header. \a filename defaults to NULL and \a compact to false. NOTE(review): presumably writes the counts to the open stream \a file -- confirm against the implementation. */
307  bool save(FILE *file, const char *filename = NULL, bool compact=false);
308 };
309 
310 moot_END_NAMESPACE
311 
312 #endif /* _moot_NGRAMS_H */
map< mootTagString, BigramEntry > BigramTable
Definition: mootNgrams.h:65
string as_string(void) const
Definition: mootNgrams.h:125
map< mootTagString, NgramCount > TrigramTable
Definition: mootNgrams.h:53
NgramCount ugtotal
Definition: mootNgrams.h:143
Ngram(const mootTagString &tag1)
Definition: mootNgrams.h:85
void add_count(const mootTagString &tag, const NgramCount count)
Definition: mootNgrams.h:180
BigramTable freqs
Definition: mootNgrams.h:71
CountT count
Definition: mootNgrams.h:58
classes and utilities for tokens and associated analyses
mootNgrams(void)
Definition: mootNgrams.h:148
CountT count
Definition: mootNgrams.h:70
NgramTable ngtable
Definition: mootNgrams.h:142
CountT NgramCount
Definition: mootNgrams.h:50
void clear(void)
Definition: mootNgrams.h:158
const mootTagString & tag2(void) const
Definition: mootNgrams.h:111
void push(const mootTagString &tag_new=mootTagString(""))
Definition: mootNgrams.h:119
Definition: mootNgrams.h:56
Definition: mootNgrams.h:68
Class for storage & retrieval of raw N-Gram frequencies.
Definition: mootNgrams.h:44
Ngram(void)
Definition: mootNgrams.h:83
size_t n_unigrams(void) const
Definition: mootNgrams.h:166
TrigramTable freqs
Definition: mootNgrams.h:59
const NgramCount lookup(const mootTagString &tag1, const mootTagString &tag2) const
Definition: mootNgrams.h:276
void add_count(const Ngram &ngram, const NgramCount count)
Definition: mootNgrams.h:208
BigramEntry(const CountT bg_count=0)
Definition: mootNgrams.h:61
map< mootTagString, UnigramEntry > NgramTable
Definition: mootNgrams.h:77
UnigramEntry(const CountT ug_count=0)
Definition: mootNgrams.h:73
string mootTagString
Definition: mootToken.h:59
ProbT CountT
Definition: mootTypes.h:67
~mootNgrams()
Definition: mootNgrams.h:151
Definition: mootNgrams.h:80
~Ngram(void)
Definition: mootNgrams.h:101
const NgramCount lookup(const mootTagString &tag1, const mootTagString &tag2, const mootTagString &tag3) const
Definition: mootNgrams.h:285
Ngram(const mootTagString &tag1, const mootTagString &tag2)
Definition: mootNgrams.h:89
Ngram(const mootTagString &tag1, const mootTagString &tag2, const mootTagString &tag3)
Definition: mootNgrams.h:94
void add_count(const mootTagString &tag1, const mootTagString &tag2, const NgramCount count)
Definition: mootNgrams.h:190
const mootTagString & tag1(void) const
Definition: mootNgrams.h:109
const NgramCount lookup(const mootTagString &tag) const
Definition: mootNgrams.h:269
const mootTagString & tag3(void) const
Definition: mootNgrams.h:113
void add_count(const mootTagString &tag1, const mootTagString &tag2, const mootTagString &tag3, const NgramCount count)
Definition: mootNgrams.h:199
void add_counts(const Ngram &ngram, const NgramCount count)
Definition: mootNgrams.h:237