00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022
00023
00024
00025
00026
00027
00028
00029 #ifndef _moot_NGRAMS_H
00030 #define _moot_NGRAMS_H
00031
00032 #include <mootTypes.h>
00033
00034 moot_BEGIN_NAMESPACE
00035
00039 class mootNgrams {
00040
00041 public:
00042
00043
00045 typedef CountT NgramCount;
00046
00048 typedef map<mootTagString,NgramCount> TrigramTable;
00049
00051 class BigramEntry {
00052 public:
00053 CountT count;
00054 TrigramTable freqs;
00055 public:
00056 BigramEntry(const CountT bg_count=0) : count(bg_count) {};
00057 };
00058
00060 typedef map<mootTagString,BigramEntry> BigramTable;
00061
00063 class UnigramEntry {
00064 public:
00065 CountT count;
00066 BigramTable freqs;
00067 public:
00068 UnigramEntry(const CountT ug_count=0) : count(ug_count) {};
00069 };
00070
00072 typedef map<mootTagString,UnigramEntry> NgramTable;
00073
00075 class Ngram : public deque<mootTagString> {
00076 public:
00078 Ngram(void) {};
00080 Ngram(const mootTagString &tag1) {
00081 push_back(tag1);
00082 };
00084 Ngram(const mootTagString &tag1, const mootTagString &tag2) {
00085 push_back(tag1);
00086 push_back(tag2);
00087 };
00089 Ngram(const mootTagString &tag1,
00090 const mootTagString &tag2,
00091 const mootTagString &tag3) {
00092 push_back(tag1);
00093 push_back(tag2);
00094 push_back(tag3);
00095 };
00096
00098 ~Ngram(void) {
00099 clear();
00100 };
00101
00102
00103
00104
00106 const mootTagString &tag1(void) const { return (*this)[0]; } ;
00108 const mootTagString &tag2(void) const { return (*this)[1]; };
00110 const mootTagString &tag3(void) const { return (*this)[2]; };
00111
00112
00113
00114
00116 void push(const mootTagString &tag_new=mootTagString("")) {
00117 if (size() >= 3) pop_front();
00118 push_back(tag_new);
00119 };
00120
00122 string as_string(void) const {
00123 string s = "<";
00124 for (const_iterator i = begin(); i != end(); i++) {
00125 s.append(*i);
00126 s.push_back(',');
00127 }
00128 if (s.size() > 1) {
00129 s[s.size()-1] = '>';
00130 } else {
00131 s.push_back('>');
00132 }
00133 return s;
00134 };
00135 };
00136
00137 public:
00138
00139 NgramTable ngtable;
00140 NgramCount ugtotal;
00142 public:
00143
00145 mootNgrams(void) : ugtotal(0) {};
00146
00148 ~mootNgrams() {
00149 clear();
00150 };
00151
00152
00153
00155 void clear(void) {
00156 ngtable.clear();
00157 ugtotal = 0;
00158 };
00159
00160
00162 size_t n_bigrams(void);
00163
00165 size_t n_trigrams(void);
00166
00170 inline void add_count(const mootTagString &tag, const NgramCount count)
00171 {
00172 ngtable[tag].count += count;
00173 ugtotal += count;
00174 };
00175
00180 inline void add_count(const mootTagString &tag1,
00181 const mootTagString &tag2,
00182 const NgramCount count)
00183 {
00184 ngtable[tag1].freqs[tag2].count += count;
00185 };
00186
00191 inline void add_count(const mootTagString &tag1,
00192 const mootTagString &tag2,
00193 const mootTagString &tag3,
00194 const NgramCount count)
00195 {
00196 ngtable[tag1].freqs[tag2].freqs[tag3] += count;
00197 };
00198
00203 inline void add_count(const Ngram &ngram, const NgramCount count)
00204 {
00205 switch (ngram.size()) {
00206 case 0:
00207 break;
00208 case 1:
00209 add_count(ngram[0],count);
00210 break;
00211 case 2:
00212 add_count(ngram[0],ngram[1],count);
00213 break;
00214 case 3:
00215 add_count(ngram[0],ngram[1],ngram[2],count);
00216 break;
00217 default:
00218 break;
00219 }
00220 };
00221
00222
00230 inline void add_counts(const Ngram &ngram, const NgramCount count)
00231 {
00232 size_t ngsize = ngram.size();
00233 if (ngsize < 1) return;
00234
00235 NgramTable::iterator ngi1 = ngtable.find(ngram.tag1());
00236 if (ngi1 == ngtable.end()) {
00237 ngi1 = ngtable.insert(pair<mootTagString,UnigramEntry>(ngram.tag1(),UnigramEntry())).first;
00238 }
00239 ngi1->second.count += count;
00240 ugtotal += count;
00241
00242 if (ngsize < 2) return;
00243 BigramTable::iterator ngi2 = ngi1->second.freqs.find(ngram.tag2());
00244 if (ngi2 == ngi1->second.freqs.end()) {
00245 ngi2 = ngi1->second.freqs.insert(pair<mootTagString,
00246 BigramEntry> (ngram.tag2(),BigramEntry())).first;
00247 }
00248 ngi2->second.count += count;
00249
00250 if (ngsize < 3) return;
00251 TrigramTable::iterator ngi3 = ngi2->second.freqs.find(ngram.tag3());
00252 if (ngi3 == ngi2->second.freqs.end()) {
00253 ngi2->second.freqs[ngram.tag3()] = count;
00254 } else {
00255 ngi3->second += count;
00256 }
00257 };
00258
00259
00260
00262 inline const NgramCount lookup(const mootTagString &tag) const
00263 {
00264 NgramTable::const_iterator ugi = ngtable.find(tag);
00265 return ugi == ngtable.end() ? 0 : ugi->second.count;
00266 };
00267
00269 inline const NgramCount lookup(const mootTagString &tag1, const mootTagString &tag2) const
00270 {
00271 NgramTable::const_iterator ugi = ngtable.find(tag1);
00272 if (ugi == ngtable.end()) return 0;
00273 BigramTable::const_iterator bgi = ugi->second.freqs.find(tag2);
00274 return bgi == ugi->second.freqs.end() ? 0 : bgi->second.count;
00275 };
00276
00278 inline const NgramCount lookup(const mootTagString &tag1,
00279 const mootTagString &tag2,
00280 const mootTagString &tag3)
00281 const
00282 {
00283 NgramTable::const_iterator ugi = ngtable.find(tag1);
00284 if (ugi == ngtable.end()) return 0;
00285 BigramTable::const_iterator bgi = ugi->second.freqs.find(tag2);
00286 if (bgi == ugi->second.freqs.end()) return 0;
00287 TrigramTable::const_iterator tgi = bgi->second.freqs.find(tag3);
00288 return tgi == bgi->second.freqs.end() ? 0 : tgi->second;
00289 };
00290
00291
00292
00294 bool load(const char *filename);
00295
00297 bool load(FILE *file, const char *filename = NULL);
00298
00300 bool save(const char *filename, bool compact=false);
00301
00303 bool save(FILE *file, const char *filename = NULL, bool compact=false);
00304 };
00305
00306 moot_END_NAMESPACE
00307
00308 #endif