00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022
00023
00024
00025
00026
00027
00028
00029 #ifndef _moot_NGRAMS_H
00030 #define _moot_NGRAMS_H
00031
00032 #include <mootTypes.h>
00033 #include <mootToken.h>
00034
00035 moot_BEGIN_NAMESPACE
00036
00040 class mootNgrams {
00041
00042 public:
00043
00044
00046 typedef CountT NgramCount;
00047
00049 typedef map<mootTagString,NgramCount> TrigramTable;
00050
00052 class BigramEntry {
00053 public:
00054 CountT count;
00055 TrigramTable freqs;
00056 public:
00057 BigramEntry(const CountT bg_count=0) : count(bg_count) {};
00058 };
00059
00061 typedef map<mootTagString,BigramEntry> BigramTable;
00062
00064 class UnigramEntry {
00065 public:
00066 CountT count;
00067 BigramTable freqs;
00068 public:
00069 UnigramEntry(const CountT ug_count=0) : count(ug_count) {};
00070 };
00071
00073 typedef map<mootTagString,UnigramEntry> NgramTable;
00074
00076 class Ngram : public deque<mootTagString> {
00077 public:
00079 Ngram(void) {};
00081 Ngram(const mootTagString &tag1) {
00082 push_back(tag1);
00083 };
00085 Ngram(const mootTagString &tag1, const mootTagString &tag2) {
00086 push_back(tag1);
00087 push_back(tag2);
00088 };
00090 Ngram(const mootTagString &tag1,
00091 const mootTagString &tag2,
00092 const mootTagString &tag3) {
00093 push_back(tag1);
00094 push_back(tag2);
00095 push_back(tag3);
00096 };
00097
00099 ~Ngram(void) {
00100 clear();
00101 };
00102
00103
00104
00105
00107 const mootTagString &tag1(void) const { return (*this)[0]; } ;
00109 const mootTagString &tag2(void) const { return (*this)[1]; };
00111 const mootTagString &tag3(void) const { return (*this)[2]; };
00112
00113
00114
00115
00117 void push(const mootTagString &tag_new=mootTagString("")) {
00118 if (size() >= 3) pop_front();
00119 push_back(tag_new);
00120 };
00121
00123 string as_string(void) const {
00124 string s = "<";
00125 for (const_iterator i = begin(); i != end(); i++) {
00126 s.append(*i);
00127 s.push_back(',');
00128 }
00129 if (s.size() > 1) {
00130 s[s.size()-1] = '>';
00131 } else {
00132 s.push_back('>');
00133 }
00134 return s;
00135 };
00136 };
00137
00138 public:
00139
00140 NgramTable ngtable;
00141 NgramCount ugtotal;
00143 public:
00144
00146 mootNgrams(void) : ugtotal(0) {};
00147
00149 ~mootNgrams() {
00150 clear();
00151 };
00152
00153
00154
00156 void clear(void) {
00157 ngtable.clear();
00158 ugtotal = 0;
00159 };
00160
00161
00163 size_t n_bigrams(void);
00164
00166 size_t n_trigrams(void);
00167
00171 inline void add_count(const mootTagString &tag, const NgramCount count)
00172 {
00173 ngtable[tag].count += count;
00174 ugtotal += count;
00175 };
00176
00181 inline void add_count(const mootTagString &tag1,
00182 const mootTagString &tag2,
00183 const NgramCount count)
00184 {
00185 ngtable[tag1].freqs[tag2].count += count;
00186 };
00187
00192 inline void add_count(const mootTagString &tag1,
00193 const mootTagString &tag2,
00194 const mootTagString &tag3,
00195 const NgramCount count)
00196 {
00197 ngtable[tag1].freqs[tag2].freqs[tag3] += count;
00198 };
00199
00204 inline void add_count(const Ngram &ngram, const NgramCount count)
00205 {
00206 switch (ngram.size()) {
00207 case 0:
00208 break;
00209 case 1:
00210 add_count(ngram[0],count);
00211 break;
00212 case 2:
00213 add_count(ngram[0],ngram[1],count);
00214 break;
00215 case 3:
00216 add_count(ngram[0],ngram[1],ngram[2],count);
00217 break;
00218 default:
00219 break;
00220 }
00221 };
00222
00223
00231 inline void add_counts(const Ngram &ngram, const NgramCount count)
00232 {
00233 size_t ngsize = ngram.size();
00234 if (ngsize < 1) return;
00235
00236 NgramTable::iterator ngi1 = ngtable.find(ngram.tag1());
00237 if (ngi1 == ngtable.end()) {
00238 ngi1 = ngtable.insert(pair<mootTagString,UnigramEntry>(ngram.tag1(),UnigramEntry())).first;
00239 }
00240 ngi1->second.count += count;
00241 ugtotal += count;
00242
00243 if (ngsize < 2) return;
00244 BigramTable::iterator ngi2 = ngi1->second.freqs.find(ngram.tag2());
00245 if (ngi2 == ngi1->second.freqs.end()) {
00246 ngi2 = ngi1->second.freqs.insert(pair<mootTagString,
00247 BigramEntry> (ngram.tag2(),BigramEntry())).first;
00248 }
00249 ngi2->second.count += count;
00250
00251 if (ngsize < 3) return;
00252 TrigramTable::iterator ngi3 = ngi2->second.freqs.find(ngram.tag3());
00253 if (ngi3 == ngi2->second.freqs.end()) {
00254 ngi2->second.freqs[ngram.tag3()] = count;
00255 } else {
00256 ngi3->second += count;
00257 }
00258 };
00259
00260
00261
00263 inline const NgramCount lookup(const mootTagString &tag) const
00264 {
00265 NgramTable::const_iterator ugi = ngtable.find(tag);
00266 return ugi == ngtable.end() ? 0 : ugi->second.count;
00267 };
00268
00270 inline const NgramCount lookup(const mootTagString &tag1, const mootTagString &tag2) const
00271 {
00272 NgramTable::const_iterator ugi = ngtable.find(tag1);
00273 if (ugi == ngtable.end()) return 0;
00274 BigramTable::const_iterator bgi = ugi->second.freqs.find(tag2);
00275 return bgi == ugi->second.freqs.end() ? 0 : bgi->second.count;
00276 };
00277
00279 inline const NgramCount lookup(const mootTagString &tag1,
00280 const mootTagString &tag2,
00281 const mootTagString &tag3)
00282 const
00283 {
00284 NgramTable::const_iterator ugi = ngtable.find(tag1);
00285 if (ugi == ngtable.end()) return 0;
00286 BigramTable::const_iterator bgi = ugi->second.freqs.find(tag2);
00287 if (bgi == ugi->second.freqs.end()) return 0;
00288 TrigramTable::const_iterator tgi = bgi->second.freqs.find(tag3);
00289 return tgi == bgi->second.freqs.end() ? 0 : tgi->second;
00290 };
00291
00292
00293
00295 bool load(const char *filename);
00296
00298 bool load(FILE *file, const char *filename = NULL);
00299
00301 bool save(const char *filename, bool compact=false);
00302
00304 bool save(FILE *file, const char *filename = NULL, bool compact=false);
00305 };
00306
00307 moot_END_NAMESPACE
00308
00309 #endif