Main Page   Namespace List   Class Hierarchy   Alphabetical List   Compound List   File List   Namespace Members   Compound Members   File Members  

mootNgrams.h

Go to the documentation of this file.
00001 /* -*- Mode: C++ -*- */
00002 
00003 /*
00004    libmoot : moocow's part-of-speech tagging library
00005    Copyright (C) 2003-2004 by Bryan Jurish <moocow@ling.uni-potsdam.de>
00006 
00007    This program is free software; you can redistribute it and/or modify
00008    it under the terms of the GNU General Public License as published by
00009    the Free Software Foundation; either version 2 of the License, or
00010    (at your option) any later version.
00011 
00012    This program is distributed in the hope that it will be useful,
00013    but WITHOUT ANY WARRANTY; without even the implied warranty of
00014    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
00015    GNU General Public License for more details.
00016 
00017    You should have received a copy of the GNU General Public License
00018    along with this program; if not, write to the Free Software
00019    Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307 USA
00020 */
00021 
00022 /*============================================================================
00023  * File: mootNgrams.h
00024  * Author:  Bryan Jurish <moocow@ling.uni-potsdam.de>
00025  * Description:
00026  *    Class for storage & retrieval of N-Gram counts
00027  *============================================================================*/
00028 
00029 #ifndef _moot_NGRAMS_H
00030 #define _moot_NGRAMS_H
00031 
00032 #include <mootTypes.h>
00033 
00034 moot_BEGIN_NAMESPACE
00035 
00039 class mootNgrams {
00040 
00041 public:
00042   //------ public typedefs
00043 
00045   typedef CountT NgramCount;
00046 
00048   typedef map<mootTagString,NgramCount> TrigramTable;
00049 
00051   class BigramEntry {
00052   public:
00053     CountT       count;   
00054     TrigramTable  freqs;   
00055   public:
00056     BigramEntry(const CountT bg_count=0) : count(bg_count) {};
00057   };
00058 
00060   typedef map<mootTagString,BigramEntry> BigramTable;
00061   
00063   class UnigramEntry {
00064   public:
00065     CountT       count;   
00066     BigramTable  freqs;   
00067   public:
00068     UnigramEntry(const CountT ug_count=0) : count(ug_count) {};
00069   };
00070 
00072   typedef map<mootTagString,UnigramEntry> NgramTable;
00073 
00075   class Ngram : public deque<mootTagString> {
00076   public:
00078     Ngram(void) {};
00080     Ngram(const mootTagString &tag1) {
00081       push_back(tag1);
00082     };
00084     Ngram(const mootTagString &tag1, const mootTagString &tag2) {
00085       push_back(tag1);
00086       push_back(tag2);
00087     };
00089     Ngram(const mootTagString &tag1,
00090           const mootTagString &tag2,
00091           const mootTagString &tag3) {
00092       push_back(tag1);
00093       push_back(tag2);
00094       push_back(tag3);
00095     };
00096 
00098     ~Ngram(void) {
00099       clear();
00100     };
00101 
00102     /*----------------
00103      * Accessors
00104      */
00106     const mootTagString &tag1(void) const { return (*this)[0]; } ;
00108     const mootTagString &tag2(void) const { return (*this)[1]; };
00110     const mootTagString &tag3(void) const { return (*this)[2]; };
00111 
00112     /*----------------
00113      * Manipulators
00114      */
00116     void push(const mootTagString &tag_new=mootTagString("")) {
00117       if (size() >= 3) pop_front();
00118       push_back(tag_new);
00119     };
00120 
00122     string as_string(void) const {
00123       string s = "<";
00124       for (const_iterator i = begin(); i != end(); i++) {
00125         s.append(*i);
00126         s.push_back(',');
00127       }
00128       if (s.size() > 1) {
00129         s[s.size()-1] = '>';
00130       } else {
00131         s.push_back('>');
00132       }
00133       return s;
00134     };
00135   };
00136 
00137 public:
00138   //------ public data
00139   NgramTable  ngtable;  
00140   NgramCount  ugtotal;  
00142 public:
00143   //------ public methods
00145   mootNgrams(void) : ugtotal(0) {};
00146 
00148   ~mootNgrams() {
00149     clear(); 
00150   };
00151 
00152   //------ public methods: manipulation
00153 
00155   void clear(void) {
00156     ngtable.clear();
00157     ugtotal = 0;
00158   };
00159 
00160   //------ public methods: information
00162   size_t n_bigrams(void);
00163 
00165   size_t n_trigrams(void);
00166 
00170   inline void add_count(const mootTagString &tag, const NgramCount count)
00171   {
00172     ngtable[tag].count += count;
00173     ugtotal += count;
00174   };
00175 
00180   inline void add_count(const mootTagString &tag1,
00181                         const mootTagString &tag2,
00182                         const NgramCount count)
00183   {
00184     ngtable[tag1].freqs[tag2].count += count;
00185   };
00186 
00191   inline void add_count(const mootTagString &tag1,
00192                         const mootTagString &tag2,
00193                         const mootTagString &tag3,
00194                         const NgramCount count)
00195   {
00196     ngtable[tag1].freqs[tag2].freqs[tag3] += count;
00197   };
00198 
00203   inline void add_count(const Ngram &ngram, const NgramCount count)
00204   {
00205     switch (ngram.size()) {
00206     case 0:
00207       break;
00208     case 1:
00209       add_count(ngram[0],count);
00210       break;
00211     case 2:
00212       add_count(ngram[0],ngram[1],count);
00213       break;
00214     case 3:
00215       add_count(ngram[0],ngram[1],ngram[2],count);
00216       break;
00217     default:
00218       break;
00219     }
00220   };
00221 
00222 
00230   inline void add_counts(const Ngram &ngram, const NgramCount count)
00231   {
00232     size_t ngsize = ngram.size();
00233     if (ngsize < 1) return;
00234 
00235     NgramTable::iterator ngi1 = ngtable.find(ngram.tag1());
00236     if (ngi1 == ngtable.end()) {
00237       ngi1 = ngtable.insert(pair<mootTagString,UnigramEntry>(ngram.tag1(),UnigramEntry())).first;
00238     }
00239     ngi1->second.count += count;
00240     ugtotal += count;
00241 
00242     if (ngsize < 2) return;
00243     BigramTable::iterator ngi2 = ngi1->second.freqs.find(ngram.tag2());
00244     if (ngi2 == ngi1->second.freqs.end()) {
00245       ngi2 = ngi1->second.freqs.insert(pair<mootTagString,
00246                                             BigramEntry>  (ngram.tag2(),BigramEntry())).first;
00247     }
00248     ngi2->second.count += count;
00249 
00250     if (ngsize < 3) return;
00251     TrigramTable::iterator ngi3 = ngi2->second.freqs.find(ngram.tag3());
00252     if (ngi3 == ngi2->second.freqs.end()) {
00253       ngi2->second.freqs[ngram.tag3()] = count;
00254     } else {
00255       ngi3->second += count;
00256     }
00257   };
00258 
00259   //------ public methods: lookup
00260 
00262   inline const NgramCount lookup(const mootTagString &tag) const
00263   {
00264     NgramTable::const_iterator ugi = ngtable.find(tag);
00265     return ugi == ngtable.end() ? 0 : ugi->second.count;
00266   };
00267 
00269   inline const NgramCount lookup(const mootTagString &tag1, const mootTagString &tag2) const
00270   {
00271     NgramTable::const_iterator ugi = ngtable.find(tag1);
00272     if (ugi == ngtable.end()) return 0;
00273     BigramTable::const_iterator bgi = ugi->second.freqs.find(tag2);
00274     return bgi == ugi->second.freqs.end() ? 0 : bgi->second.count;
00275   };
00276 
00278   inline const NgramCount lookup(const mootTagString &tag1,
00279                                  const mootTagString &tag2,
00280                                  const mootTagString &tag3)
00281     const
00282   {
00283     NgramTable::const_iterator ugi = ngtable.find(tag1);
00284     if (ugi == ngtable.end()) return 0;
00285     BigramTable::const_iterator bgi = ugi->second.freqs.find(tag2);
00286     if (bgi == ugi->second.freqs.end()) return 0;
00287     TrigramTable::const_iterator tgi = bgi->second.freqs.find(tag3);
00288     return tgi == bgi->second.freqs.end() ? 0 : tgi->second;
00289   };
00290 
00291   //------ public methods: i/o
00292 
00294   bool load(const char *filename);
00295 
00297   bool load(FILE *file, const char *filename = NULL);
00298 
00300   bool save(const char *filename, bool compact=false);
00301 
00303   bool save(FILE *file, const char *filename = NULL, bool compact=false);
00304 };
00305 
00306 moot_END_NAMESPACE
00307 
00308 #endif /* _moot_NGRAMS_H */

Generated on Wed Jul 28 15:48:03 2004 for libmoot by doxygen 1.2.15