Main Page   Namespace List   Class Hierarchy   Alphabetical List   Compound List   File List   Namespace Members   Compound Members   File Members  

mootNgrams.h

Go to the documentation of this file.
00001 /* -*- Mode: C++ -*- */
00002 
00003 /*
00004    libmoot : moocow's part-of-speech tagging library
00005    Copyright (C) 2003-2005 by Bryan Jurish <moocow@ling.uni-potsdam.de>
00006 
00007    This library is free software; you can redistribute it and/or
00008    modify it under the terms of the GNU Lesser General Public
00009    License as published by the Free Software Foundation; either
00010    version 2.1 of the License, or (at your option) any later version.
00011    
00012    This library is distributed in the hope that it will be useful,
00013    but WITHOUT ANY WARRANTY; without even the implied warranty of
00014    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
00015    Lesser General Public License for more details.
00016    
00017    You should have received a copy of the GNU Lesser General Public
00018    License along with this library; if not, write to the Free Software
00019    Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
00020 */
00021 
00022 /*============================================================================
00023  * File: mootNgrams.h
00024  * Author:  Bryan Jurish <moocow@ling.uni-potsdam.de>
00025  * Description:
00026  *    Class for storage & retrieval of N-Gram counts
00027  *============================================================================*/
00028 
00029 #ifndef _moot_NGRAMS_H
00030 #define _moot_NGRAMS_H
00031 
00032 #include <mootTypes.h>
00033 #include <mootToken.h>
00034 
00035 moot_BEGIN_NAMESPACE
00036 
/**
 * Storage and retrieval of tag N-gram counts for N <= 3.
 *
 * Counts live in a three-level nested table: the first tag selects a
 * UnigramEntry, whose freqs table maps the second tag to a BigramEntry,
 * whose freqs table maps the third tag to a trigram count.
 *
 * NOTE(review): map, deque and string are used unqualified here; presumably
 * they are the std:: containers made visible through mootTypes.h, and
 * CountT / mootTagString are declared there as well -- confirm.
 */
00040 class mootNgrams {
00041 
00042 public:
00043   //------ public typedefs
00044 
        /// Type of every stored count (alias of CountT, declared outside this file).
00046   typedef CountT NgramCount;
00047 
        /// Innermost table level: third tag => trigram count.
00049   typedef map<mootTagString,NgramCount> TrigramTable;
00050 
        /// Entry for one (tag1,tag2) pair: its own count plus the trigrams extending it.
00052   class BigramEntry {
00053   public:
00054     CountT       count;   ///< count stored for the bigram itself
00055     TrigramTable  freqs;   ///< third tag => trigram count
00056   public:
          /// Sets count to bg_count (default 0); freqs starts out default-constructed.
00057     BigramEntry(const CountT bg_count=0) : count(bg_count) {};
00058   };
00059 
        /// Middle table level: second tag => BigramEntry.
00061   typedef map<mootTagString,BigramEntry> BigramTable;
00062   
        /// Entry for one first tag: its own count plus the bigrams extending it.
00064   class UnigramEntry {
00065   public:
00066     CountT       count;   ///< count stored for the unigram itself
00067     BigramTable  freqs;   ///< second tag => BigramEntry
00068   public:
          /// Sets count to ug_count (default 0); freqs starts out default-constructed.
00069     UnigramEntry(const CountT ug_count=0) : count(ug_count) {};
00070   };
00071 
        /// Outermost table level: first tag => UnigramEntry.
00073   typedef map<mootTagString,UnigramEntry> NgramTable;
00074 
        /// A sequence of tags, built on deque<mootTagString>.
        /// The constructors accept zero to three tags. Nothing here limits the
        /// length reached through the inherited deque interface; only push()
        /// keeps the sequence at three elements or fewer.
00076   class Ngram : public deque<mootTagString> {
00077   public:
          /// Creates an empty n-gram.
00079     Ngram(void) {};
          /// Creates the unigram <tag1>.
00081     Ngram(const mootTagString &tag1) {
00082       push_back(tag1);
00083     };
          /// Creates the bigram <tag1,tag2>.
00085     Ngram(const mootTagString &tag1, const mootTagString &tag2) {
00086       push_back(tag1);
00087       push_back(tag2);
00088     };
          /// Creates the trigram <tag1,tag2,tag3>.
00090     Ngram(const mootTagString &tag1,
00091           const mootTagString &tag2,
00092           const mootTagString &tag3) {
00093       push_back(tag1);
00094       push_back(tag2);
00095       push_back(tag3);
00096     };
00097 
          /// Empties the sequence on destruction.
          /// NOTE(review): presumably redundant if the base is std::deque, whose
          /// own destructor releases its elements -- confirm before removing.
00099     ~Ngram(void) {
00100       clear();
00101     };
00102 
00103     /*----------------
00104      * Accessors
00105      */
          /// Returns element 0. No size check is made here; the caller must
          /// ensure the n-gram holds at least one tag.
00107     const mootTagString &tag1(void) const { return (*this)[0]; } ;
          /// Returns element 1. No size check; requires at least two tags.
00109     const mootTagString &tag2(void) const { return (*this)[1]; };
          /// Returns element 2. No size check; requires at least three tags.
00111     const mootTagString &tag3(void) const { return (*this)[2]; };
00112 
00113     /*----------------
00114      * Manipulators
00115      */
          /// Appends tag_new (default: the empty tag ""). If the n-gram already
          /// holds three or more tags, the front tag is dropped first, so a full
          /// trigram shifts left by one position.
00117     void push(const mootTagString &tag_new=mootTagString("")) {
00118       if (size() >= 3) pop_front();
00119       push_back(tag_new);
00120     };
00121 
          /// Returns the tags joined by ',' and wrapped in angle brackets,
          /// e.g. "<a,b,c>". An empty n-gram yields "<>".
00123     string as_string(void) const {
00124       string s = "<";
00125       for (const_iterator i = begin(); i != end(); i++) {
00126         s.append(*i);
00127         s.push_back(',');
00128       }
            // At least one tag was appended: overwrite the trailing ',' with the
            // closing bracket. Otherwise s is still just "<", so append it.
00129       if (s.size() > 1) {
00130         s[s.size()-1] = '>';
00131       } else {
00132         s.push_back('>');
00133       }
00134       return s;
00135     };
00136   };
00137 
00138 public:
00139   //------ public data
00140   NgramTable  ngtable;  ///< root of the nested count tables, keyed by first tag
00141   NgramCount  ugtotal;  ///< running sum of all counts added at the unigram level
00143 public:
00144   //------ public methods
        /// Creates an empty count table with ugtotal set to 0.
00146   mootNgrams(void) : ugtotal(0) {};
00147 
        /// Calls clear() on destruction.
00149   ~mootNgrams() {
00150     clear(); 
00151   };
00152 
00153   //------ public methods: manipulation
00154 
        /// Removes every stored entry and resets ugtotal to 0.
00156   void clear(void) {
00157     ngtable.clear();
00158     ugtotal = 0;
00159   };
00160 
00161   //------ public methods: information
        /// Declared only; the definition is not in this header.
        /// NOTE(review): presumably returns the number of stored bigrams -- confirm.
00163   size_t n_bigrams(void);
00164 
        /// Declared only; the definition is not in this header.
        /// NOTE(review): presumably returns the number of stored trigrams -- confirm.
00166   size_t n_trigrams(void);
00167 
        /// Adds count to the unigram entry for tag and to ugtotal.
        /// The entry is created through operator[] if it does not exist yet.
00171   inline void add_count(const mootTagString &tag, const NgramCount count)
00172   {
00173     ngtable[tag].count += count;
00174     ugtotal += count;
00175   };
00176 
        /// Adds count to the bigram entry (tag1,tag2) only. Neither the unigram
        /// count of tag1 nor ugtotal is changed. Missing entries on the path are
        /// created through operator[].
00181   inline void add_count(const mootTagString &tag1,
00182                         const mootTagString &tag2,
00183                         const NgramCount count)
00184   {
00185     ngtable[tag1].freqs[tag2].count += count;
00186   };
00187 
        /// Adds count to the trigram (tag1,tag2,tag3) only. The unigram count,
        /// the bigram count and ugtotal are not changed. Missing entries on the
        /// path are created through operator[].
00192   inline void add_count(const mootTagString &tag1,
00193                         const mootTagString &tag2,
00194                         const mootTagString &tag3,
00195                         const NgramCount count)
00196   {
00197     ngtable[tag1].freqs[tag2].freqs[tag3] += count;
00198   };
00199 
        /// Adds count to exactly one table level, chosen by ngram.size():
        /// 1 => unigram, 2 => bigram, 3 => trigram. An empty n-gram or one with
        /// more than three tags is silently ignored.
00204   inline void add_count(const Ngram &ngram, const NgramCount count)
00205   {
00206     switch (ngram.size()) {
00207     case 0:
00208       break;
00209     case 1:
00210       add_count(ngram[0],count);
00211       break;
00212     case 2:
00213       add_count(ngram[0],ngram[1],count);
00214       break;
00215     case 3:
00216       add_count(ngram[0],ngram[1],ngram[2],count);
00217       break;
00218     default:
00219       break;
00220     }
00221   };
00222 
00223 
        /// Adds count to ngram and to each of its shorter prefixes. The unigram
        /// of the first tag (and ugtotal) is always updated, the bigram is
        /// updated if ngram has at least two tags, and the trigram if it has at
        /// least three. Tags beyond the third are ignored, and an empty ngram
        /// changes nothing. This differs from add_count(const Ngram&, ...),
        /// which touches only the level matching the n-gram's length.
00231   inline void add_counts(const Ngram &ngram, const NgramCount count)
00232   {
00233     size_t ngsize = ngram.size();
00234     if (ngsize < 1) return;
00235 
          // Unigram level: find the entry for the first tag, inserting a fresh
          // one when absent, then add to its count and to the grand total.
00236     NgramTable::iterator ngi1 = ngtable.find(ngram.tag1());
00237     if (ngi1 == ngtable.end()) {
00238       ngi1 = ngtable.insert(pair<mootTagString,UnigramEntry>(ngram.tag1(),UnigramEntry())).first;
00239     }
00240     ngi1->second.count += count;
00241     ugtotal += count;
00242 
          // Bigram level: same find-or-insert inside the unigram entry's table.
00243     if (ngsize < 2) return;
00244     BigramTable::iterator ngi2 = ngi1->second.freqs.find(ngram.tag2());
00245     if (ngi2 == ngi1->second.freqs.end()) {
00246       ngi2 = ngi1->second.freqs.insert(pair<mootTagString,
00247                                             BigramEntry>  (ngram.tag2(),BigramEntry())).first;
00248     }
00249     ngi2->second.count += count;
00250 
          // Trigram level: a new trigram starts at count, an existing one is
          // incremented by it.
00251     if (ngsize < 3) return;
00252     TrigramTable::iterator ngi3 = ngi2->second.freqs.find(ngram.tag3());
00253     if (ngi3 == ngi2->second.freqs.end()) {
00254       ngi2->second.freqs[ngram.tag3()] = count;
00255     } else {
00256       ngi3->second += count;
00257     }
00258   };
00259 
00260   //------ public methods: lookup
00261 
        /// Returns the stored unigram count for tag, or 0 if tag has no entry.
        /// Uses find(), so the table is never modified.
00263   inline const NgramCount lookup(const mootTagString &tag) const
00264   {
00265     NgramTable::const_iterator ugi = ngtable.find(tag);
00266     return ugi == ngtable.end() ? 0 : ugi->second.count;
00267   };
00268 
        /// Returns the stored bigram count for (tag1,tag2), or 0 if either
        /// level has no entry. The table is never modified.
00270   inline const NgramCount lookup(const mootTagString &tag1, const mootTagString &tag2) const
00271   {
00272     NgramTable::const_iterator ugi = ngtable.find(tag1);
00273     if (ugi == ngtable.end()) return 0;
00274     BigramTable::const_iterator bgi = ugi->second.freqs.find(tag2);
00275     return bgi == ugi->second.freqs.end() ? 0 : bgi->second.count;
00276   };
00277 
        /// Returns the stored trigram count for (tag1,tag2,tag3), or 0 if any
        /// level has no entry. The table is never modified.
00279   inline const NgramCount lookup(const mootTagString &tag1,
00280                                  const mootTagString &tag2,
00281                                  const mootTagString &tag3)
00282     const
00283   {
00284     NgramTable::const_iterator ugi = ngtable.find(tag1);
00285     if (ugi == ngtable.end()) return 0;
00286     BigramTable::const_iterator bgi = ugi->second.freqs.find(tag2);
00287     if (bgi == ugi->second.freqs.end()) return 0;
00288     TrigramTable::const_iterator tgi = bgi->second.freqs.find(tag3);
00289     return tgi == bgi->second.freqs.end() ? 0 : tgi->second;
00290   };
00291 
00292   //------ public methods: i/o
00293 
        /// Declared only; the definition is not in this header.
        /// NOTE(review): presumably loads counts from the named file and
        /// returns true on success -- confirm against the implementation.
00295   bool load(const char *filename);
00296 
        /// Declared only; the definition is not in this header.
        /// NOTE(review): presumably loads counts from an open stream, with
        /// filename (default NULL) used only for diagnostics -- confirm.
00298   bool load(FILE *file, const char *filename = NULL);
00299 
        /// Declared only; the definition is not in this header. The compact
        /// flag defaults to false.
        /// NOTE(review): presumably writes the counts to the named file; the
        /// effect of compact is not visible here -- confirm.
00301   bool save(const char *filename, bool compact=false);
00302 
        /// Declared only; the definition is not in this header. filename
        /// defaults to NULL and compact to false.
        /// NOTE(review): presumably writes the counts to an open stream -- confirm.
00304   bool save(FILE *file, const char *filename = NULL, bool compact=false);
00305 };
00306 
00307 moot_END_NAMESPACE
00308 
00309 #endif /* _moot_NGRAMS_H */

Generated on Mon Sep 11 16:10:33 2006 for libmoot by doxygen 1.2.18