Main Page | Directories | File List

mootTrieVector.h

Go to the documentation of this file.
00001 /* -*- Mode: C++ -*- */
00002 
00003 /*
00004    libmoot : moocow's part-of-speech tagging library
00005    Copyright (C) 2003-2005 by Bryan Jurish <moocow@ling.uni-potsdam.de>
00006 
00007    This library is free software; you can redistribute it and/or
00008    modify it under the terms of the GNU Lesser General Public
00009    License as published by the Free Software Foundation; either
00010    version 2.1 of the License, or (at your option) any later version.
00011    
00012    This library is distributed in the hope that it will be useful,
00013    but WITHOUT ANY WARRANTY; without even the implied warranty of
00014    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
00015    Lesser General Public License for more details.
00016    
00017    You should have received a copy of the GNU Lesser General Public
00018    License along with this library; if not, write to the Free Software
00019    Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
00020 */
00021 
00022 /*--------------------------------------------------------------------------
00023  * File: TrieVector.h
00024  * Author: Bryan Jurish <moocow@ling.uni-potsdam.de>
00025  * Description:
00026  *   + moocow's PoS tagger : tries
00027  *--------------------------------------------------------------------------*/
00028 
00029 #ifndef MOOT_TRIE_VECTOR_H
00030 #define MOOT_TRIE_VECTOR_H
00031 
00032 #undef MOOT_TRIE_VECTOR_DEBUG
00033 
00034 #ifdef MOOT_TRIE_VECTOR_DEBUG
00035 # include <stdio.h>
00036 # include <stdlib.h>
00037 #endif
00038 
00039 #include <ctype.h>
00040 #include <vector>
00041 #include <string>
00042 #include <map>
00043 
00044 namespace moot {
00045 
00046 //======================================================================
00047 // TrieVectorNodeBase
00048 struct TrieVectorNodeBase
00049 {
00050   size_t mother;  
00051   size_t mindtr;  
00052 
00053   TrieVectorNodeBase(size_t mother_index=0, size_t mindtr_index=0)
00054     : mother(mother_index), mindtr(mindtr_index)
00055   {};
00056 };
00057 
00058 //======================================================================
00059 // TrieVectorNode
00061 template <typename DataT, typename CharT = char, typename UCharT = unsigned char>
00062 struct TrieVectorNode : public TrieVectorNodeBase
00063 {
00064   typedef DataT                                          data_type;
00065   typedef CharT                                          char_type;
00066   typedef UCharT                                         uchar_type;
00067   typedef TrieVectorNode<data_type,char_type,uchar_type>  node_type;
00068 
00069   CharT   label; 
00070   UCharT  ndtrs; 
00071   DataT   data;  
00072 
00073   TrieVectorNode(size_t mother_index=0,
00074                 size_t mindtr_index=0,
00075                 CharT  node_label=0,
00076                 UCharT node_ndtrs=0)
00077     : TrieVectorNodeBase(mother_index, mindtr_index),
00078       label(node_label),
00079       ndtrs(node_ndtrs)
00080   {};
00081 
00082   TrieVectorNode(size_t mother_index,
00083                 size_t mindtr_index,
00084                 CharT  node_label,
00085                 UCharT node_ndtrs,
00086                 const DataT  &node_data)
00087     : TrieVectorNodeBase(mother_index, mindtr_index),
00088       label(node_label),
00089       ndtrs(node_ndtrs),
00090       data(node_data)
00091   {};
00092 
00093   inline bool operator< (const TrieVectorNode &x) const
00094   { return mother < x.mother || label < x.label; };
00095 
00096   inline bool operator<= (const TrieVectorNode &x) const
00097   { return mother <= x.mother || label <= x.label; };
00098 
00099   inline bool operator== (const TrieVectorNode &x) const
00100   { return mother==x.mother && label==x.label; };
00101 };
00102 
00103 
00104 //======================================================================
00105 // TrieVectorBase
00107 class TrieVectorBase {
00108 public:
00110   typedef size_t NodeId;
00111 
00112 public:
00114   static const NodeId NoNode   = (size_t)-1;
00115 
00117   static const size_t NoMaxLen = (size_t)-1;
00118 
00119 public:
00120   size_t   trie_maxlen;        
00121   bool     trie_use_case;      
00122 
00123 public:
00124   TrieVectorBase(size_t maxlen=NoMaxLen, bool use_case=false)
00125     : trie_maxlen(maxlen),
00126       trie_use_case(use_case)
00127   {};
00128 
00129 }; //-- /TrieVectorBase
00130 
00131 //======================================================================
00132 // TrieVector
00133 
00146 template<class DataT, typename CharT = char, typename UCharT = unsigned char>
00147 class TrieVector
00148   : public TrieVectorBase,
00149     public std::vector<TrieVectorNode<DataT,CharT,UCharT> >
00150 {
00151 public:
00152   //--------------------------------------------------------------------
00153   // TrieVector: Types
00154   typedef DataT                              data_type;
00155   typedef CharT                              char_type;
00156   typedef UCharT                             uchar_type;
00157 
00158   typedef
00159      TrieVector<data_type,char_type,uchar_type>
00160      trie_type;
00161 
00162   typedef
00163      TrieVectorNode<data_type,char_type,uchar_type>
00164      node_type;
00165 
00166   typedef std::vector<node_type>                   vector_type;
00167 
00168   typedef typename vector_type::iterator           iterator;
00169   typedef typename vector_type::const_iterator     const_iterator;
00170 
00171   //-- moo: gcc-3.4 don't like this
00172   //typedef std::string<char_type>                     string_type;
00173   typedef std::string                                  string_type;
00174 
00175   typedef typename string_type::iterator               string_iterator;
00176   typedef typename string_type::const_iterator         const_string_iterator;
00177   typedef typename string_type::reverse_iterator       reverse_string_iterator;
00178   typedef typename string_type::const_reverse_iterator const_reverse_string_iterator;
00179 
00180   typedef std::map<string_type,NodeId>                 map_type;
00181   typedef typename map_type::iterator                  map_iterator;
00182   typedef typename map_type::const_iterator            const_map_iterator;
00183 
00184 public:
00185   //--------------------------------------------------------------------
00186   // TrieVector: Data
00187   map_type  trie_pending;       
00188   data_type trie_default_data;  
00189 
00190 public:
00191   //--------------------------------------------------------------------
00192   // TrieVector: Methods
00193 
00194   //--------------------------------------------------
00196 
00197 
00198   TrieVector(size_t max_len=NoMaxLen, bool use_case=false)
00199     : TrieVectorBase(max_len,use_case)
00200   {};
00201 
00203   ~TrieVector(void) {};
00204 
00206   inline void clear(void)
00207   {
00208     vector_type::clear();
00209     trie_pending.clear();
00210     //vector_type::push_back(node_type(0,0,0,0)); //-- add root: (mom,mindtr,label,ndtrs)
00211   };
00213 
00214   //--------------------------------------------------
00216 
00217 
00218   inline const size_t &maxlen(void) const
00219   { return trie_maxlen; };
00220 
00222   inline size_t &maxlen(void)
00223   { return trie_maxlen; };
00224 
00226   inline bool compiled(void) const
00227   { return !trie_pending.empty(); };
00228 
00230   inline void ensure_compiled(void)
00231   { if (!compiled()) compile(); };
00232 
00234   inline const DataT &default_data(void) const
00235   { return trie_default_data; };
00236 
00238   inline DataT &default_data(void)
00239   { return trie_default_data; };
00241 
00242   //--------------------------------------------------
00244 
00245 
00247   inline string_type trie_canonicalize(string_type &s) const
00248   {
00249     if (!trie_use_case) {
00250       for (string_iterator si = s.begin(); si != s.end(); si++) {
00251         *si = tolower(*si);
00252       }
00253     }
00254     return s;
00255   };
00256 
00258   inline void trie_key(const string_type &s,
00259                        const size_t max_len,
00260                        string_type &dst)
00261     const
00262   {
00263     dst.assign(s,0,max_len);
00264     trie_canonicalize(dst);
00265   };
00266 
00268   inline string_type trie_key(const string_type &s, const size_t max_len)
00269     const
00270   { 
00271     string_type key;
00272     trie_key(s,max_len,key);
00273     return key;
00274   };
00275 
00277   inline string_type trie_key(const string_type &s) const
00278   { return trie_key(s,trie_maxlen); };
00279 
00280 
00282   inline void trie_rkey(const string_type &s,
00283                         const size_t max_len,
00284                         string_type &dst)
00285     const
00286   {
00287     dst.assign(s.rbegin(), s.rbegin() + (max_len > s.size() ?
00288                                          s.size() :
00289                                          max_len));
00290     trie_canonicalize(dst);
00291   };
00292 
00294   inline string_type trie_rkey(const string_type &s, size_t max_len) const
00295   {
00296     string_type key;
00297     trie_rkey(s, max_len, key);
00298     return key;
00299   };
00300   
00302   inline string_type trie_rkey(const string_type &s) const
00303   { return trie_rkey(s,trie_maxlen); };
00305 
00306   //--------------------------------------------------
00308 
00309 
00310   inline void trie_insert(const string_type &s, size_t max_len)
00311   { trie_pending[trie_key(s,max_len)] = 0; };
00312 
00314   inline void trie_insert(const string_type &s)
00315   { trie_pending[trie_key(s,trie_maxlen)] = 0; };
00316 
00318   inline void trie_rinsert(const string_type &s, size_t max_len)
00319   { trie_pending[trie_rkey(s,max_len)] = 0; };
00320 
00322   inline void trie_rinsert(const string_type &s)
00323   { trie_pending[trie_rkey(s,trie_maxlen)] = 0; };
00325 
00326 
00327   //--------------------------------------------------
00329 
00330 
00339   inline iterator find_dtr(const node_type &from, CharT label)
00340   {
00341     UCharT    dn;
00342     iterator  di;
00343     if (!trie_use_case) label = tolower(label);
00344     for (dn=0, di=this->begin()+from.mindtr; di != this->end() && dn < from.ndtrs; di++, dn++) {
00345       if (di->label == label) return di;
00346     }
00347     return this->end();
00348   };
00349 
00351   inline const_iterator find_dtr(const node_type &from, CharT label) const
00352   {
00353     UCharT         dn;
00354     const_iterator di;
00355     if (!trie_use_case) label = tolower(label);
00356     for (dn=0, di=this->begin()+from.mindtr; di != this->end() && dn < from.ndtrs; di++, dn++) {
00357       if (di->label == label) return di;
00358     }
00359     return this->end();
00360   };
00361 
00363   inline NodeId find_dtr_id(NodeId fromid, CharT label) const
00364   {
00365     const_iterator di = find_dtr(*(this->begin()+fromid), label);
00366     return (di==this->end() ? NoNode : (di-this->begin()));
00367   };
00368 
00369 
00379   inline iterator first_dtr(const node_type &from)
00380   { return ( from.ndtrs == 0 ? this->end() : (this->begin()+from.mindtr) ); };
00381 
00383   inline const_iterator first_dtr(const node_type &from) const
00384   { return ( from.ndtrs == 0 ? this->end() : (this->begin()+from.mindtr) ); };
00385 
00395   inline iterator find_mother(const node_type &to)
00396   { return (to.mother == NoNode ? this->end() : (this->begin()+to.mother)); };
00397 
00399   inline const_iterator find_mother(const node_type &to) const
00400   { return (to.mother == NoNode ? this->end() : (this->begin()+to.mother)); };
00401 
00403   inline NodeId find_mother_id(NodeId toid) const
00404   { return (this->begin()+toid)->mother; };
00405 
00407   inline string_type node_rstring(const node_type &node) const
00408   {
00409     if (node.mother == NoNode) return string_type();
00410     string_type s(1, node.label);
00411     const_iterator mi;
00412     for (mi=find_mother(node); mi != this->end() && mi->mother != NoNode; mi=find_mother(*mi)) {
00413       s.push_back(mi->label);
00414     }
00415     return s;
00416   };
00417 
00419   inline string_type node_rstring(NodeId nodeid) const
00420   { return node_rstring(*(this->begin()+nodeid)); };
00421 
00423   inline string_type node_string(const node_type &node) const
00424   {
00425     string_type s = node_rstring(node);
00426     reverse(s.begin(),s.end());
00427     return s;
00428   };
00429 
00431   inline string_type node_string(NodeId nodeid) const
00432   { return node_string(*(this->begin()+nodeid)); };
00433 
00434 
00436   inline size_t node_depth(const node_type &node) const
00437   {
00438     size_t         depth = 0;
00439     const_iterator mi;
00440     for (mi=find_mother(node); mi != this->end() && mi->mother != NoNode; mi=find_mother(*mi)) {
00441       ++depth;
00442     }
00443     return depth;
00444   };
00445 
00447   inline size_t node_depth(NodeId nodeid) const
00448   { return node_depth(*(this->begin()+nodeid)); };
00450 
00451 
00452   //--------------------------------------------------
00454 
00455 
00456   inline void compile(void)
00457   {
00458     vector_type::clear();
00459     //-- add root: (mom,mindtr,label,ndtrs,data)
00460     push_back(node_type(NoNode,NoNode,0,0,trie_default_data));
00461 
00462     map_iterator       pi;
00463     size_t             pos;
00464     bool               changed;
00465     char_type          dlabel;
00466     NodeId             dnodid;
00467 
00468     //-- foreach character position @pos
00469     for (pos=0, changed=true; pos < trie_maxlen && changed; pos++) {
00470       changed = false;
00471 
00472       //-- foreach pair *pi = (pending-key,node)
00473       for (pi=trie_pending.begin(); pi != trie_pending.end(); pi++) {
00474         const string_type &kstr   = pi->first;
00475         NodeId            &knodid = pi->second;
00476         if (kstr.size() <= pos) continue;           //-- we've exhausted this string
00477 
00478         dlabel           = kstr[pos];                  //-- get daughter-label
00479         dnodid           = find_dtr_id(knodid,dlabel); //-- check for extant daughter
00480 
00481         if (dnodid == NoNode) {                     //-- Ye Olde Guttes: add a daughter
00482           dnodid = vector_type::size();
00483 
00484           //reserve(dnodid);
00485           push_back(node_type(knodid,               // (mom,
00486                               NoNode,               //  , mindtr
00487                               dlabel,               //  , label
00488                               0,                    //  , ndtrs
00489                               trie_default_data));  //  , data)
00490 
00491           node_type &mnode = this->operator[](knodid);    //-- get mother-node
00492           ++mnode.ndtrs;                            //-- update num/dtrs for mom
00493 
00494           if (mnode.mindtr == NoNode)
00495             mnode.mindtr = dnodid;                  //-- update min-dtr  for mom
00496           
00497           changed = true;
00498         }
00499 
00500         knodid = dnodid;                            //-- update "current" node in pending map
00501       }
00502     }
00503     //-- all pending arcs have been added: clear 'em
00504     trie_pending.clear();
00505   };
00507 
00508 
00509   //--------------------------------------------------
00511 
00512 
00522   inline iterator find_longest(const string_type &s,
00523                                size_t *matchlen=NULL)
00524   {
00525     const_string_iterator si;
00526     iterator              di, i = this->begin();
00527     size_t                pos;
00528 
00529     for (si  = s.begin() ,  di  = i        , pos=0;
00530          si != s.end()                    && pos < trie_maxlen;
00531          si++            ,   i  = di       , pos++)
00532       {
00533         di = find_dtr(*di,*si);
00534         if (di==this->end()) break;
00535       }
00536     if (matchlen) *matchlen = pos;
00537     return i;
00538   };
00539 
00541   inline const_iterator find_longest(const string_type &s,
00542                                      size_t *matchlen=NULL)
00543     const
00544   {
00545     const_string_iterator si;
00546     const_iterator        di, i = this->begin();
00547     size_t                pos;
00548 
00549     for (si  = s.begin() ,  di  = i        , pos=0;
00550          si != s.end()                    && pos < trie_maxlen;
00551          si++            ,   i  = di       , pos++)
00552       {
00553         di = find_dtr(*di,*si);
00554         if (di==this->end()) break;
00555       }
00556     if (matchlen) *matchlen = pos;
00557     return i;
00558   };
00559 
00565   inline iterator rfind_longest(const string_type &s,
00566                                 size_t *matchlen=NULL)
00567   {
00568     const_reverse_string_iterator si;
00569     iterator                      di, i = this->begin();
00570     size_t                        pos;
00571 
00572     for (si  = s.rbegin()  ,  di  = i        , pos=0;
00573          si != s.rend()                     && pos < trie_maxlen;
00574          si++              ,   i  = di       , pos++)
00575       {
00576         di = find_dtr(*di,*si);
00577         if (di==this->end()) break;
00578       }
00579     if (matchlen) *matchlen = pos;
00580     return i;
00581   };
00582 
00584   inline const_iterator rfind_longest(const string_type &s,
00585                                       size_t *matchlen=NULL)
00586     const
00587   {
00588     const_reverse_string_iterator si;
00589     const_iterator                di, i = this->begin();
00590     size_t                        pos;
00591 
00592     for (si  = s.rbegin()  ,  di  = i        , pos=0;
00593          si != s.rend()                     && pos < trie_maxlen;
00594          si++              ,   i  = di       , pos++)
00595       {
00596         di = find_dtr(*di,*si);
00597         if (di==this->end()) break;
00598       }
00599     if (matchlen) *matchlen = pos;
00600     return i;
00601   };
00603 
00604 #ifdef MOOT_TRIE_VECTOR_DEBUG
00605   //--------------------------------------------------
00607 
00608 
00611   void dump(FILE *out, const CharT delim=0)
00612   {
00613     const_iterator   i, mi;
00614     for (i = this->begin(); i != this->end(); i++) {
00615       string_type s = node_rstring(*i);
00616       if (s.empty()) continue;
00617       s.push_back(delim);
00618       fwrite(s.data(), sizeof(CharT), s.size(), out);
00619     }
00620   };
00621 
00623   void bindump(FILE *out) {
00624     for (const_iterator i=this->begin(); i != this->end(); i++) {
00625       fwrite(&(*i), sizeof(node_type), 1, out);
00626     }
00627   };
00628 
00630   void arcdump(FILE *out) {
00631     for (const_iterator i=this->begin(); i != this->end(); i++) {
00632       fprintf(out,"node=%u\t mom=%u\t mindtr=%u\t label=%c\t ndtrs=%u\n",
00633               i-this->begin(), i->mother, i->mindtr, i->label, i->ndtrs);
00634     }
00635   };
00637 #endif //-- /MOOT_TRIE_VECTOR_DEBUG
00638 };
00639 
00640 }; //-- /namespace moot
00641 
00642 #endif //-- MOOT_TRIE_VECTOR_H

Generated on Sat Sep 17 01:20:34 2005 for libmoot by  doxygen 1.4.4