Main Page | Directories | Namespace List | Class Hierarchy | Alphabetical List | Class List | File List | Namespace Members | Class Members | File Members

mootTrieVector.h

Go to the documentation of this file.
00001 /* -*- Mode: C++ -*- */
00002 
00003 /*
00004    libmoot : moocow's part-of-speech tagging library
00005    Copyright (C) 2003-2004 by Bryan Jurish <moocow@ling.uni-potsdam.de>
00006 
00007    This library is free software; you can redistribute it and/or
00008    modify it under the terms of the GNU Lesser General Public
00009    License as published by the Free Software Foundation; either
00010    version 2.1 of the License, or (at your option) any later version.
00011    
00012    This library is distributed in the hope that it will be useful,
00013    but WITHOUT ANY WARRANTY; without even the implied warranty of
00014    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
00015    Lesser General Public License for more details.
00016    
00017    You should have received a copy of the GNU Lesser General Public
00018    License along with this library; if not, write to the Free Software
00019    Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
00020 */
00021 
00022 /*--------------------------------------------------------------------------
00023  * File: mootTrieVector.h
00024  * Author: Bryan Jurish <moocow@ling.uni-potsdam.de>
00025  * Description:
00026  *   + moocow's PoS tagger : tries
00027  *--------------------------------------------------------------------------*/
00028 
00029 #ifndef MOOT_TRIE_VECTOR_H
00030 #define MOOT_TRIE_VECTOR_H
00031 
00032 #undef MOOT_TRIE_VECTOR_DEBUG
00033 
00034 #ifdef MOOT_TRIE_VECTOR_DEBUG
00035 # include <stdio.h>
00036 # include <stdlib.h>
00037 #endif
00038 
#include <ctype.h>
#include <stddef.h>

#include <algorithm>
#include <map>
#include <string>
#include <vector>
00043 
00044 namespace moot {
00045 
00046 //======================================================================
00047 // TrieVectorNodeBase
/**
 * \brief Character-type-independent part of a trie node.
 *
 * Both fields are positions in the node vector that holds the trie
 * (TrieVector uses them as offsets from begin()).
 */
struct TrieVectorNodeBase
{
  size_t mother;  ///< vector index of this node's mother (parent) node
  size_t mindtr;  ///< vector index of this node's first daughter node

  /**
   * Build a node base from explicit indices.
   * \param mother_index  index of the mother node (default 0)
   * \param mindtr_index  index of the first daughter node (default 0)
   */
  TrieVectorNodeBase(size_t mother_index=0, size_t mindtr_index=0)
  {
    mother = mother_index;
    mindtr = mindtr_index;
  }
};
00057 
00058 //======================================================================
00059 // TrieVectorNode
00061 template <typename DataT, typename CharT = char, typename UCharT = unsigned char>
00062 struct TrieVectorNode : public TrieVectorNodeBase
00063 {
00064   typedef DataT                                          data_type;
00065   typedef CharT                                          char_type;
00066   typedef UCharT                                         uchar_type;
00067   typedef TrieVectorNode<data_type,char_type,uchar_type>  node_type;
00068 
00069   CharT   label; 
00070   UCharT  ndtrs; 
00071   DataT   data;  
00072 
00073   TrieVectorNode(size_t mother_index=0,
00074                 size_t mindtr_index=0,
00075                 CharT  node_label=0,
00076                 UCharT node_ndtrs=0)
00077     : TrieVectorNodeBase(mother_index, mindtr_index),
00078       label(node_label),
00079       ndtrs(node_ndtrs)
00080   {};
00081 
00082   TrieVectorNode(size_t mother_index,
00083                 size_t mindtr_index,
00084                 CharT  node_label,
00085                 UCharT node_ndtrs,
00086                 const DataT  &node_data)
00087     : TrieVectorNodeBase(mother_index, mindtr_index),
00088       label(node_label),
00089       ndtrs(node_ndtrs),
00090       data(node_data)
00091   {};
00092 
00093   inline bool operator< (const TrieVectorNode &x) const
00094   { return mother < x.mother || label < x.label; };
00095 
00096   inline bool operator<= (const TrieVectorNode &x) const
00097   { return mother <= x.mother || label <= x.label; };
00098 
00099   inline bool operator== (const TrieVectorNode &x) const
00100   { return mother==x.mother && label==x.label; };
00101 };
00102 
00103 
00104 //======================================================================
00105 // TrieVectorBase
/**
 * \brief Template-independent settings and constants shared by all tries.
 */
class TrieVectorBase {
public:
  /** Identifier of a node: its position in the node vector. */
  typedef size_t NodeId;

  /** Sentinel NodeId meaning "no such node" (largest size_t value). */
  static const NodeId NoNode   = static_cast<NodeId>(-1);

  /** Sentinel key length meaning "no length limit" (largest size_t value). */
  static const size_t NoMaxLen = static_cast<size_t>(-1);

  size_t   trie_maxlen;    ///< maximum key length stored/searched in the trie
  bool     trie_use_case;  ///< if false, keys and labels are lower-cased first

  /**
   * \param maxlen    maximum key length (default: NoMaxLen)
   * \param use_case  whether letter case is significant (default: false)
   */
  TrieVectorBase(size_t maxlen=NoMaxLen, bool use_case=false)
    : trie_maxlen(maxlen), trie_use_case(use_case)
  {}

}; //-- /TrieVectorBase
00130 
00131 //======================================================================
00132 // TrieVector
00133 
00146 template<class DataT, typename CharT = char, typename UCharT = unsigned char>
00147 class TrieVector
00148   : public TrieVectorBase,
00149     public std::vector<TrieVectorNode<DataT,CharT,UCharT> >
00150 {
00151 public:
00152   //--------------------------------------------------------------------
00153   // TrieVector: Types
00154   typedef DataT                              data_type;
00155   typedef CharT                              char_type;
00156   typedef UCharT                             uchar_type;
00157 
00158   typedef
00159      TrieVector<data_type,char_type,uchar_type>
00160      trie_type;
00161 
00162   typedef
00163      TrieVectorNode<data_type,char_type,uchar_type>
00164      node_type;
00165 
00166   typedef std::vector<node_type>                   vector_type;
00167 
00168   typedef typename vector_type::iterator           iterator;
00169   typedef typename vector_type::const_iterator     const_iterator;
00170 
00171   typedef std::string<char_type>                       string_type;
00172   typedef typename string_type::iterator               string_iterator;
00173   typedef typename string_type::const_iterator         const_string_iterator;
00174   typedef typename string_type::reverse_iterator       reverse_string_iterator;
00175   typedef typename string_type::const_reverse_iterator const_reverse_string_iterator;
00176 
00177   typedef std::map<string_type,NodeId>                 map_type;
00178   typedef typename map_type::iterator                  map_iterator;
00179   typedef typename map_type::const_iterator            const_map_iterator;
00180 
00181 public:
00182   //--------------------------------------------------------------------
00183   // TrieVector: Data
00184   map_type  trie_pending;       
00185   data_type trie_default_data;  
00186 
00187 public:
00188   //--------------------------------------------------------------------
00189   // TrieVector: Methods
00190 
00191   //--------------------------------------------------
00193 
00194 
00195   TrieVector(size_t max_len=NoMaxLen, bool use_case=false)
00196     : TrieVectorBase(max_len,use_case)
00197   {};
00198 
00200   ~TrieVector(void) {};
00201 
00203   inline void clear(void)
00204   {
00205     vector_type::clear();
00206     trie_pending.clear();
00207     //vector_type::push_back(node_type(0,0,0,0)); //-- add root: (mom,mindtr,label,ndtrs)
00208   };
00210 
00211   //--------------------------------------------------
00213 
00214 
00215   inline const size_t &maxlen(void) const
00216   { return trie_maxlen; };
00217 
00219   inline size_t &maxlen(void)
00220   { return trie_maxlen; };
00221 
00223   inline bool compiled(void) const
00224   { return !trie_pending.empty(); };
00225 
00227   inline void ensure_compiled(void)
00228   { if (!compiled()) compile(); };
00229 
00231   inline const DataT &default_data(void) const
00232   { return trie_default_data; };
00233 
00235   inline DataT &default_data(void)
00236   { return trie_default_data; };
00238 
00239   //--------------------------------------------------
00241 
00242 
00244   inline string_type trie_canonicalize(string_type &s) const
00245   {
00246     if (!trie_use_case) {
00247       for (string_iterator si = s.begin(); si != s.end(); si++) {
00248         *si = tolower(*si);
00249       }
00250     }
00251     return s;
00252   };
00253 
00255   inline void trie_key(const string_type &s,
00256                        const size_t max_len,
00257                        string_type &dst)
00258     const
00259   {
00260     dst.assign(s,0,max_len);
00261     trie_canonicalize(dst);
00262   };
00263 
00265   inline string_type trie_key(const string_type &s, const size_t max_len)
00266     const
00267   { 
00268     string_type key;
00269     trie_key(s,max_len,key);
00270     return key;
00271   };
00272 
00274   inline string_type trie_key(const string_type &s) const
00275   { return trie_key(s,trie_maxlen); };
00276 
00277 
00279   inline void trie_rkey(const string_type &s,
00280                         const size_t max_len,
00281                         string_type &dst)
00282     const
00283   {
00284     dst.assign(s.rbegin(), s.rbegin() + (max_len > s.size() ?
00285                                          s.size() :
00286                                          max_len));
00287     trie_canonicalize(dst);
00288   };
00289 
00291   inline string_type trie_rkey(const string_type &s, size_t max_len) const
00292   {
00293     string_type key;
00294     trie_rkey(s, max_len, key);
00295     return key;
00296   };
00297   
00299   inline string_type trie_rkey(const string_type &s) const
00300   { return trie_rkey(s,trie_maxlen); };
00302 
00303   //--------------------------------------------------
00305 
00306 
00307   inline void trie_insert(const string_type &s, size_t max_len)
00308   { trie_pending[trie_key(s,max_len)] = 0; };
00309 
00311   inline void trie_insert(const string_type &s)
00312   { trie_pending[trie_key(s,trie_maxlen)] = 0; };
00313 
00315   inline void trie_rinsert(const string_type &s, size_t max_len)
00316   { trie_pending[trie_rkey(s,max_len)] = 0; };
00317 
00319   inline void trie_rinsert(const string_type &s)
00320   { trie_pending[trie_rkey(s,trie_maxlen)] = 0; };
00322 
00323 
00324   //--------------------------------------------------
00326 
00327 
00336   inline iterator find_dtr(const node_type &from, CharT label)
00337   {
00338     UCharT    dn;
00339     iterator  di;
00340     if (!trie_use_case) label = tolower(label);
00341     for (dn=0, di=begin()+from.mindtr; di != end() && dn < from.ndtrs; di++, dn++) {
00342       if (di->label == label) return di;
00343     }
00344     return end();
00345   };
00346 
00348   inline const_iterator find_dtr(const node_type &from, CharT label) const
00349   {
00350     UCharT         dn;
00351     const_iterator di;
00352     if (!trie_use_case) label = tolower(label);
00353     for (dn=0, di=begin()+from.mindtr; di != end() && dn < from.ndtrs; di++, dn++) {
00354       if (di->label == label) return di;
00355     }
00356     return end();
00357   };
00358 
00360   inline NodeId find_dtr_id(NodeId fromid, CharT label) const
00361   {
00362     const_iterator di = find_dtr(*(begin()+fromid), label);
00363     return (di==end() ? NoNode : (di-begin()));
00364   };
00365 
00366 
00376   inline iterator first_dtr(const node_type &from)
00377   { return ( from.ndtrs == 0 ? end() : (begin()+from.mindtr) ); };
00378 
00380   inline const_iterator first_dtr(const node_type &from) const
00381   { return ( from.ndtrs == 0 ? end() : (begin()+from.mindtr) ); };
00382 
00392   inline iterator find_mother(const node_type &to)
00393   { return (to.mother == NoNode ? end() : (begin()+to.mother)); };
00394 
00396   inline const_iterator find_mother(const node_type &to) const
00397   { return (to.mother == NoNode ? end() : (begin()+to.mother)); };
00398 
00400   inline NodeId find_mother_id(NodeId toid) const
00401   { return (begin()+toid)->mother; };
00402 
00404   inline string_type node_rstring(const node_type &node) const
00405   {
00406     if (node.mother == NoNode) return string_type();
00407     string_type s(1, node.label);
00408     const_iterator mi;
00409     for (mi=find_mother(node); mi != end() && mi->mother != NoNode; mi=find_mother(*mi)) {
00410       s.push_back(mi->label);
00411     }
00412     return s;
00413   };
00414 
00416   inline string_type node_rstring(NodeId nodeid) const
00417   { return node_rstring(*(begin()+nodeid)); };
00418 
00420   inline string_type node_string(const node_type &node) const
00421   {
00422     string_type s = node_rstring(node);
00423     reverse(s.begin(),s.end());
00424     return s;
00425   };
00426 
00428   inline string_type node_string(NodeId nodeid) const
00429   { return node_string(*(begin()+nodeid)); };
00430 
00431 
00433   inline size_t node_depth(const node_type &node) const
00434   {
00435     size_t         depth = 0;
00436     const_iterator mi;
00437     for (mi=find_mother(node); mi != end() && mi->mother != NoNode; mi=find_mother(*mi)) {
00438       ++depth;
00439     }
00440     return depth;
00441   };
00442 
00444   inline size_t node_depth(NodeId nodeid) const
00445   { return node_depth(*(begin()+nodeid)); };
00447 
00448 
00449   //--------------------------------------------------
00451 
00452 
00453   inline void compile(void)
00454   {
00455     vector_type::clear();
00456     //-- add root: (mom,mindtr,label,ndtrs,data)
00457     push_back(node_type(NoNode,NoNode,0,0,trie_default_data));
00458 
00459     map_iterator       pi;
00460     size_t             pos;
00461     bool               changed;
00462     char_type          dlabel;
00463     NodeId             dnodid;
00464 
00465     //-- foreach character position @pos
00466     for (pos=0, changed=true; pos < trie_maxlen && changed; pos++) {
00467       changed = false;
00468 
00469       //-- foreach pair *pi = (pending-key,node)
00470       for (pi=trie_pending.begin(); pi != trie_pending.end(); pi++) {
00471         const string_type &kstr   = pi->first;
00472         NodeId            &knodid = pi->second;
00473         if (kstr.size() <= pos) continue;           //-- we've exhausted this string
00474 
00475         dlabel           = kstr[pos];                  //-- get daughter-label
00476         dnodid           = find_dtr_id(knodid,dlabel); //-- check for extant daughter
00477 
00478         if (dnodid == NoNode) {                     //-- Ye Olde Guttes: add a daughter
00479           dnodid = vector_type::size();
00480 
00481           //reserve(dnodid);
00482           push_back(node_type(knodid,               // (mom,
00483                               NoNode,               //  , mindtr
00484                               dlabel,               //  , label
00485                               0,                    //  , ndtrs
00486                               trie_default_data));  //  , data)
00487 
00488           node_type &mnode = operator[](knodid);    //-- get mother-node
00489           ++mnode.ndtrs;                            //-- update num/dtrs for mom
00490 
00491           if (mnode.mindtr == NoNode)
00492             mnode.mindtr = dnodid;                  //-- update min-dtr  for mom
00493           
00494           changed = true;
00495         }
00496 
00497         knodid = dnodid;                            //-- update "current" node in pending map
00498       }
00499     }
00500     //-- all pending arcs have been added: clear 'em
00501     trie_pending.clear();
00502   };
00504 
00505 
00506   //--------------------------------------------------
00508 
00509 
00519   inline iterator find_longest(const string_type &s,
00520                                size_t *matchlen=NULL)
00521   {
00522     const_string_iterator si;
00523     iterator              di, i = begin();
00524     size_t                pos;
00525 
00526     for (si  = s.begin() ,  di  = i        , pos=0;
00527          si != s.end()                    && pos < trie_maxlen;
00528          si++            ,   i  = di       , pos++)
00529       {
00530         di = find_dtr(*di,*si);
00531         if (di==end()) break;
00532       }
00533     if (matchlen) *matchlen = pos;
00534     return i;
00535   };
00536 
00538   inline const_iterator find_longest(const string_type &s,
00539                                      size_t *matchlen=NULL)
00540     const
00541   {
00542     const_string_iterator si;
00543     const_iterator        di, i = begin();
00544     size_t                pos;
00545 
00546     for (si  = s.begin() ,  di  = i        , pos=0;
00547          si != s.end()                    && pos < trie_maxlen;
00548          si++            ,   i  = di       , pos++)
00549       {
00550         di = find_dtr(*di,*si);
00551         if (di==end()) break;
00552       }
00553     if (matchlen) *matchlen = pos;
00554     return i;
00555   };
00556 
00562   inline iterator rfind_longest(const string_type &s,
00563                                 size_t *matchlen=NULL)
00564   {
00565     const_reverse_string_iterator si;
00566     iterator                      di, i = begin();
00567     size_t                        pos;
00568 
00569     for (si  = s.rbegin()  ,  di  = i        , pos=0;
00570          si != s.rend()                     && pos < trie_maxlen;
00571          si++              ,   i  = di       , pos++)
00572       {
00573         di = find_dtr(*di,*si);
00574         if (di==end()) break;
00575       }
00576     if (matchlen) *matchlen = pos;
00577     return i;
00578   };
00579 
00581   inline const_iterator rfind_longest(const string_type &s,
00582                                       size_t *matchlen=NULL)
00583     const
00584   {
00585     const_reverse_string_iterator si;
00586     const_iterator                di, i = begin();
00587     size_t                        pos;
00588 
00589     for (si  = s.rbegin()  ,  di  = i        , pos=0;
00590          si != s.rend()                     && pos < trie_maxlen;
00591          si++              ,   i  = di       , pos++)
00592       {
00593         di = find_dtr(*di,*si);
00594         if (di==end()) break;
00595       }
00596     if (matchlen) *matchlen = pos;
00597     return i;
00598   };
00600 
00601 #ifdef MOOT_TRIE_VECTOR_DEBUG
00602   //--------------------------------------------------
00604 
00605 
00608   void dump(FILE *out, const CharT delim=0)
00609   {
00610     const_iterator   i, mi;
00611     for (i = begin(); i != end(); i++) {
00612       string_type s = node_rstring(*i);
00613       if (s.empty()) continue;
00614       s.push_back(delim);
00615       fwrite(s.data(), sizeof(CharT), s.size(), out);
00616     }
00617   };
00618 
00620   void bindump(FILE *out) {
00621     for (const_iterator i=begin(); i != end(); i++) {
00622       fwrite(&(*i), sizeof(node_type), 1, out);
00623     }
00624   };
00625 
00627   void arcdump(FILE *out) {
00628     for (const_iterator i=begin(); i != end(); i++) {
00629       fprintf(out,"node=%u\t mom=%u\t mindtr=%u\t label=%c\t ndtrs=%u\n",
00630               i-begin(), i->mother, i->mindtr, i->label, i->ndtrs);
00631     }
00632   };
00634 #endif //-- /MOOT_TRIE_VECTOR_DEBUG
00635 };
00636 
00637 }; //-- /namespace moot
00638 
00639 #endif //-- MOOT_TRIE_VECTOR_H

Generated on Mon Jun 27 13:05:25 2005 for libmoot by  doxygen 1.3.8-20040913