mootTrieVector.h
Go to the documentation of this file.
1 /* -*- Mode: C++ -*- */
2 
3 /*
4  libmoot : moocow's part-of-speech tagging library
5  Copyright (C) 2003-2007 by Bryan Jurish <moocow@cpan.org>
6 
7  This library is free software; you can redistribute it and/or
8  modify it under the terms of the GNU Lesser General Public
9  License as published by the Free Software Foundation; either
10  version 3 of the License, or (at your option) any later version.
11 
12  This library is distributed in the hope that it will be useful,
13  but WITHOUT ANY WARRANTY; without even the implied warranty of
14  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15  Lesser General Public License for more details.
16 
17  You should have received a copy of the GNU Lesser General Public
18  License along with this library; if not, write to the Free Software
19  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
20 */
21 
22 /*--------------------------------------------------------------------------
23  * File: TrieVector.h
24  * Author: Bryan Jurish <moocow@cpan.org>
25  * Description:
26  * + moocow's PoS tagger : tries
27  *--------------------------------------------------------------------------*/
28 
34 #ifndef MOOT_TRIE_VECTOR_H
35 #define MOOT_TRIE_VECTOR_H
36 
37 #include <mootConfig.h>
38 
39 #undef MOOT_TRIE_VECTOR_DEBUG
40 
41 #ifdef MOOT_TRIE_VECTOR_DEBUG
42 # include <stdio.h>
43 # include <stdlib.h>
44 #endif
45 
46 #include <ctype.h>
47 #include <vector>
48 #include <string>
49 #include <map>
50 
51 namespace moot {
52 
53 //======================================================================
54 // TrieVectorNodeBase
55 struct TrieVectorNodeBase
56 {
57  size_t mother;
58  size_t mindtr;
59 
60  TrieVectorNodeBase(size_t mother_index=0, size_t mindtr_index=0)
61  : mother(mother_index), mindtr(mindtr_index)
62  {};
63 };
64 
65 //======================================================================
66 // TrieVectorNode
68 template <typename DataT, typename CharT = char, typename UCharT = unsigned char>
70 {
71  typedef DataT data_type;
72  typedef CharT char_type;
73  typedef UCharT uchar_type;
75 
76  CharT label;
77  UCharT ndtrs;
78  DataT data;
79 
80  TrieVectorNode(size_t mother_index=0,
81  size_t mindtr_index=0,
82  CharT node_label=0,
83  UCharT node_ndtrs=0)
84  : TrieVectorNodeBase(mother_index, mindtr_index),
85  label(node_label),
86  ndtrs(node_ndtrs)
87  {};
88 
89  TrieVectorNode(size_t mother_index,
90  size_t mindtr_index,
91  CharT node_label,
92  UCharT node_ndtrs,
93  const DataT &node_data)
94  : TrieVectorNodeBase(mother_index, mindtr_index),
95  label(node_label),
96  ndtrs(node_ndtrs),
97  data(node_data)
98  {};
99 
100  inline bool operator< (const TrieVectorNode &x) const
101  { return mother < x.mother || label < x.label; };
102 
103  inline bool operator<= (const TrieVectorNode &x) const
104  { return mother <= x.mother || label <= x.label; };
105 
106  inline bool operator== (const TrieVectorNode &x) const
107  { return mother==x.mother && label==x.label; };
108 };
109 
111 //======================================================================
112 // TrieVectorBase
114 class TrieVectorBase {
115 public:
117  typedef size_t NodeId;
118 
119 public:
121  static const NodeId NoNode = static_cast<size_t>(-1);
122 
124  static const size_t NoMaxLen = static_cast<size_t>(-1);
125 
126 public:
127  size_t trie_maxlen;
128  bool trie_use_case;
129 
130 public:
131  TrieVectorBase(size_t maxlen=NoMaxLen, bool use_case=false)
132  : trie_maxlen(maxlen),
133  trie_use_case(use_case)
134  {};
135 
136 }; //-- /TrieVectorBase
137 
138 //======================================================================
139 // TrieVector
140 
153 template<class DataT, typename CharT = char, typename UCharT = unsigned char>
154 class TrieVector
155  : public TrieVectorBase,
156  public std::vector<TrieVectorNode<DataT,CharT,UCharT> >
157 {
158 public:
159  //--------------------------------------------------------------------
160  // TrieVector: Types
161  typedef DataT data_type;
162  typedef CharT char_type;
163  typedef UCharT uchar_type;
165  typedef
167  trie_type;
168 
169  typedef
173  typedef std::vector<node_type> vector_type;
175  typedef typename vector_type::iterator iterator;
176  typedef typename vector_type::const_iterator const_iterator;
178  //-- moo: gcc-3.4 don't like this
179  //typedef std::string<char_type> string_type;
180  typedef std::string string_type;
181 
182  typedef typename string_type::iterator string_iterator;
183  typedef typename string_type::const_iterator const_string_iterator;
184  typedef typename string_type::reverse_iterator reverse_string_iterator;
185  typedef typename string_type::const_reverse_iterator const_reverse_string_iterator;
186 
187  typedef std::map<string_type,NodeId> map_type;
188  typedef typename map_type::iterator map_iterator;
189  typedef typename map_type::const_iterator const_map_iterator;
190 
191 public:
192  //--------------------------------------------------------------------
193  // TrieVector: Data
194  map_type trie_pending;
195  data_type trie_default_data;
196 
197 public:
198  //--------------------------------------------------------------------
199  // TrieVector: Methods
200 
201  //--------------------------------------------------
203 
204 
205  TrieVector(size_t max_len=NoMaxLen, bool use_case=false)
206  : TrieVectorBase(max_len,use_case),
207  vector_type(),
208  trie_pending(map_type()),
209  trie_default_data(data_type())
210  {};
211 
213  ~TrieVector(void) {};
214 
216  inline void clear(void)
217  {
218  vector_type::clear();
219  trie_pending.clear();
220  //vector_type::push_back(node_type(0,0,0,0)); //-- add root: (mom,mindtr,label,ndtrs)
221  };
223 
224  //--------------------------------------------------
226 
227 
228  inline const size_t &maxlen(void) const
229  { return trie_maxlen; };
230 
232  inline size_t &maxlen(void)
233  { return trie_maxlen; };
234 
236  inline bool compiled(void) const
237  { return !trie_pending.empty(); };
238 
240  inline void ensure_compiled(void)
241  { if (!compiled()) compile(); };
242 
244  inline const DataT &default_data(void) const
245  { return trie_default_data; };
248  inline DataT &default_data(void)
249  { return trie_default_data; };
251 
252  //--------------------------------------------------
254 
255 
257  inline string_type trie_canonicalize(string_type &s) const
258  {
259  if (!trie_use_case) {
260  for (string_iterator si = s.begin(); si != s.end(); si++) {
261  *si = tolower(*si);
262  }
263  }
264  return s;
265  };
266 
268  inline void trie_key(const string_type &s,
269  const size_t max_len,
270  string_type &dst)
271  const
272  {
273  dst.assign(s,0,max_len);
274  trie_canonicalize(dst);
275  };
278  inline string_type trie_key(const string_type &s, const size_t max_len)
279  const
280  {
281  string_type key;
282  trie_key(s,max_len,key);
283  return key;
284  };
285 
287  inline string_type trie_key(const string_type &s) const
288  { return trie_key(s,trie_maxlen); };
289 
290 
292  inline void trie_rkey(const string_type &s,
293  const size_t max_len,
294  string_type &dst)
295  const
296  {
297  dst.assign(s.rbegin(), s.rbegin() + (max_len > s.size() ?
298  s.size() :
299  max_len));
300  trie_canonicalize(dst);
301  };
302 
304  inline string_type trie_rkey(const string_type &s, size_t max_len) const
305  {
306  string_type key;
307  trie_rkey(s, max_len, key);
308  return key;
309  };
310 
312  inline string_type trie_rkey(const string_type &s) const
313  { return trie_rkey(s,trie_maxlen); };
315 
316  //--------------------------------------------------
318 
319 
320  inline void trie_insert(const string_type &s, size_t max_len)
321  { trie_pending[trie_key(s,max_len)] = 0; };
322 
324  inline void trie_insert(const string_type &s)
325  { trie_pending[trie_key(s,trie_maxlen)] = 0; };
326 
328  inline void trie_rinsert(const string_type &s, size_t max_len)
329  { trie_pending[trie_rkey(s,max_len)] = 0; };
330 
332  inline void trie_rinsert(const string_type &s)
333  { trie_pending[trie_rkey(s,trie_maxlen)] = 0; };
335 
336 
337  //--------------------------------------------------
339 
340 
349  inline iterator find_dtr(const node_type &from, CharT label)
350  {
351  UCharT dn;
352  iterator di;
353  if (!trie_use_case) label = tolower(label);
354  for (dn=0, di=this->begin()+from.mindtr; di != this->end() && dn < from.ndtrs; di++, dn++) {
355  if (di->label == label) return di;
356  }
357  return this->end();
358  };
359 
361  inline const_iterator find_dtr(const node_type &from, CharT label) const
362  {
363  UCharT dn;
364  const_iterator di;
365  if (!trie_use_case) label = tolower(label);
366  for (dn=0, di=this->begin()+from.mindtr; di != this->end() && dn < from.ndtrs; di++, dn++) {
367  if (di->label == label) return di;
368  }
369  return this->end();
370  };
371 
373  inline NodeId find_dtr_id(NodeId fromid, CharT label) const
374  {
375  const_iterator di = find_dtr(*(this->begin()+fromid), label);
376  return (di==this->end() ? NoNode : (di-this->begin()));
377  };
379 
389  inline iterator first_dtr(const node_type &from)
390  { return ( from.ndtrs == 0 ? this->end() : (this->begin()+from.mindtr) ); };
391 
393  inline const_iterator first_dtr(const node_type &from) const
394  { return ( from.ndtrs == 0 ? this->end() : (this->begin()+from.mindtr) ); };
395 
405  inline iterator find_mother(const node_type &to)
406  { return (to.mother == NoNode ? this->end() : (this->begin()+to.mother)); };
407 
409  inline const_iterator find_mother(const node_type &to) const
410  { return (to.mother == NoNode ? this->end() : (this->begin()+to.mother)); };
411 
413  inline NodeId find_mother_id(NodeId toid) const
414  { return (this->begin()+toid)->mother; };
415 
417  inline string_type node_rstring(const node_type &node) const
418  {
419  if (node.mother == NoNode) return string_type();
420  string_type s(1, node.label);
421  const_iterator mi;
422  for (mi=find_mother(node); mi != this->end() && mi->mother != NoNode; mi=find_mother(*mi)) {
423  s.push_back(mi->label);
424  }
425  return s;
426  };
427 
429  inline string_type node_rstring(NodeId nodeid) const
430  { return node_rstring(*(this->begin()+nodeid)); };
431 
433  inline string_type node_string(const node_type &node) const
434  {
435  string_type s = node_rstring(node);
436  reverse(s.begin(),s.end());
437  return s;
438  };
439 
441  inline string_type node_string(NodeId nodeid) const
442  { return node_string(*(this->begin()+nodeid)); };
443 
444 
446  inline size_t node_depth(const node_type &node) const
447  {
448  size_t depth = 0;
449  const_iterator mi;
450  for (mi=find_mother(node); mi != this->end() && mi->mother != NoNode; mi=find_mother(*mi)) {
451  ++depth;
452  }
453  return depth;
454  };
457  inline size_t node_depth(NodeId nodeid) const
458  { return node_depth(*(this->begin()+nodeid)); };
460 
461 
462  //--------------------------------------------------
464 
465 
466  inline void compile(void)
467  {
468  vector_type::clear();
469  //-- add root: (mom,mindtr,label,ndtrs,data)
470  this->push_back(node_type(NoNode,NoNode,0,0,trie_default_data));
471 
472  map_iterator pi;
473  size_t pos;
474  bool changed;
475  char_type dlabel;
476  NodeId dnodid;
477 
478  //-- foreach character position @pos
479  for (pos=0, changed=true; pos < trie_maxlen && changed; pos++) {
480  changed = false;
481 
482  //-- foreach pair *pi = (pending-key,node)
483  for (pi=trie_pending.begin(); pi != trie_pending.end(); pi++) {
484  const string_type &kstr = pi->first;
485  NodeId &knodid = pi->second;
486  if (kstr.size() <= pos) continue; //-- we've exhausted this string
487 
488  dlabel = kstr[pos]; //-- get daughter-label
489  dnodid = find_dtr_id(knodid,dlabel); //-- check for extant daughter
490 
491  if (dnodid == NoNode) { //-- Ye Olde Guttes: add a daughter
492  dnodid = vector_type::size();
493 
494  //reserve(dnodid);
495  this->push_back(node_type(knodid, // (mom,
496  NoNode, // , mindtr
497  dlabel, // , label
498  0, // , ndtrs
499  trie_default_data)); // , data)
500 
501  node_type &mnode = this->operator[](knodid); //-- get mother-node
502  ++mnode.ndtrs; //-- update num/dtrs for mom
503 
504  if (mnode.mindtr == NoNode)
505  mnode.mindtr = dnodid; //-- update min-dtr for mom
506 
507  changed = true;
508  }
509 
510  knodid = dnodid; //-- update "current" node in pending map
511  }
512  }
513  //-- all pending arcs have been added: clear 'em
514  trie_pending.clear();
515  };
517 
518 
519  //--------------------------------------------------
522 
532  inline iterator find_longest(const string_type &s,
533  size_t *matchlen=NULL)
534  {
535  const_string_iterator si;
536  iterator di, i = this->begin();
537  size_t pos;
538 
539  for (si = s.begin() , di = i , pos=0;
540  si != s.end() && pos < trie_maxlen;
541  si++ , i = di , pos++)
542  {
543  di = find_dtr(*di,*si);
544  if (di==this->end()) break;
545  }
546  if (matchlen) *matchlen = pos;
547  return i;
548  };
549 
551  inline const_iterator find_longest(const string_type &s,
552  size_t *matchlen=NULL)
553  const
554  {
555  const_string_iterator si;
556  const_iterator di, i = this->begin();
557  size_t pos;
558 
559  for (si = s.begin() , di = i , pos=0;
560  si != s.end() && pos < trie_maxlen;
561  si++ , i = di , pos++)
562  {
563  di = find_dtr(*di,*si);
564  if (di==this->end()) break;
565  }
566  if (matchlen) *matchlen = pos;
567  return i;
568  };
569 
575  inline iterator rfind_longest(const string_type &s,
576  size_t *matchlen=NULL)
577  {
578  const_reverse_string_iterator si;
579  iterator di, i = this->begin();
580  size_t pos;
581 
582  for (si = s.rbegin() , di = i , pos=0;
583  si != s.rend() && pos < trie_maxlen;
584  si++ , i = di , pos++)
585  {
586  di = find_dtr(*di,*si);
587  if (di==this->end()) break;
588  }
589  if (matchlen) *matchlen = pos;
590  return i;
591  };
592 
594  inline const_iterator rfind_longest(const string_type &s,
595  size_t *matchlen=NULL)
596  const
597  {
598  const_reverse_string_iterator si;
599  const_iterator di, i = this->begin();
600  size_t pos;
601 
602  for (si = s.rbegin() , di = i , pos=0;
603  si != s.rend() && pos < trie_maxlen;
604  si++ , i = di , pos++)
605  {
606  di = find_dtr(*di,*si);
607  if (di==this->end()) break;
608  }
609  if (matchlen) *matchlen = pos;
610  return i;
611  };
613 
614 #ifdef MOOT_TRIE_VECTOR_DEBUG
615  //--------------------------------------------------
617 
618 
621  void dump(FILE *out, const CharT delim=0)
622  {
623  const_iterator i, mi;
624  for (i = this->begin(); i != this->end(); i++) {
625  string_type s = node_rstring(*i);
626  if (s.empty()) continue;
627  s.push_back(delim);
628  fwrite(s.data(), sizeof(CharT), s.size(), out);
629  }
630  };
631 
633  void bindump(FILE *out) {
634  for (const_iterator i=this->begin(); i != this->end(); i++) {
635  fwrite(&(*i), sizeof(node_type), 1, out);
636  }
637  };
638 
640  void arcdump(FILE *out) {
641  for (const_iterator i=this->begin(); i != this->end(); i++) {
642  fprintf(out,"node=%u\t mom=%u\t mindtr=%u\t label=%c\t ndtrs=%u\n",
643  i-this->begin(), i->mother, i->mindtr, i->label, i->ndtrs);
644  }
645  };
647 #endif //-- /MOOT_TRIE_VECTOR_DEBUG
648 };
649 
650 }; //-- /namespace moot
651 
652 #endif //-- MOOT_TRIE_VECTOR_H
UCharT ndtrs
number of daughters
Definition: mootTrieVector.h:66
string_type::iterator string_iterator
Definition: mootTrieVector.h:171
Definition: mootAssocVector.h:39
CharT label
label of arc to this node
Definition: mootTrieVector.h:65
string_type::const_reverse_iterator const_reverse_string_iterator
Definition: mootTrieVector.h:174
Top-level trie class-template using an adjaceny table.
Definition: mootTrieVector.h:143
string_type::reverse_iterator reverse_string_iterator
Definition: mootTrieVector.h:173
safely includes autoheader preprocessor macros
Base class for TrieVector.
Definition: mootTrieVector.h:103
vector_type::iterator iterator
Definition: mootTrieVector.h:164
size_t NodeId
Definition: mootTrieVector.h:106
TrieVectorNodeBase(size_t mother_index=0, size_t mindtr_index=0)
Definition: mootTrieVector.h:49
LISP-style assoc list using vector<>: map-like class with small memory footprint. Useful for small as...
Definition: mootAssocVector.h:130
Definition: mootTrieVector.h:58
std::map< string_type, NodeId > map_type
Definition: mootTrieVector.h:176
vector_type::const_iterator const_iterator
Definition: mootTrieVector.h:165
std::string string_type
Definition: mootTrieVector.h:169
string_type::const_iterator const_string_iterator
Definition: mootTrieVector.h:172
Definition: mootTrieVector.h:44
UCharT uchar_type
Definition: mootTrieVector.h:62
unsigned char uchar_type
Definition: mootTrieVector.h:152
CharT char_type
Definition: mootTrieVector.h:61
char char_type
Definition: mootTrieVector.h:151
std::vector< node_type > vector_type
Definition: mootTrieVector.h:162
map_type::const_iterator const_map_iterator
Definition: mootTrieVector.h:178
DataT data_type
Definition: mootTrieVector.h:60
size_t mindtr
index of first arc from this node
Definition: mootTrieVector.h:47
map_type::iterator map_iterator
Definition: mootTrieVector.h:177
size_t mother
index of this node&#39;s mother
Definition: mootTrieVector.h:46
size_t node_depth(NodeId nodeid) const
Definition: mootTrieVector.h:446