00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022
00023
00024
00025
00026
00027
00028
00029 #ifndef MOOT_TRIE_VECTOR_H
00030 #define MOOT_TRIE_VECTOR_H
00031
00032 #undef MOOT_TRIE_VECTOR_DEBUG
00033
00034 #ifdef MOOT_TRIE_VECTOR_DEBUG
00035 # include <stdio.h>
00036 # include <stdlib.h>
00037 #endif
00038
#include <ctype.h>
#include <stddef.h>

#include <algorithm>
#include <map>
#include <string>
#include <vector>
00043
00044 namespace moot {
00045
00046
00047
/**
 * \brief Character- and data-independent part of a trie node.
 *
 * Holds only the structural links of a node, expressed as indices
 * rather than pointers.
 */
struct TrieVectorNodeBase
{
  size_t mother;  ///< index of this node's mother (parent) node
  size_t mindtr;  ///< index of this node's first (lowest-indexed) daughter node

  /**
   * Construct a node base from its two link indices.
   * Both indices default to 0.
   */
  TrieVectorNodeBase(size_t mother_index = 0, size_t mindtr_index = 0)
  {
    mother = mother_index;
    mindtr = mindtr_index;
  }
};
00057
00058
00059
00061 template <typename DataT, typename CharT = char, typename UCharT = unsigned char>
00062 struct TrieVectorNode : public TrieVectorNodeBase
00063 {
00064 typedef DataT data_type;
00065 typedef CharT char_type;
00066 typedef UCharT uchar_type;
00067 typedef TrieVectorNode<data_type,char_type,uchar_type> node_type;
00068
00069 CharT label;
00070 UCharT ndtrs;
00071 DataT data;
00072
00073 TrieVectorNode(size_t mother_index=0,
00074 size_t mindtr_index=0,
00075 CharT node_label=0,
00076 UCharT node_ndtrs=0)
00077 : TrieVectorNodeBase(mother_index, mindtr_index),
00078 label(node_label),
00079 ndtrs(node_ndtrs)
00080 {};
00081
00082 TrieVectorNode(size_t mother_index,
00083 size_t mindtr_index,
00084 CharT node_label,
00085 UCharT node_ndtrs,
00086 const DataT &node_data)
00087 : TrieVectorNodeBase(mother_index, mindtr_index),
00088 label(node_label),
00089 ndtrs(node_ndtrs),
00090 data(node_data)
00091 {};
00092
00093 inline bool operator< (const TrieVectorNode &x) const
00094 { return mother < x.mother || label < x.label; };
00095
00096 inline bool operator<= (const TrieVectorNode &x) const
00097 { return mother <= x.mother || label <= x.label; };
00098
00099 inline bool operator== (const TrieVectorNode &x) const
00100 { return mother==x.mother && label==x.label; };
00101 };
00102
00103
00104
00105
/**
 * \brief Type-independent settings and constants shared by all tries.
 */
class TrieVectorBase {
public:
  /** Node identifier: an index into the trie's node vector. */
  typedef size_t NodeId;

  /** Identifier meaning "no such node" (largest \c size_t value). */
  static const NodeId NoNode = static_cast<NodeId>(-1);

  /** Length value meaning "no maximum length" (largest \c size_t value). */
  static const size_t NoMaxLen = static_cast<size_t>(-1);

  size_t trie_maxlen;    ///< maximum key length
  bool   trie_use_case;  ///< whether letter case is significant

  /**
   * Construct with the given maximum key length and case flag.
   * Defaults: no length limit, case not significant.
   */
  TrieVectorBase(size_t maxlen = NoMaxLen, bool use_case = false)
  {
    trie_maxlen   = maxlen;
    trie_use_case = use_case;
  }
};
00130
00131
00132
00133
00146 template<class DataT, typename CharT = char, typename UCharT = unsigned char>
00147 class TrieVector
00148 : public TrieVectorBase,
00149 public std::vector<TrieVectorNode<DataT,CharT,UCharT> >
00150 {
00151 public:
00152
00153
00154 typedef DataT data_type;
00155 typedef CharT char_type;
00156 typedef UCharT uchar_type;
00157
00158 typedef
00159 TrieVector<data_type,char_type,uchar_type>
00160 trie_type;
00161
00162 typedef
00163 TrieVectorNode<data_type,char_type,uchar_type>
00164 node_type;
00165
00166 typedef std::vector<node_type> vector_type;
00167
00168 typedef typename vector_type::iterator iterator;
00169 typedef typename vector_type::const_iterator const_iterator;
00170
00171 typedef std::string<char_type> string_type;
00172 typedef typename string_type::iterator string_iterator;
00173 typedef typename string_type::const_iterator const_string_iterator;
00174 typedef typename string_type::reverse_iterator reverse_string_iterator;
00175 typedef typename string_type::const_reverse_iterator const_reverse_string_iterator;
00176
00177 typedef std::map<string_type,NodeId> map_type;
00178 typedef typename map_type::iterator map_iterator;
00179 typedef typename map_type::const_iterator const_map_iterator;
00180
00181 public:
00182
00183
00184 map_type trie_pending;
00185 data_type trie_default_data;
00186
00187 public:
00188
00189
00190
00191
00193
00194
00195 TrieVector(size_t max_len=NoMaxLen, bool use_case=false)
00196 : TrieVectorBase(max_len,use_case)
00197 {};
00198
00200 ~TrieVector(void) {};
00201
00203 inline void clear(void)
00204 {
00205 vector_type::clear();
00206 trie_pending.clear();
00207
00208 };
00210
00211
00213
00214
00215 inline const size_t &maxlen(void) const
00216 { return trie_maxlen; };
00217
00219 inline size_t &maxlen(void)
00220 { return trie_maxlen; };
00221
00223 inline bool compiled(void) const
00224 { return !trie_pending.empty(); };
00225
00227 inline void ensure_compiled(void)
00228 { if (!compiled()) compile(); };
00229
00231 inline const DataT &default_data(void) const
00232 { return trie_default_data; };
00233
00235 inline DataT &default_data(void)
00236 { return trie_default_data; };
00238
00239
00241
00242
00244 inline string_type trie_canonicalize(string_type &s) const
00245 {
00246 if (!trie_use_case) {
00247 for (string_iterator si = s.begin(); si != s.end(); si++) {
00248 *si = tolower(*si);
00249 }
00250 }
00251 return s;
00252 };
00253
00255 inline void trie_key(const string_type &s,
00256 const size_t max_len,
00257 string_type &dst)
00258 const
00259 {
00260 dst.assign(s,0,max_len);
00261 trie_canonicalize(dst);
00262 };
00263
00265 inline string_type trie_key(const string_type &s, const size_t max_len)
00266 const
00267 {
00268 string_type key;
00269 trie_key(s,max_len,key);
00270 return key;
00271 };
00272
00274 inline string_type trie_key(const string_type &s) const
00275 { return trie_key(s,trie_maxlen); };
00276
00277
00279 inline void trie_rkey(const string_type &s,
00280 const size_t max_len,
00281 string_type &dst)
00282 const
00283 {
00284 dst.assign(s.rbegin(), s.rbegin() + (max_len > s.size() ?
00285 s.size() :
00286 max_len));
00287 trie_canonicalize(dst);
00288 };
00289
00291 inline string_type trie_rkey(const string_type &s, size_t max_len) const
00292 {
00293 string_type key;
00294 trie_rkey(s, max_len, key);
00295 return key;
00296 };
00297
00299 inline string_type trie_rkey(const string_type &s) const
00300 { return trie_rkey(s,trie_maxlen); };
00302
00303
00305
00306
00307 inline void trie_insert(const string_type &s, size_t max_len)
00308 { trie_pending[trie_key(s,max_len)] = 0; };
00309
00311 inline void trie_insert(const string_type &s)
00312 { trie_pending[trie_key(s,trie_maxlen)] = 0; };
00313
00315 inline void trie_rinsert(const string_type &s, size_t max_len)
00316 { trie_pending[trie_rkey(s,max_len)] = 0; };
00317
00319 inline void trie_rinsert(const string_type &s)
00320 { trie_pending[trie_rkey(s,trie_maxlen)] = 0; };
00322
00323
00324
00326
00327
00336 inline iterator find_dtr(const node_type &from, CharT label)
00337 {
00338 UCharT dn;
00339 iterator di;
00340 if (!trie_use_case) label = tolower(label);
00341 for (dn=0, di=begin()+from.mindtr; di != end() && dn < from.ndtrs; di++, dn++) {
00342 if (di->label == label) return di;
00343 }
00344 return end();
00345 };
00346
00348 inline const_iterator find_dtr(const node_type &from, CharT label) const
00349 {
00350 UCharT dn;
00351 const_iterator di;
00352 if (!trie_use_case) label = tolower(label);
00353 for (dn=0, di=begin()+from.mindtr; di != end() && dn < from.ndtrs; di++, dn++) {
00354 if (di->label == label) return di;
00355 }
00356 return end();
00357 };
00358
00360 inline NodeId find_dtr_id(NodeId fromid, CharT label) const
00361 {
00362 const_iterator di = find_dtr(*(begin()+fromid), label);
00363 return (di==end() ? NoNode : (di-begin()));
00364 };
00365
00366
00376 inline iterator first_dtr(const node_type &from)
00377 { return ( from.ndtrs == 0 ? end() : (begin()+from.mindtr) ); };
00378
00380 inline const_iterator first_dtr(const node_type &from) const
00381 { return ( from.ndtrs == 0 ? end() : (begin()+from.mindtr) ); };
00382
00392 inline iterator find_mother(const node_type &to)
00393 { return (to.mother == NoNode ? end() : (begin()+to.mother)); };
00394
00396 inline const_iterator find_mother(const node_type &to) const
00397 { return (to.mother == NoNode ? end() : (begin()+to.mother)); };
00398
00400 inline NodeId find_mother_id(NodeId toid) const
00401 { return (begin()+toid)->mother; };
00402
00404 inline string_type node_rstring(const node_type &node) const
00405 {
00406 if (node.mother == NoNode) return string_type();
00407 string_type s(1, node.label);
00408 const_iterator mi;
00409 for (mi=find_mother(node); mi != end() && mi->mother != NoNode; mi=find_mother(*mi)) {
00410 s.push_back(mi->label);
00411 }
00412 return s;
00413 };
00414
00416 inline string_type node_rstring(NodeId nodeid) const
00417 { return node_rstring(*(begin()+nodeid)); };
00418
00420 inline string_type node_string(const node_type &node) const
00421 {
00422 string_type s = node_rstring(node);
00423 reverse(s.begin(),s.end());
00424 return s;
00425 };
00426
00428 inline string_type node_string(NodeId nodeid) const
00429 { return node_string(*(begin()+nodeid)); };
00430
00431
00433 inline size_t node_depth(const node_type &node) const
00434 {
00435 size_t depth = 0;
00436 const_iterator mi;
00437 for (mi=find_mother(node); mi != end() && mi->mother != NoNode; mi=find_mother(*mi)) {
00438 ++depth;
00439 }
00440 return depth;
00441 };
00442
00444 inline size_t node_depth(NodeId nodeid) const
00445 { return node_depth(*(begin()+nodeid)); };
00447
00448
00449
00451
00452
00453 inline void compile(void)
00454 {
00455 vector_type::clear();
00456
00457 push_back(node_type(NoNode,NoNode,0,0,trie_default_data));
00458
00459 map_iterator pi;
00460 size_t pos;
00461 bool changed;
00462 char_type dlabel;
00463 NodeId dnodid;
00464
00465
00466 for (pos=0, changed=true; pos < trie_maxlen && changed; pos++) {
00467 changed = false;
00468
00469
00470 for (pi=trie_pending.begin(); pi != trie_pending.end(); pi++) {
00471 const string_type &kstr = pi->first;
00472 NodeId &knodid = pi->second;
00473 if (kstr.size() <= pos) continue;
00474
00475 dlabel = kstr[pos];
00476 dnodid = find_dtr_id(knodid,dlabel);
00477
00478 if (dnodid == NoNode) {
00479 dnodid = vector_type::size();
00480
00481
00482 push_back(node_type(knodid,
00483 NoNode,
00484 dlabel,
00485 0,
00486 trie_default_data));
00487
00488 node_type &mnode = operator[](knodid);
00489 ++mnode.ndtrs;
00490
00491 if (mnode.mindtr == NoNode)
00492 mnode.mindtr = dnodid;
00493
00494 changed = true;
00495 }
00496
00497 knodid = dnodid;
00498 }
00499 }
00500
00501 trie_pending.clear();
00502 };
00504
00505
00506
00508
00509
00519 inline iterator find_longest(const string_type &s,
00520 size_t *matchlen=NULL)
00521 {
00522 const_string_iterator si;
00523 iterator di, i = begin();
00524 size_t pos;
00525
00526 for (si = s.begin() , di = i , pos=0;
00527 si != s.end() && pos < trie_maxlen;
00528 si++ , i = di , pos++)
00529 {
00530 di = find_dtr(*di,*si);
00531 if (di==end()) break;
00532 }
00533 if (matchlen) *matchlen = pos;
00534 return i;
00535 };
00536
00538 inline const_iterator find_longest(const string_type &s,
00539 size_t *matchlen=NULL)
00540 const
00541 {
00542 const_string_iterator si;
00543 const_iterator di, i = begin();
00544 size_t pos;
00545
00546 for (si = s.begin() , di = i , pos=0;
00547 si != s.end() && pos < trie_maxlen;
00548 si++ , i = di , pos++)
00549 {
00550 di = find_dtr(*di,*si);
00551 if (di==end()) break;
00552 }
00553 if (matchlen) *matchlen = pos;
00554 return i;
00555 };
00556
00562 inline iterator rfind_longest(const string_type &s,
00563 size_t *matchlen=NULL)
00564 {
00565 const_reverse_string_iterator si;
00566 iterator di, i = begin();
00567 size_t pos;
00568
00569 for (si = s.rbegin() , di = i , pos=0;
00570 si != s.rend() && pos < trie_maxlen;
00571 si++ , i = di , pos++)
00572 {
00573 di = find_dtr(*di,*si);
00574 if (di==end()) break;
00575 }
00576 if (matchlen) *matchlen = pos;
00577 return i;
00578 };
00579
00581 inline const_iterator rfind_longest(const string_type &s,
00582 size_t *matchlen=NULL)
00583 const
00584 {
00585 const_reverse_string_iterator si;
00586 const_iterator di, i = begin();
00587 size_t pos;
00588
00589 for (si = s.rbegin() , di = i , pos=0;
00590 si != s.rend() && pos < trie_maxlen;
00591 si++ , i = di , pos++)
00592 {
00593 di = find_dtr(*di,*si);
00594 if (di==end()) break;
00595 }
00596 if (matchlen) *matchlen = pos;
00597 return i;
00598 };
00600
00601 #ifdef MOOT_TRIE_VECTOR_DEBUG
00602
00604
00605
00608 void dump(FILE *out, const CharT delim=0)
00609 {
00610 const_iterator i, mi;
00611 for (i = begin(); i != end(); i++) {
00612 string_type s = node_rstring(*i);
00613 if (s.empty()) continue;
00614 s.push_back(delim);
00615 fwrite(s.data(), sizeof(CharT), s.size(), out);
00616 }
00617 };
00618
00620 void bindump(FILE *out) {
00621 for (const_iterator i=begin(); i != end(); i++) {
00622 fwrite(&(*i), sizeof(node_type), 1, out);
00623 }
00624 };
00625
00627 void arcdump(FILE *out) {
00628 for (const_iterator i=begin(); i != end(); i++) {
00629 fprintf(out,"node=%u\t mom=%u\t mindtr=%u\t label=%c\t ndtrs=%u\n",
00630 i-begin(), i->mother, i->mindtr, i->label, i->ndtrs);
00631 }
00632 };
00634 #endif //-- /MOOT_TRIE_VECTOR_DEBUG
00635 };
00636
00637 };
00638
00639 #endif //-- MOOT_TRIE_VECTOR_H