00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022
00023
00024
00025
00026
00027
00028
00029 #ifndef MOOT_TRIE_VECTOR_H
00030 #define MOOT_TRIE_VECTOR_H
00031
00032 #undef MOOT_TRIE_VECTOR_DEBUG
00033
00034 #ifdef MOOT_TRIE_VECTOR_DEBUG
00035 # include <stdio.h>
00036 # include <stdlib.h>
00037 #endif
00038
00039 #include <ctype.h>
00040 #include <vector>
00041 #include <string>
00042 #include <map>
00043
00044 namespace moot {
00045
00046
00047
00048 struct TrieVectorNodeBase
00049 {
00050 size_t mother;
00051 size_t mindtr;
00052
00053 TrieVectorNodeBase(size_t mother_index=0, size_t mindtr_index=0)
00054 : mother(mother_index), mindtr(mindtr_index)
00055 {};
00056 };
00057
00058
00059
00061 template <typename DataT, typename CharT = char, typename UCharT = unsigned char>
00062 struct TrieVectorNode : public TrieVectorNodeBase
00063 {
00064 typedef DataT data_type;
00065 typedef CharT char_type;
00066 typedef UCharT uchar_type;
00067 typedef TrieVectorNode<data_type,char_type,uchar_type> node_type;
00068
00069 CharT label;
00070 UCharT ndtrs;
00071 DataT data;
00072
00073 TrieVectorNode(size_t mother_index=0,
00074 size_t mindtr_index=0,
00075 CharT node_label=0,
00076 UCharT node_ndtrs=0)
00077 : TrieVectorNodeBase(mother_index, mindtr_index),
00078 label(node_label),
00079 ndtrs(node_ndtrs)
00080 {};
00081
00082 TrieVectorNode(size_t mother_index,
00083 size_t mindtr_index,
00084 CharT node_label,
00085 UCharT node_ndtrs,
00086 const DataT &node_data)
00087 : TrieVectorNodeBase(mother_index, mindtr_index),
00088 label(node_label),
00089 ndtrs(node_ndtrs),
00090 data(node_data)
00091 {};
00092
00093 inline bool operator< (const TrieVectorNode &x) const
00094 { return mother < x.mother || label < x.label; };
00095
00096 inline bool operator<= (const TrieVectorNode &x) const
00097 { return mother <= x.mother || label <= x.label; };
00098
00099 inline bool operator== (const TrieVectorNode &x) const
00100 { return mother==x.mother && label==x.label; };
00101 };
00102
00103
00104
00105
00107 class TrieVectorBase {
00108 public:
00110 typedef size_t NodeId;
00111
00112 public:
00114 static const NodeId NoNode = (size_t)-1;
00115
00117 static const size_t NoMaxLen = (size_t)-1;
00118
00119 public:
00120 size_t trie_maxlen;
00121 bool trie_use_case;
00122
00123 public:
00124 TrieVectorBase(size_t maxlen=NoMaxLen, bool use_case=false)
00125 : trie_maxlen(maxlen),
00126 trie_use_case(use_case)
00127 {};
00128
00129 };
00130
00131
00132
00133
00146 template<class DataT, typename CharT = char, typename UCharT = unsigned char>
00147 class TrieVector
00148 : public TrieVectorBase,
00149 public std::vector<TrieVectorNode<DataT,CharT,UCharT> >
00150 {
00151 public:
00152
00153
00154 typedef DataT data_type;
00155 typedef CharT char_type;
00156 typedef UCharT uchar_type;
00157
00158 typedef
00159 TrieVector<data_type,char_type,uchar_type>
00160 trie_type;
00161
00162 typedef
00163 TrieVectorNode<data_type,char_type,uchar_type>
00164 node_type;
00165
00166 typedef std::vector<node_type> vector_type;
00167
00168 typedef typename vector_type::iterator iterator;
00169 typedef typename vector_type::const_iterator const_iterator;
00170
00171
00172
00173 typedef std::string string_type;
00174
00175 typedef typename string_type::iterator string_iterator;
00176 typedef typename string_type::const_iterator const_string_iterator;
00177 typedef typename string_type::reverse_iterator reverse_string_iterator;
00178 typedef typename string_type::const_reverse_iterator const_reverse_string_iterator;
00179
00180 typedef std::map<string_type,NodeId> map_type;
00181 typedef typename map_type::iterator map_iterator;
00182 typedef typename map_type::const_iterator const_map_iterator;
00183
00184 public:
00185
00186
00187 map_type trie_pending;
00188 data_type trie_default_data;
00189
00190 public:
00191
00192
00193
00194
00196
00197
00198 TrieVector(size_t max_len=NoMaxLen, bool use_case=false)
00199 : TrieVectorBase(max_len,use_case)
00200 {};
00201
00203 ~TrieVector(void) {};
00204
00206 inline void clear(void)
00207 {
00208 vector_type::clear();
00209 trie_pending.clear();
00210
00211 };
00213
00214
00216
00217
00218 inline const size_t &maxlen(void) const
00219 { return trie_maxlen; };
00220
00222 inline size_t &maxlen(void)
00223 { return trie_maxlen; };
00224
00226 inline bool compiled(void) const
00227 { return !trie_pending.empty(); };
00228
00230 inline void ensure_compiled(void)
00231 { if (!compiled()) compile(); };
00232
00234 inline const DataT &default_data(void) const
00235 { return trie_default_data; };
00236
00238 inline DataT &default_data(void)
00239 { return trie_default_data; };
00241
00242
00244
00245
00247 inline string_type trie_canonicalize(string_type &s) const
00248 {
00249 if (!trie_use_case) {
00250 for (string_iterator si = s.begin(); si != s.end(); si++) {
00251 *si = tolower(*si);
00252 }
00253 }
00254 return s;
00255 };
00256
00258 inline void trie_key(const string_type &s,
00259 const size_t max_len,
00260 string_type &dst)
00261 const
00262 {
00263 dst.assign(s,0,max_len);
00264 trie_canonicalize(dst);
00265 };
00266
00268 inline string_type trie_key(const string_type &s, const size_t max_len)
00269 const
00270 {
00271 string_type key;
00272 trie_key(s,max_len,key);
00273 return key;
00274 };
00275
00277 inline string_type trie_key(const string_type &s) const
00278 { return trie_key(s,trie_maxlen); };
00279
00280
00282 inline void trie_rkey(const string_type &s,
00283 const size_t max_len,
00284 string_type &dst)
00285 const
00286 {
00287 dst.assign(s.rbegin(), s.rbegin() + (max_len > s.size() ?
00288 s.size() :
00289 max_len));
00290 trie_canonicalize(dst);
00291 };
00292
00294 inline string_type trie_rkey(const string_type &s, size_t max_len) const
00295 {
00296 string_type key;
00297 trie_rkey(s, max_len, key);
00298 return key;
00299 };
00300
00302 inline string_type trie_rkey(const string_type &s) const
00303 { return trie_rkey(s,trie_maxlen); };
00305
00306
00308
00309
00310 inline void trie_insert(const string_type &s, size_t max_len)
00311 { trie_pending[trie_key(s,max_len)] = 0; };
00312
00314 inline void trie_insert(const string_type &s)
00315 { trie_pending[trie_key(s,trie_maxlen)] = 0; };
00316
00318 inline void trie_rinsert(const string_type &s, size_t max_len)
00319 { trie_pending[trie_rkey(s,max_len)] = 0; };
00320
00322 inline void trie_rinsert(const string_type &s)
00323 { trie_pending[trie_rkey(s,trie_maxlen)] = 0; };
00325
00326
00327
00329
00330
00339 inline iterator find_dtr(const node_type &from, CharT label)
00340 {
00341 UCharT dn;
00342 iterator di;
00343 if (!trie_use_case) label = tolower(label);
00344 for (dn=0, di=this->begin()+from.mindtr; di != this->end() && dn < from.ndtrs; di++, dn++) {
00345 if (di->label == label) return di;
00346 }
00347 return this->end();
00348 };
00349
00351 inline const_iterator find_dtr(const node_type &from, CharT label) const
00352 {
00353 UCharT dn;
00354 const_iterator di;
00355 if (!trie_use_case) label = tolower(label);
00356 for (dn=0, di=this->begin()+from.mindtr; di != this->end() && dn < from.ndtrs; di++, dn++) {
00357 if (di->label == label) return di;
00358 }
00359 return this->end();
00360 };
00361
00363 inline NodeId find_dtr_id(NodeId fromid, CharT label) const
00364 {
00365 const_iterator di = find_dtr(*(this->begin()+fromid), label);
00366 return (di==this->end() ? NoNode : (di-this->begin()));
00367 };
00368
00369
00379 inline iterator first_dtr(const node_type &from)
00380 { return ( from.ndtrs == 0 ? this->end() : (this->begin()+from.mindtr) ); };
00381
00383 inline const_iterator first_dtr(const node_type &from) const
00384 { return ( from.ndtrs == 0 ? this->end() : (this->begin()+from.mindtr) ); };
00385
00395 inline iterator find_mother(const node_type &to)
00396 { return (to.mother == NoNode ? this->end() : (this->begin()+to.mother)); };
00397
00399 inline const_iterator find_mother(const node_type &to) const
00400 { return (to.mother == NoNode ? this->end() : (this->begin()+to.mother)); };
00401
00403 inline NodeId find_mother_id(NodeId toid) const
00404 { return (this->begin()+toid)->mother; };
00405
00407 inline string_type node_rstring(const node_type &node) const
00408 {
00409 if (node.mother == NoNode) return string_type();
00410 string_type s(1, node.label);
00411 const_iterator mi;
00412 for (mi=find_mother(node); mi != this->end() && mi->mother != NoNode; mi=find_mother(*mi)) {
00413 s.push_back(mi->label);
00414 }
00415 return s;
00416 };
00417
00419 inline string_type node_rstring(NodeId nodeid) const
00420 { return node_rstring(*(this->begin()+nodeid)); };
00421
00423 inline string_type node_string(const node_type &node) const
00424 {
00425 string_type s = node_rstring(node);
00426 reverse(s.begin(),s.end());
00427 return s;
00428 };
00429
00431 inline string_type node_string(NodeId nodeid) const
00432 { return node_string(*(this->begin()+nodeid)); };
00433
00434
00436 inline size_t node_depth(const node_type &node) const
00437 {
00438 size_t depth = 0;
00439 const_iterator mi;
00440 for (mi=find_mother(node); mi != this->end() && mi->mother != NoNode; mi=find_mother(*mi)) {
00441 ++depth;
00442 }
00443 return depth;
00444 };
00445
00447 inline size_t node_depth(NodeId nodeid) const
00448 { return node_depth(*(this->begin()+nodeid)); };
00450
00451
00452
00454
00455
00456 inline void compile(void)
00457 {
00458 vector_type::clear();
00459
00460 push_back(node_type(NoNode,NoNode,0,0,trie_default_data));
00461
00462 map_iterator pi;
00463 size_t pos;
00464 bool changed;
00465 char_type dlabel;
00466 NodeId dnodid;
00467
00468
00469 for (pos=0, changed=true; pos < trie_maxlen && changed; pos++) {
00470 changed = false;
00471
00472
00473 for (pi=trie_pending.begin(); pi != trie_pending.end(); pi++) {
00474 const string_type &kstr = pi->first;
00475 NodeId &knodid = pi->second;
00476 if (kstr.size() <= pos) continue;
00477
00478 dlabel = kstr[pos];
00479 dnodid = find_dtr_id(knodid,dlabel);
00480
00481 if (dnodid == NoNode) {
00482 dnodid = vector_type::size();
00483
00484
00485 push_back(node_type(knodid,
00486 NoNode,
00487 dlabel,
00488 0,
00489 trie_default_data));
00490
00491 node_type &mnode = this->operator[](knodid);
00492 ++mnode.ndtrs;
00493
00494 if (mnode.mindtr == NoNode)
00495 mnode.mindtr = dnodid;
00496
00497 changed = true;
00498 }
00499
00500 knodid = dnodid;
00501 }
00502 }
00503
00504 trie_pending.clear();
00505 };
00507
00508
00509
00511
00512
00522 inline iterator find_longest(const string_type &s,
00523 size_t *matchlen=NULL)
00524 {
00525 const_string_iterator si;
00526 iterator di, i = this->begin();
00527 size_t pos;
00528
00529 for (si = s.begin() , di = i , pos=0;
00530 si != s.end() && pos < trie_maxlen;
00531 si++ , i = di , pos++)
00532 {
00533 di = find_dtr(*di,*si);
00534 if (di==this->end()) break;
00535 }
00536 if (matchlen) *matchlen = pos;
00537 return i;
00538 };
00539
00541 inline const_iterator find_longest(const string_type &s,
00542 size_t *matchlen=NULL)
00543 const
00544 {
00545 const_string_iterator si;
00546 const_iterator di, i = this->begin();
00547 size_t pos;
00548
00549 for (si = s.begin() , di = i , pos=0;
00550 si != s.end() && pos < trie_maxlen;
00551 si++ , i = di , pos++)
00552 {
00553 di = find_dtr(*di,*si);
00554 if (di==this->end()) break;
00555 }
00556 if (matchlen) *matchlen = pos;
00557 return i;
00558 };
00559
00565 inline iterator rfind_longest(const string_type &s,
00566 size_t *matchlen=NULL)
00567 {
00568 const_reverse_string_iterator si;
00569 iterator di, i = this->begin();
00570 size_t pos;
00571
00572 for (si = s.rbegin() , di = i , pos=0;
00573 si != s.rend() && pos < trie_maxlen;
00574 si++ , i = di , pos++)
00575 {
00576 di = find_dtr(*di,*si);
00577 if (di==this->end()) break;
00578 }
00579 if (matchlen) *matchlen = pos;
00580 return i;
00581 };
00582
00584 inline const_iterator rfind_longest(const string_type &s,
00585 size_t *matchlen=NULL)
00586 const
00587 {
00588 const_reverse_string_iterator si;
00589 const_iterator di, i = this->begin();
00590 size_t pos;
00591
00592 for (si = s.rbegin() , di = i , pos=0;
00593 si != s.rend() && pos < trie_maxlen;
00594 si++ , i = di , pos++)
00595 {
00596 di = find_dtr(*di,*si);
00597 if (di==this->end()) break;
00598 }
00599 if (matchlen) *matchlen = pos;
00600 return i;
00601 };
00603
00604 #ifdef MOOT_TRIE_VECTOR_DEBUG
00605
00607
00608
00609 * using @delim as a delmiter
00610 */
00611 void dump(FILE *out, const CharT delim=0)
00612 {
00613 const_iterator i, mi;
00614 for (i = this->begin(); i != this->end(); i++) {
00615 string_type s = node_rstring(*i);
00616 if (s.empty()) continue;
00617 s.push_back(delim);
00618 fwrite(s.data(), sizeof(CharT), s.size(), out);
00619 }
00620 };
00621
00623 void bindump(FILE *out) {
00624 for (const_iterator i=this->begin(); i != this->end(); i++) {
00625 fwrite(&(*i), sizeof(node_type), 1, out);
00626 }
00627 };
00628
00630 void arcdump(FILE *out) {
00631 for (const_iterator i=this->begin(); i != this->end(); i++) {
00632 fprintf(out,"node=%u\t mom=%u\t mindtr=%u\t label=%c\t ndtrs=%u\n",
00633 i-this->begin(), i->mother, i->mindtr, i->label, i->ndtrs);
00634 }
00635 };
00637 #endif //-- /MOOT_TRIE_VECTOR_DEBUG
00638 };
00639
00640 };
00641
00642 #endif //-- MOOT_TRIE_VECTOR_H