mootSuffixTrie.h
Go to the documentation of this file.
1 /* -*- Mode: C++ -*- */
2 
3 /*
4  libmoot : moocow's part-of-speech tagging library
5  Copyright (C) 2003-2009 by Bryan Jurish <moocow@cpan.org>
6 
7  This library is free software; you can redistribute it and/or
8  modify it under the terms of the GNU Lesser General Public
9  License as published by the Free Software Foundation; either
10  version 3 of the License, or (at your option) any later version.
11 
12  This library is distributed in the hope that it will be useful,
13  but WITHOUT ANY WARRANTY; without even the implied warranty of
14  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15  Lesser General Public License for more details.
16 
17  You should have received a copy of the GNU Lesser General Public
18  License along with this library; if not, write to the Free Software
19  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
20 */
21 
22 /*--------------------------------------------------------------------------
23  * File: mootSuffixTrie.h
24  * Author: Bryan Jurish <moocow@cpan.org>
25  * Description:
26  * + moocow's PoS tagger : suffix trie
27  *--------------------------------------------------------------------------*/
28 
34 #ifndef _MOOT_SUFFIX_TRIE_H
35 #define _MOOT_SUFFIX_TRIE_H
36 
37 #include <mootEnum.h>
38 #include <mootLexfreqs.h>
39 #include <mootNgrams.h>
40 #include <mootAssocVector.h>
41 #include <mootTrieVector.h>
42 
43 moot_BEGIN_NAMESPACE
44 
47 
49 //template<>
50 class SuffixTrie : public TrieVector<SuffixTrieDataT>
51 {
52 public:
53  //------------------------------------------------------------
54  // SuffixTrie: Static Data
55  //static const size_t SuffixTrieDefaultMaxLen = 10;
56  static const size_t SuffixTrieDefaultMaxLen = 0;
57 
58 public:
59  //------------------------------------------------------------
60  // SuffixTrie: Types
61 
63  typedef mootEnumID TagID;
64 
67 
70 
71 public:
72  //------------------------------------------------------------
73  // SuffixTrie: data
74  CountT maxcount;
75  ProbT theta;
76 
77 
78 public:
79  //------------------------------------------------------------
80  // SuffixTrie: Methods
81 
82  //--------------------------------------------------
84 
85 
86  SuffixTrie(size_t max_length =SuffixTrieDefaultMaxLen,
87  bool use_case =true,
88  size_t max_count =10)
89  : TrieType(max_length,use_case),
90  maxcount(max_count)
91  {};
92 
94  ~SuffixTrie(void) {};
96 
97  //--------------------------------------------------
99 
100 
102  bool build(const mootLexfreqs &lf,
103  const mootNgrams &ng,
104  const TagIDTable &tagids,
105  TagID eos_tagid,
106  bool verbose=false);
107 
110  bool _build_insert(const mootLexfreqs &lf);
111 
114  bool _build_assoc(const mootLexfreqs &lf, const TagIDTable &tagids);
115 
118  bool _build_compute_theta(const mootLexfreqs &lf,
119  const mootNgrams &ng,
120  const TagIDTable &tagids,
121  TagID eos_tagid);
122 
125  bool _build_compute_mles(const mootLexfreqs &lf,
126  const mootNgrams &ng,
127  const TagIDTable &tagids,
128  TagID eos_tagid);
129 
133  bool _build_invert_mles(const mootNgrams &ng,
134  const TagIDTable &tagids,
135  TagID eos_tagid);
137 
138  //--------------------------------------------------
140 
141 
143  inline iterator find_ancestor_nonempty(iterator dtr, size_t *matchlen=NULL)
144  {
145  if (matchlen) *matchlen = 0;
146  for ( ; dtr != end() && dtr->data.empty(); dtr=find_mother(*dtr)) {
147  if (matchlen) (*matchlen)--;
148  }
149  return dtr;
150  };
151 
154  inline const_iterator const_find_ancestor_nonempty(const_iterator dtr, size_t *matchlen=NULL)
155  const
156  {
157  if (matchlen) *matchlen = 0;
158  for ( ; dtr != end() && dtr->data.empty(); dtr=find_mother(*dtr)) {
159  if (matchlen) (*matchlen)--;
160  }
161  return dtr;
162  };
163 
164 
167  size_t *matchlen=NULL)
168  { return find_ancestor_nonempty(rfind_longest(tokstr,matchlen),matchlen); };
169 
171  inline const_iterator rfind_longest_nonempty(const mootTokString &tokstr,
172  size_t *matchlen=NULL)
173  const
174  { return const_find_ancestor_nonempty(rfind_longest(tokstr,matchlen),matchlen); };
175 
176 
179  inline const SuffixTrieDataT& sufprobs(const mootTokString &tokstr, size_t *matchlen=NULL)
180  const
181  {
182  const_iterator ti = rfind_longest_nonempty(tokstr,matchlen);
183  return (ti==end() ? default_data() : ti->data);
184  };
186 
187  //--------------------------------------------------
189 
190  void txtdump(FILE *out, const TagIDTable &tagids) const;
192 
193 }; //-- /class SuffixTrie
194 
195 moot_END_NAMESPACE
196 
197 #endif // _MOOT_SUFFIX_TRIE_H
iterator rfind_longest_nonempty(const mootTokString &tokstr, size_t *matchlen=__null)
Definition: mootSuffixTrie.h:166
Top-level class for suffix tries.
Definition: mootSuffixTrie.h:46
Top-level trie class-template using an adjaceny table.
Definition: mootTrieVector.h:143
mootEnumID TagID
Definition: mootSuffixTrie.h:59
HMM training data: lexical frequencies: raw.
vector_type::iterator iterator
Definition: mootTrieVector.h:164
LISP-style assoc list using vector<>: map-like class with small memory footprint. Useful for small as...
Definition: mootAssocVector.h:130
Definition: mootEnum.h:67
AssocVector< mootEnumID, ProbT > SuffixTrieDataT
Definition: mootSuffixTrie.h:42
trie implementation using std::vector<> for underlying storage
Class for storage & retrieval of raw N-Gram frequencies.
Definition: mootNgrams.h:44
Class for storage and retrieval of raw lexical frequencies.
Definition: mootLexfreqs.h:44
float ProbT
Definition: mootTypes.h:63
vector_type::const_iterator const_iterator
Definition: mootTrieVector.h:165
runtime enumerations (1-1 maps: symbolic identifiers <-> unsigned integers)
ProbT CountT
Definition: mootTypes.h:67
HMM training data: n-gram frequencies: raw.
LISP-style assoc vectors.
moot::UInt mootEnumID
Definition: mootEnum.h:45
string mootTokString
Definition: mootToken.h:62