Main Page | Directories | Namespace List | Class Hierarchy | Alphabetical List | Class List | File List | Namespace Members | Class Members | File Members

mootSuffixTrie.h

Go to the documentation of this file.
00001 /* -*- Mode: C++ -*- */
00002 
00003 /*
00004    libmoot : moocow's part-of-speech tagging library
00005    Copyright (C) 2003-2004 by Bryan Jurish <moocow@ling.uni-potsdam.de>
00006 
00007    This library is free software; you can redistribute it and/or
00008    modify it under the terms of the GNU Lesser General Public
00009    License as published by the Free Software Foundation; either
00010    version 2.1 of the License, or (at your option) any later version.
00011    
00012    This library is distributed in the hope that it will be useful,
00013    but WITHOUT ANY WARRANTY; without even the implied warranty of
00014    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
00015    Lesser General Public License for more details.
00016    
00017    You should have received a copy of the GNU Lesser General Public
00018    License along with this library; if not, write to the Free Software
00019    Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
00020 */
00021 
00022 /*--------------------------------------------------------------------------
00023  * File: mootSuffixTrie.h
00024  * Author: Bryan Jurish <moocow@ling.uni-potsdam.de>
00025  * Description:
00026  *   + moocow's PoS tagger : suffix trie
00027  *--------------------------------------------------------------------------*/
00028 
00029 #ifndef _MOOT_SUFFIX_TRIE_H
00030 #define _MOOT_SUFFIX_TRIE_H
00031 
00032 #include <mootTypes.h>
00033 #include <mootToken.h>
00034 #include <mootLexfreqs.h>
00035 #include <mootNgrams.h>
00036 #include <mootEnum.h>
00037 #include <mootAssocVector.h>
00038 #include <mootTrieVector.h>
00039 
00040 moot_BEGIN_NAMESPACE
00041 
00043 typedef AssocVector<mootEnumID,ProbT> SuffixTrieDataT;  // node data type of SuffixTrie's TrieVector base: an AssocVector parameterized on mootEnumID and ProbT
00044 
00046 //template<>
00047 class SuffixTrie : public TrieVector<SuffixTrieDataT>
00048 {
00049 public:
00050   //------------------------------------------------------------
00051   // SuffixTrie: Static Data
00052   // Default for the constructor's max_length argument.
00053   static const size_t SuffixTrieDefaultMaxLen = 0;
00054
00055 public:
00056   //------------------------------------------------------------
00057   // SuffixTrie: Types
00058
00059   // Tag identifier type.
00060   typedef mootEnumID              TagID;
00061
00062   // Tag table type taken by build() and txtdump().
00063   typedef mootEnum<mootTagString> TagIDTable;
00064
00065   // The TrieVector base class.
00066   typedef TrieVector<SuffixTrieDataT>  TrieType;
00067
00068 public:
00069   //------------------------------------------------------------
00070   // SuffixTrie: data
00071   CountT        maxcount;  // set from the constructor's max_count
00072   ProbT         theta;     // set to 0 by the constructor
00073
00074 public:
00075   //------------------------------------------------------------
00076   // SuffixTrie: Methods
00077
00078   //--------------------------------------------------
00079   // Construction / destruction
00080
00081   // Builds an empty trie: max_length and use_case go to the TrieVector base,
00082   // max_count to maxcount; theta starts at 0 so it is never read uninitialized.
00083   SuffixTrie(size_t max_length =SuffixTrieDefaultMaxLen,
00084              bool   use_case   =true,
00085              size_t max_count  =10)
00086     : TrieType(max_length,use_case),
00087       maxcount(max_count),
00088       theta(0)
00089   {}
00090
00091   ~SuffixTrie(void) {}
00092
00093   //--------------------------------------------------
00094   // Trie construction (declared here, defined elsewhere)
00095
00096   // Top-level build.  NOTE(review): presumably runs the _build_*() steps
00097   // declared below -- confirm against the implementation file.
00098   bool build(const mootLexfreqs &lf,
00099              const mootNgrams   &ng,
00100              const TagIDTable   &tagids,
00101              TagID eos_tagid,
00102              bool  verbose=false);
00103
00104   // Individual build steps; see the implementation file for what each does.
00105   bool _build_insert(const mootLexfreqs &lf);
00106
00107   bool _build_assoc(const mootLexfreqs &lf, const TagIDTable &tagids);
00108
00109   bool _build_compute_theta(const mootLexfreqs &lf,
00110                             const mootNgrams   &ng,
00111                             const TagIDTable   &tagids,
00112                             TagID eos_tagid);
00113
00114   bool _build_compute_mles(const mootLexfreqs &lf,
00115                            const mootNgrams   &ng,
00116                            const TagIDTable   &tagids,
00117                            TagID eos_tagid);
00118
00119   bool _build_invert_mles(const mootNgrams &ng,
00120                           const TagIDTable &tagids,
00121                           TagID eos_tagid);
00122
00123   //--------------------------------------------------
00124   // Lookup
00125
00126   // Starting at dtr, follows find_mother() links until a node with non-empty
00127   // data, or end(), is reached, and returns that position.  If matchlen is
00128   // non-NULL, *matchlen is reset to 0 and decremented once per link followed,
00129   // so it receives the negated step count modulo the range of size_t.
00130   inline iterator find_ancestor_nonempty(iterator dtr, size_t *matchlen=NULL)
00131   {
00132     if (matchlen) *matchlen = 0;
00133     for ( ; dtr != end() && dtr->data.empty(); dtr=find_mother(*dtr)) {
00134       if (matchlen) (*matchlen)--;
00135     }
00136     return dtr;
00137   }
00138
00139   // Const counterpart of find_ancestor_nonempty(); same matchlen contract.
00140   inline const_iterator const_find_ancestor_nonempty(const_iterator dtr, size_t *matchlen=NULL)
00141     const
00142   {
00143     if (matchlen) *matchlen = 0;
00144     for ( ; dtr != end() && dtr->data.empty(); dtr=find_mother(*dtr)) {
00145       if (matchlen) (*matchlen)--;
00146     }
00147     return dtr;
00148   }
00149
00150   // rfind_longest() followed by a back-off to the nearest ancestor with
00151   // non-empty data.  If matchlen is non-NULL, *matchlen receives the length
00152   // stored by rfind_longest() minus the number of back-off steps (floor 0).
00153   // The back-off counts into a separate variable so it cannot clobber that
00154   // length.  NOTE(review): assumes each find_mother() step drops one character.
00155   inline iterator rfind_longest_nonempty(const mootTokString &tokstr,
00156                                          size_t              *matchlen=NULL)
00157   {
00158     size_t len = 0, delta = 0;  // delta: negated back-off step count
00159     iterator ti = find_ancestor_nonempty(rfind_longest(tokstr,&len),&delta);
00160     if (matchlen) *matchlen = ((size_t)0 - delta < len ? len + delta : 0);
00161     return ti;
00162   }
00163
00164   // Const overload of rfind_longest_nonempty(); same matchlen contract.
00165   inline const_iterator rfind_longest_nonempty(const mootTokString &tokstr,
00166                                                size_t              *matchlen=NULL)
00167     const
00168   {
00169     size_t len = 0, delta = 0;  // delta: negated back-off step count
00170     const_iterator ti = const_find_ancestor_nonempty(rfind_longest(tokstr,&len),&delta);
00171     if (matchlen) *matchlen = ((size_t)0 - delta < len ? len + delta : 0);
00172     return ti;
00173   }
00174
00175   // Returns the data of the node found by the const rfind_longest_nonempty(),
00176   // or default_data() if that lookup yields end().
00177   inline const SuffixTrieDataT& sufprobs(const mootTokString &tokstr, size_t *matchlen=NULL)
00178     const
00179   {
00180     const_iterator ti = rfind_longest_nonempty(tokstr,matchlen);
00181     return (ti==end() ? default_data() : ti->data);
00182   }
00183
00184   //--------------------------------------------------
00185   // Text dump (declaration only; the definition is not part of this header)
00186   void txtdump(FILE *out, const TagIDTable &tagids) const;
00189
00190 }; //-- /class SuffixTrie
00191 
00192 moot_END_NAMESPACE
00193 
00194 #endif // _MOOT_SUFFIX_TRIE_H

Generated on Mon Jun 27 13:05:25 2005 for libmoot by doxygen 1.3.8-20040913