00001 // ========== This file is under LGPL, the GNU Lesser General Public Licence 00002 // ========== Dialing Lemmatizer (www.aot.ru), 00003 // ========== Copyright by Alexey Sokirko (2004) 00004 00005 00006 // The main function of this header is CMorphAutomatBuilder::AddStringDaciuk 00007 // This function is an implementation of algorithm for constructing minimal, 00008 // deterministic, acyclic FSM from unordered set of strings, which was described 00009 // in "Jan Daciuk, Stoyan Mihov, Bruce Watson, and Richard Watson, 00010 // Incremental Construction of Minimal Acyclic Finite State Automata, 00011 // Computational Linguistics, 26(1), March 2000." 00012 00013 #ifndef MorphAutomBuilder_h 00014 #define MorphAutomBuilder_h 00015 00016 #include "MorphAutomat.h" 00017 00018 00019 00020 struct CTrieNodeBuild; 00021 struct IsLessRegister: public less<CTrieNodeBuild*> 00022 { 00023 bool operator ()(const CTrieNodeBuild* pNodeNo1, const CTrieNodeBuild* pNodeNo2) const; 00024 }; 00025 00026 typedef set<CTrieNodeBuild*, IsLessRegister> CTrieRegister; 00027 00028 00029 00030 00031 struct CTrieNodeBuild 00032 { 00033 bool m_bFinal; 00034 int m_IncomingRelationsCount; 00035 CTrieNodeBuild* m_Children[MaxAlphabetSize]; 00036 CTrieRegister::iterator m_pRegister; 00037 bool m_bRegistered; 00038 int m_NodeId; 00039 BYTE m_FirstChildNo; 00040 BYTE m_SecondChildNo; 00041 00042 00043 void Initialize(); 00044 void AddChild(CTrieNodeBuild* Child, BYTE ChildNo); 00045 void ModifyChild(CTrieNodeBuild* Child, BYTE ChildNo, bool bUpdateIncoming); 00046 CTrieNodeBuild* GetNextNode(BYTE RelationChar) const; 00047 void SetNodeIdNullRecursive (); 00048 void UnregisterRecursive(); 00049 00050 00051 // debug function 00052 bool CheckIncomingRelationsCountRecursive(map<const CTrieNodeBuild*, size_t>& Node2Incoming) const; 00053 void GetIncomingRelationsCountRecursive(map<const CTrieNodeBuild*, size_t>& Node2Incoming) const; 00054 bool CheckRegisterRecursive() const; 00055 void SetFinal(bool bFinal); 00056 }; 00057 00058 00059 00060 00061 class CMorphAutomatBuilder : public CMorphAutomat 00062 { 00063 00064 CTrieNodeBuild* m_pRoot; 00065 00066 // a hash by the letter of the first child and the number of children 00067 CTrieRegister m_RegisterHash[MaxAlphabetSize+1][MaxAlphabetSize+1]; 00068 00069 vector<CTrieNodeBuild*> m_Prefix; 00070 00071 vector<CTrieNodeBuild*> m_DeletedNodes; 00072 00073 void ClearBuildNodes(); 00074 00075 CTrieNodeBuild* CreateNode(); 00076 void DeleteNode(CTrieNodeBuild* pNode); 00077 CTrieNodeBuild* CloneNode(const CTrieNodeBuild* pPrototype); 00078 00079 void UpdateCommonPrefix(const string& WordForm); 00080 CTrieNodeBuild* AddSuffix(CTrieNodeBuild* pParentNodeNo, const char* WordForm); 00081 CTrieNodeBuild* ReplaceOrRegister(CTrieNodeBuild* pNode); 00082 void DeleteFromRegister(CTrieNodeBuild* pNode); 00083 int GetFirstConfluenceState() const; 00084 void UnregisterNode(CTrieNodeBuild* pNode); 00085 00086 CTrieRegister& GetRegister(const CTrieNodeBuild* pNode); 00087 00088 // debug functions 00089 bool CheckRegister() const; 00090 bool IsValid() const; 00091 00092 00093 public: 00094 CMorphAutomatBuilder(MorphLanguageEnum Language, BYTE AnnotChar); 00095 ~CMorphAutomatBuilder(); 00096 00097 void InitTrie(); 00098 bool AddStringDaciuk(const string& WordForm); 00099 void ClearRegister(); 00100 void ConvertBuildRelationsToRelations(); 00101 }; 00102 00103 00104 00105 #endif