Go to the documentation of this file.00001
00002
00003
00004
00005 #ifndef MorphAutomat_h
00006 #define MorphAutomat_h
00007
00008 #include "../common/util_classes.h"
00009
// Upper bound on the number of alphabet codes. It is the array bound of the
// code-to-character tables declared in CABCEncoder (m_Code2Alphabet and
// m_Code2AlphabetWithoutAnnotator). The larger value is selected when
// DDC_USE_UTF8 evaluates to non-zero.
#if !DDC_USE_UTF8
const size_t MaxAlphabetSize = 54;
#else
const size_t MaxAlphabetSize = 182;
#endif

// NOTE(review): presumably the number of entries kept in
// CMorphAutomat::m_ChildrenCache -- confirm against BuildChildrenCache().
const size_t ChildrenCacheSize = 1000;
00018
00019 struct CMorphAutomNode
00020 {
00021
00022
00023
00024 DWORD m_Data;
00025
00026 DWORD GetChildrenStart() const
00027 {
00028 return m_Data&(0x80000000-1);
00029 };
00030 bool IsFinal() const
00031 {
00032 return (m_Data&0x80000000) > 0;
00033 };
00034 void SetChildrenStart(DWORD v)
00035 {
00036 m_Data = (0x80000000&m_Data)|v;
00037 };
00038 void SetFinal(bool v)
00039 {
00040 if (v)
00041 m_Data |= 0x80000000;
00042 else
00043 m_Data &= 0x80000000-1;
00044 };
00045
00046 };
00047
00048 struct CMorphAutomRelation
00049 {
00050
00051
00052
00053 DWORD m_Data;
00054
00055 DWORD GetChildNo() const
00056 {
00057 return m_Data & 0xffffff;
00058 };
00059 BYTE GetRelationalChar() const
00060 {
00061 return m_Data>>24;
00062 };
00063
00064 void SetChildNo(DWORD v)
00065 {
00066 m_Data = (0xff000000&m_Data)|v;
00067 };
00068
00069 void SetRelationalChar(BYTE v)
00070 {
00071 m_Data = (0xffffff&m_Data)| (v<<24);
00072 };
00073
00074 };
00075
00076 struct CAutomAnnotationInner
00077 {
00078
00079 WORD m_ModelNo;
00080 WORD m_ItemNo;
00081 WORD m_PrefixNo;
00082
00083
00084 int m_LemmaInfoNo;
00085 int m_nWeight;
00086
00087 DWORD GetParadigmId() const
00088 {
00089 return (m_PrefixNo<<23) | m_LemmaInfoNo;
00090 }
00091 void SplitParadigmId(DWORD value)
00092 {
00093 m_PrefixNo = value>>23;
00094 m_LemmaInfoNo = value&0x7fffff;
00095 }
00096 };
00097 const size_t MinimalPredictionSuffix = 3;
00098 const BYTE MorphAnnotChar = '+';
00099 class CABCEncoder
00100 {
00101 public:
00102
00103 MorphLanguageEnum m_Language;
00104 const BYTE m_AnnotChar;
00105 int m_AlphabetSize;
00106 int m_Alphabet2Code[256];
00107 int m_Code2Alphabet[MaxAlphabetSize];
00108
00109
00110 int m_AlphabetSizeWithoutAnnotator;
00111 int m_Alphabet2CodeWithoutAnnotator[256];
00112 int m_Code2AlphabetWithoutAnnotator[MaxAlphabetSize];
00113
00114 bool CheckABCWithAnnotator(const string& WordForm) const;
00115 bool CheckABCWithoutAnnotator(const string& WordForm) const;
00116 string EncodeIntToAlphabet(DWORD v) const;
00117 DWORD DecodeFromAlphabet(const string& v) const;
00118 string GetCriticalNounLetterPack() const;
00119 CABCEncoder(MorphLanguageEnum Language, BYTE AnnotChar);
00120
00121
00122 };
00123
00124 class CMorphAutomat : public CABCEncoder
00125 {
00126
00127 protected:
00128
00129 CMorphAutomNode* m_pNodes;
00130 size_t m_NodesCount;
00131
00132 CMorphAutomRelation* m_pRelations;
00133 size_t m_RelationsCount;
00134
00135 vector<int> m_ChildrenCache;
00136
00137
00138
00139 void DumpAllStringsRecursive(FILE* fp, int NodeNo, string CurrPath) const;
00140 void BuildChildrenCache();
00141 void GetAllMorphInterpsRecursive (int NodeNo, string& curr_path, vector<CAutomAnnotationInner>& Infos) const;
00142 int FindStringAndPassAnnotChar (const string& Text, size_t TextPos) const;
00143 void Clear();
00144
00145 public:
00146
00147 CMorphAutomat(MorphLanguageEnum Language, BYTE AnnotChar);
00148 ~CMorphAutomat();
00149 bool Load(string GrammarFileName);
00150 bool Save(string GrammarFileName) const;
00151 bool DumpAllStrings(string FileName) const;
00152 void GetInnerMorphInfos (const string& Text, size_t TextPos, vector<CAutomAnnotationInner>& Infos) const;
00153 const CMorphAutomRelation* GetChildren(size_t NodeNo) const;
00154 int NextNode(int NodeNo, BYTE Child) const;
00155 size_t GetChildrenCount(size_t NodeNo) const;
00156 const CMorphAutomNode& GetNode(int NodeNo) const { return m_pNodes[NodeNo];} ;
00157 DWORD EncodeMorphAutomatInfo (size_t ModelNo, size_t ItemNo, size_t PrefixNo) const;
00158 void DecodeMorphAutomatInfo (DWORD Info, size_t& ModelNo, size_t& ItemNo, size_t& PrefixNo) const;
00159 string GetFirstResult (const string& Text) const;
00160
00161 };
00162
00163
00164
00165 #endif