Go to the documentation of this file.00001
00002
00003
00004
00005
00006 #ifndef __LEMMATIZERS_H_
00007 #define __LEMMATIZERS_H_
00008
00009 #pragma warning (disable : 4786)
00010
00011 #include "MorphDict.h"
00012 #include "Paradigm.h"
00013 #include "Statistic.h"
00014 #include "Predict.h"
00015 #include "../common/ddcIconv.h"
00016
00017 #pragma warning (disable : 4250)
00018
00019 class CGraphmatFile;
00020
// Subject areas accepted by CLemmatizer::LoadStatisticRegistry().
// The enumerators have the distinct values 1, 2 and 4.
// NOTE(review): the power-of-two values look like bit flags, but nothing in
// this header combines them - confirm before OR-ing values together.
enum SubjectEnum
{
    subjFinance    = 1,
    subjComputer   = 2,
    subjLiterature = 4
};
00022
00023
00024 class CLemmatizer : public CMorphDict
00025 {
00026 protected:
00027 string m_Registry;
00028
00029
00030 set<string> m_HyphenPostfixes;
00031
00032 set<string> m_HyphenPrefixes;
00033
00034 CStatistic m_Statistic;
00035 CPredictBase m_Predict;
00036 set<string> m_PrefixesSet;
00037
00038
00039 virtual void FilterSrc(string& src) const = 0;
00040
00041 string GetRegistryString() const {return m_Registry; };
00042 string GetPath() const;
00043 void ReadOptions(string FileName);
00044
00045
00046 bool LemmatizeWord(string& InputWordStr, const bool cap, const bool predict, vector<CAutomAnnotationInner>& results, bool bGetLemmaInfos) const;
00047 void AssignWeightIfNeed(vector<CAutomAnnotationInner>& FindResults) const;
00048
00049
00050 bool CheckAbbreviation(string InputWordStr,vector<CAutomAnnotationInner>& FindResults, bool is_cap) const;
00051 CAutomAnnotationInner ConvertPredictTupleToAnnot(const CPredictTuple& input) const;
00052 void PredictByDataBase(string InputWordStr, vector<CAutomAnnotationInner>& results,bool is_cap) const;
00053 bool IsPrefix(const string& Prefix) const;
00054
00055
00056 public:
00057 bool m_bLoaded;
00058 bool m_bMaximalPrediction;
00059 bool m_bUseStatistic;
00060 bool m_bAllowRussianJo;
00061
00062
00063 public:
00064
00065 std::string m_enc_int;
00066 std::string m_enc_ext;
00067 ddcIconv *m_ic_ext2int;
00068 ddcIconv *m_ic_int2ext;
00069
00070
00071
00072 CLemmatizer(MorphLanguageEnum Language);
00073 virtual ~CLemmatizer();
00074
00075
00076 MorphLanguageEnum GetLanguage() const {return m_pFormAutomat->m_Language; };
00077 const CStatistic& GetStatistic() const;
00078 bool CheckABC(const string& WordForm) const;
00079 bool IsHyphenPostfix(const string& Postfix) const;
00080 bool IsHyphenPrefix(const string& Prefix) const;
00081
00082
00083
00084 bool initIconv(const string &enc_internal="", const string &enc_external="UTF8");
00085 inline std::string recode_ext2int(const std::string s_ext) const { return m_ic_ext2int==NULL ? s_ext : m_ic_ext2int->convert(s_ext); }
00086 inline std::string recode_int2ext(const std::string s_int) const { return m_ic_int2ext==NULL ? s_int : m_ic_int2ext->convert(s_int); }
00087
00088
00089
00090 bool LoadDictionariesRegistry(string& strError) ;
00091 bool LoadStatisticRegistry(SubjectEnum subj);
00092
00093
00094 bool CreateParadigmCollection(bool bNorm, string& WordStr, bool capital, bool bUsePrediction, vector<CFormInfo>& Result) const;
00095 void GetAllAncodesQuick(const BYTE* WordForm, bool capital, BYTE* OutBuffer, bool bUsePrediction) const;
00096
00097 bool GetAllAncodesAndLemmasQuick(string& InputWordStr, bool capital, char* OutBuffer, size_t MaxBufferSize, bool bUsePrediction) const;
00098 bool CreateParadigmFromID(DWORD id, CFormInfo& Result) const;
00099 bool ProcessHyphenWords(CGraphmatFile* piGraphmatFile) const;
00100
00101
00102 };
00103
00104
00105
00106
00107
00109 class CLemmatizerRussian : public CLemmatizer
00110 {
00111 public:
00112 CLemmatizerRussian();
00113 virtual ~CLemmatizerRussian() {};
00114 void FilterSrc(string& src) const;
00115 };
00116
00117
00118 class CLemmatizerEnglish : public CLemmatizer
00119 {
00120 void FilterSrc(string& src) const;
00121 public:
00122 CLemmatizerEnglish();
00123 virtual ~CLemmatizerEnglish() {};
00124
00125 };
00126
00127
00128
00130
00131 class CLemmatizerGerman: public CLemmatizer
00132 {
00133 void FilterSrc(string& src) const;
00134 public:
00135 CLemmatizerGerman();
00136 virtual ~CLemmatizerGerman() {};
00137
00138 };
00139
00140
00141
00142
00143
00144 #endif //__LEMMATIZERS_H_