ddc
Lemmatizers.h
Go to the documentation of this file.
1 //-*- Mode: C++ -*-*/
2 //
3 // This file is part of DDC.
4 //
5 // DDC is free software: you can redistribute it and/or modify
6 // it under the terms of the GNU Lesser General Public License as published by
7 // the Free Software Foundation, either version 3 of the License, or
8 // (at your option) any later version.
9 //
10 // DDC is distributed in the hope that it will be useful,
11 // but WITHOUT ANY WARRANTY; without even the implied warranty of
12 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 // GNU Lesser General Public License for more details.
14 //
15 // You should have received a copy of the GNU Lesser General Public License
16 // along with DDC. If not, see <http://www.gnu.org/licenses/>.
17 //
18 // ========== Dialing Lemmatizer (www.aot.ru)
19 // ========== Copyright by Alexey Sokirko, Andrey Putrin, Bryan Jurish (2011)
20 
21 #ifndef __LEMMATIZERS_H_
22 #define __LEMMATIZERS_H_
23 
24 #pragma warning (disable : 4786)
25 
26 #include "MorphDict.h"
27 #include "Paradigm.h"
28 #include "Statistic.h"
29 #include "Predict.h"
30 #include "../CommonLib/ddcIconv.h"
31 
32 #pragma warning (disable : 4250)
33 
34 class CGraphmatFile;
35 
36 
37 class CLemmatizer : public CMorphDict // abstract lemmatizer base: FilterSrc() is pure virtual and must be supplied by a concrete subclass
38 {
39 protected:
40  string m_Registry; // value handed back by GetRegistryString()
41 
42  // Postfix particles that do not change the meaning
43  set<string> m_HyphenPostfixes;
44  // Productive prefixes
45  set<string> m_HyphenPrefixes;
46 
49  set<string> m_PrefixesSet; // NOTE(review): presumably the set consulted by IsPrefix() -- confirm in Lemmatizers.cpp
50 
51 
52  virtual void FilterSrc(string& src) const = 0; // language-specific hook; receives src by non-const reference
53 
54  string GetRegistryString() const {return m_Registry; }; // returns a copy of m_Registry
55  string GetPath() const;
56  void ReadOptions(string FileName);
57 
58 
59  bool LemmatizeWord(string& InputWordStr, const bool cap, const bool predict, vector<CAutomAnnotationInner>& results, bool bGetLemmaInfos) const; // results is an output vector; InputWordStr is taken by non-const reference
60  void AssignWeightIfNeed(vector<CAutomAnnotationInner>& FindResults) const;
61 
62  // Prediction helpers (NOTE(review): the original comment read "prediction by suffix"; listing line 64 is not shown here)
63  bool CheckAbbreviation(string InputWordStr,vector<CAutomAnnotationInner>& FindResults, bool is_cap) const;
65  void PredictByDataBase(string InputWordStr, vector<CAutomAnnotationInner>& results,bool is_cap) const;
66  bool IsPrefix(const string& Prefix) const;
67 
68 
69 public:
70  bool m_bLoaded; // NOTE(review): presumably true once the dictionaries have been loaded -- confirm against LoadDictionariesRegistry()
74 
75 
76 public:
77  //-- BEGIN moo: character set hacks
78  std::string m_enc_int; // internal encoding (default=""=none)
79  std::string m_enc_ext; // external encoding (default=""=none)
82  //-- END moo
83 
84 
86  virtual ~CLemmatizer();
87 
88  // basic methods
90  const CStatistic& GetStatistic() const;
91  bool CheckABC(const string& WordForm) const;
92  bool IsHyphenPostfix(const string& Postfix) const; // NOTE(review): presumably a lookup in m_HyphenPostfixes -- confirm
93  bool IsHyphenPrefix(const string& Prefix) const; // NOTE(review): presumably a lookup in m_HyphenPrefixes -- confirm
94 
95 
96  //-- moo: character set hacks
97  bool initIconv(const string &enc_internal="", const string &enc_external="UTF8"); // declared defaults: empty internal encoding, "UTF8" external encoding
98  inline std::string recode_ext2int(const std::string s_ext) const { return m_ic_ext2int==NULL ? s_ext : m_ic_ext2int->convert(s_ext); } // returns s_ext unchanged when m_ic_ext2int is NULL, else the converter's output; NOTE(review): argument is passed by value
99  inline std::string recode_int2ext(const std::string s_int) const { return m_ic_int2ext==NULL ? s_int : m_ic_int2ext->convert(s_int); } // returns s_int unchanged when m_ic_int2ext is NULL, else the converter's output; NOTE(review): argument is passed by value
100 
101 
102  // loading
103  bool LoadDictionariesRegistry(string& strError) ; // NOTE(review): strError presumably receives an error description on failure -- confirm
104 
105  // main interfaces
106  bool CreateParadigmCollection(bool bNorm, string& WordStr, bool capital, bool bUsePrediction, vector<CFormInfo>& Result) const; // Result is an output vector of CFormInfo
107  void GetAllAncodesQuick(const BYTE* WordForm, bool capital, BYTE* OutBuffer, bool bUsePrediction) const; // NOTE(review): no buffer size is passed for OutBuffer -- check how the caller sizes it
108  bool GetAllAncodesAndLemmasQuick(string& InputWordStr, bool capital, char* OutBuffer, size_t MaxBufferSize, bool bUsePrediction) const; // NOTE(review): MaxBufferSize presumably bounds writes to OutBuffer -- confirm
109 };
110 
111 
112 
113 
114 
117 {
118 public:
120  virtual ~CLemmatizerRussian() {}; // empty virtual destructor
121  void FilterSrc(string& src) const; // NOTE(review): presumably the Russian override of CLemmatizer::FilterSrc -- the class header line is not shown in this listing
122 };
123 
124 
126 {
127  void FilterSrc(string& src) const; // declared ahead of the public: label; NOTE(review): presumably the English override of CLemmatizer::FilterSrc -- the class header line is not shown in this listing
128 public:
130  virtual ~CLemmatizerEnglish() {}; // empty virtual destructor
131 
132 };
133 
134 
135 
137 
139 {
140  void FilterSrc(string& src) const; // declared ahead of the public: label; NOTE(review): presumably the German override of CLemmatizer::FilterSrc -- the class header line is not shown in this listing
141 public:
143  virtual ~CLemmatizerGerman() {}; // empty virtual destructor
144 
145 };
146 
147 
148 
149 
150 
151 #endif //__LEMMATIZERS_H_
152 
153 /*--- emacs style variables ---
154  * Local Variables:
155  * mode: C++
156  * c-file-style: "ellemtel"
157  * c-basic-offset: 4
158  * tab-width: 8
159  * indent-tabs-mode: nil
160  * End:
161  */
void PredictByDataBase(string InputWordStr, vector< CAutomAnnotationInner > &results, bool is_cap) const
Definition: Lemmatizers.cpp:469
bool IsHyphenPrefix(const string &Prefix) const
Definition: Lemmatizers.cpp:71
Interface to iconv.h character-conversion routines.
Definition: ddcIconv.h:58
ddcIconv * m_ic_ext2int
iconv converter from user encoding to morph-internal encoding
Definition: Lemmatizers.h:80
virtual ~CLemmatizerEnglish()
Definition: Lemmatizers.h:130
bool m_bMaximalPrediction
Definition: Lemmatizers.h:71
CPredictBase m_Predict
Definition: Lemmatizers.h:48
void GetAllAncodesQuick(const BYTE *WordForm, bool capital, BYTE *OutBuffer, bool bUsePrediction) const
Definition: Lemmatizers.cpp:178
bool m_bAllowRussianJo
Definition: Lemmatizers.h:73
set< string > m_PrefixesSet
Definition: Lemmatizers.h:49
bool LoadDictionariesRegistry(string &strError)
Definition: Lemmatizers.cpp:286
bool m_bLoaded
Definition: Lemmatizers.h:70
Definition: Predict.h:39
bool IsPrefix(const string &Prefix) const
Definition: Lemmatizers.cpp:84
CStatistic m_Statistic
Definition: Lemmatizers.h:47
bool convert(const char *idata, const size_t ilen, string &out)
Definition: ddcIconv.h:162
MorphLanguageEnum GetLanguage() const
Definition: Lemmatizers.h:89
virtual ~CLemmatizerRussian()
Definition: Lemmatizers.h:120
void ReadOptions(string FileName)
Definition: Lemmatizers.cpp:259
string GetPath() const
Definition: Lemmatizers.cpp:45
bool CreateParadigmCollection(bool bNorm, string &WordStr, bool capital, bool bUsePrediction, vector< CFormInfo > &Result) const
Definition: Lemmatizers.cpp:360
std::string m_enc_ext
external encoding (default=""=none)
Definition: Lemmatizers.h:79
virtual void FilterSrc(string &src) const =0
Definition: Lemmatizers.h:116
CAutomAnnotationInner ConvertPredictTupleToAnnot(const CPredictTuple &input) const
Definition: Lemmatizers.cpp:445
bool IsHyphenPostfix(const string &Postfix) const
Definition: Lemmatizers.cpp:66
Definition: MorphDict.h:47
std::string m_enc_int
internal encoding (default=""=none)
Definition: Lemmatizers.h:78
set< string > m_HyphenPrefixes
Definition: Lemmatizers.h:45
virtual ~CLemmatizer()
Definition: Lemmatizers.cpp:38
ddcIconv * m_ic_int2ext
iconv converter from morph-internal encoding to user encoding
Definition: Lemmatizers.h:81
std::string recode_int2ext(const std::string s_int) const
Definition: Lemmatizers.h:99
CMorphAutomat * m_pFormAutomat
Definition: MorphDict.h:52
string GetRegistryString() const
Definition: Lemmatizers.h:54
void AssignWeightIfNeed(vector< CAutomAnnotationInner > &FindResults) const
Definition: Lemmatizers.cpp:164
std::string recode_ext2int(const std::string s_ext) const
Definition: Lemmatizers.h:98
bool initIconv(const string &enc_internal="", const string &enc_external="UTF8")
Definition: Lemmatizers.cpp:275
set< string > m_HyphenPostfixes
Definition: Lemmatizers.h:43
bool CheckAbbreviation(string InputWordStr, vector< CAutomAnnotationInner > &FindResults, bool is_cap) const
Definition: Lemmatizers.cpp:457
bool LemmatizeWord(string &InputWordStr, const bool cap, const bool predict, vector< CAutomAnnotationInner > &results, bool bGetLemmaInfos) const
Definition: Lemmatizers.cpp:92
Definition: Predict.h:30
string m_Registry
Definition: Lemmatizers.h:40
unsigned char BYTE
Definition: utilit.h:94
bool GetAllAncodesAndLemmasQuick(string &InputWordStr, bool capital, char *OutBuffer, size_t MaxBufferSize, bool bUsePrediction) const
Definition: Lemmatizers.cpp:210
bool CheckABC(const string &WordForm) const
Definition: Lemmatizers.cpp:61
Definition: Lemmatizers.h:138
Definition: Statistic.h:28
MorphLanguageEnum m_Language
Definition: MorphAutomat.h:118
Definition: MorphAutomat.h:91
MorphLanguageEnum
Definition: utilit.h:162
Definition: Lemmatizers.h:37
CLemmatizer(MorphLanguageEnum Language)
Definition: Lemmatizers.cpp:25
bool m_bUseStatistic
Definition: Lemmatizers.h:72
const CStatistic & GetStatistic() const
Definition: Lemmatizers.cpp:77
Definition: GraphmatFile.h:28
Definition: Lemmatizers.h:125
virtual ~CLemmatizerGerman()
Definition: Lemmatizers.h:143