ddc
|
#include <Lemmatizers.h>
Public Member Functions | |
CLemmatizer (MorphLanguageEnum Language) | |
virtual | ~CLemmatizer () |
MorphLanguageEnum | GetLanguage () const |
const CStatistic & | GetStatistic () const |
bool | CheckABC (const string &WordForm) const |
bool | IsHyphenPostfix (const string &Postfix) const |
bool | IsHyphenPrefix (const string &Prefix) const |
bool | initIconv (const string &enc_internal="", const string &enc_external="UTF8") |
std::string | recode_ext2int (const std::string s_ext) const |
std::string | recode_int2ext (const std::string s_int) const |
bool | LoadDictionariesRegistry (string &strError) |
bool | CreateParadigmCollection (bool bNorm, string &WordStr, bool capital, bool bUsePrediction, vector< CFormInfo > &Result) const |
void | GetAllAncodesQuick (const BYTE *WordForm, bool capital, BYTE *OutBuffer, bool bUsePrediction) const |
bool | GetAllAncodesAndLemmasQuick (string &InputWordStr, bool capital, char *OutBuffer, size_t MaxBufferSize, bool bUsePrediction) const |
Public Member Functions inherited from CMorphDict | |
CMorphDict (MorphLanguageEnum Language) | |
virtual | ~CMorphDict () |
void | InitAutomat (CMorphAutomat *pFormAutomat) |
bool | Load (string GrammarFileName) |
bool | Save (string GrammarFileName) const |
void | PredictBySuffix (const string &Text, size_t &TextOffset, size_t MinimalPredictSuffixlen, vector< CAutomAnnotationInner > &Infos) const |
string | GetAllMorphInterpsStr (const string &Text, const size_t TextPos, bool bFullInterp) const |
Public Attributes | |
bool | m_bLoaded |
bool | m_bMaximalPrediction |
bool | m_bUseStatistic |
bool | m_bAllowRussianJo |
std::string | m_enc_int |
internal encoding (default: "" = none) More... | |
std::string | m_enc_ext |
external encoding (default: "" = none) More... | |
ddcIconv * | m_ic_ext2int |
iconv converter from user encoding to morph-internal encoding More... | |
ddcIconv * | m_ic_int2ext |
iconv converter from morph-internal encoding to user encoding More... | |
Public Attributes inherited from CMorphDict | |
vector< CFlexiaModel > | m_FlexiaModels |
vector< CAccentModel > | m_AccentModels |
CShortStringHolder | m_Bases |
vector< CLemmaInfoAndLemma > | m_LemmaInfos |
StringVector | m_Prefixes |
vector< BYTE > | m_NPSs |
Protected Member Functions | |
virtual void | FilterSrc (string &src) const =0 |
string | GetRegistryString () const |
string | GetPath () const |
void | ReadOptions (string FileName) |
bool | LemmatizeWord (string &InputWordStr, const bool cap, const bool predict, vector< CAutomAnnotationInner > &results, bool bGetLemmaInfos) const |
void | AssignWeightIfNeed (vector< CAutomAnnotationInner > &FindResults) const |
bool | CheckAbbreviation (string InputWordStr, vector< CAutomAnnotationInner > &FindResults, bool is_cap) const |
CAutomAnnotationInner | ConvertPredictTupleToAnnot (const CPredictTuple &input) const |
void | PredictByDataBase (string InputWordStr, vector< CAutomAnnotationInner > &results, bool is_cap) const |
bool | IsPrefix (const string &Prefix) const |
Protected Member Functions inherited from CMorphDict | |
void | GetLemmaInfos (const string &Text, size_t TextPos, vector< CAutomAnnotationInner > &Infos) const |
void | CreateModelsIndex () |
Protected Attributes | |
string | m_Registry |
set< string > | m_HyphenPostfixes |
set< string > | m_HyphenPrefixes |
CStatistic | m_Statistic |
CPredictBase | m_Predict |
set< string > | m_PrefixesSet |
Protected Attributes inherited from CMorphDict | |
CMorphAutomat * | m_pFormAutomat |
vector< int > | m_ModelsIndex |
IsLessMorphInterp | m_SearchInfoLess |
CLemmatizer::CLemmatizer | ( | MorphLanguageEnum | Language | ) |
References CMorphDict::InitAutomat(), initIconv(), m_bAllowRussianJo, m_bLoaded, m_bMaximalPrediction, m_bUseStatistic, m_enc_ext, m_enc_int, and MorphAnnotChar.
|
virtual |
References m_ic_ext2int and m_ic_int2ext.
|
protected pure virtual |
Implemented in CLemmatizerGerman, CLemmatizerEnglish, and CLemmatizerRussian.
Referenced by CreateParadigmCollection(), GetAllAncodesAndLemmasQuick(), GetAllAncodesQuick(), and CLemmatizerRussian::~CLemmatizerRussian().
|
inline protected |
References AssignWeightIfNeed(), CheckAbbreviation(), ConvertPredictTupleToAnnot(), GetPath(), IsPrefix(), LemmatizeWord(), m_Registry, PredictByDataBase(), and ReadOptions().
Referenced by CLemmatizerGerman::CLemmatizerGerman() and GetPath().
|
protected |
References GetRegistryString().
Referenced by GetRegistryString() and LoadDictionariesRegistry().
|
protected |
References LoadFileToString(), m_bAllowRussianJo, Trim(), and StringTokenizer::val().
Referenced by GetRegistryString() and LoadDictionariesRegistry().
|
protected |
References CMorphAutomat::GetInnerMorphInfos(), GetLanguage(), CMorphDict::GetLemmaInfos(), IsPrefix(), CFlexiaModel::m_Flexia, CMorphDict::m_FlexiaModels, CMorphForm::m_FlexiaStr, CAutomAnnotationInner::m_ItemNo, CAutomAnnotationInner::m_LemmaInfoNo, CMorphDict::m_LemmaInfos, CAutomAnnotationInner::m_ModelNo, CMorphDict::m_NPSs, CMorphDict::m_pFormAutomat, PredictByDataBase(), CMorphDict::PredictBySuffix(), RmlMakeUpper(), and UnknownPartOfSpeech.
Referenced by CreateParadigmCollection(), GetAllAncodesAndLemmasQuick(), GetAllAncodesQuick(), and GetRegistryString().
|
protected |
References CStatistic::get_HomoWeight(), CAutomAnnotationInner::GetParadigmId(), m_bUseStatistic, CAutomAnnotationInner::m_ItemNo, CAutomAnnotationInner::m_nWeight, and m_Statistic.
Referenced by CreateParadigmCollection() and GetRegistryString().
|
protected |
References ConvertPredictTupleToAnnot(), CPredictBase::Find(), CABCEncoder::GetCriticalNounLetterPack(), GetLanguage(), is_upper_consonant(), CMorphDict::m_pFormAutomat, and m_Predict.
Referenced by GetRegistryString() and PredictByDataBase().
|
protected |
References CPredictTuple::m_ItemNo, CAutomAnnotationInner::m_ItemNo, CPredictTuple::m_LemmaInfoNo, CAutomAnnotationInner::m_LemmaInfoNo, CMorphDict::m_LemmaInfos, CAutomAnnotationInner::m_ModelNo, CAutomAnnotationInner::m_nWeight, and CAutomAnnotationInner::m_PrefixNo.
Referenced by CheckAbbreviation(), GetRegistryString(), and PredictByDataBase().
|
protected |
References CheckAbbreviation(), CheckABC(), ConvertPredictTupleToAnnot(), CPredictBase::Find(), CABCEncoder::GetCriticalNounLetterPack(), GetLanguage(), m_bMaximalPrediction, CMorphDict::m_LemmaInfos, CPredictBase::m_ModelFreq, CMorphDict::m_pFormAutomat, m_Predict, and morphGerman.
Referenced by GetRegistryString() and LemmatizeWord().
|
protected |
References m_PrefixesSet.
Referenced by GetRegistryString() and LemmatizeWord().
|
inline |
References CheckABC(), GetStatistic(), initIconv(), IsHyphenPostfix(), IsHyphenPrefix(), CABCEncoder::m_Language, and CMorphDict::m_pFormAutomat.
Referenced by CheckAbbreviation(), CFormInfo::GetAccentedVowel(), LemmatizeWord(), and PredictByDataBase().
const CStatistic & CLemmatizer::GetStatistic | ( | ) | const |
References m_Statistic.
Referenced by CFormInfo::GetHomonymWeightWithForm(), GetLanguage(), CFormInfo::GetWordWeight(), and CFormInfo::SetParadigmId().
bool CLemmatizer::CheckABC | ( | const string & | WordForm | ) | const |
References CABCEncoder::CheckABCWithoutAnnotator() and CMorphDict::m_pFormAutomat.
Referenced by GetLanguage() and PredictByDataBase().
bool CLemmatizer::IsHyphenPostfix | ( | const string & | Postfix | ) | const |
References m_HyphenPostfixes.
Referenced by CreateParadigmCollection() and GetLanguage().
bool CLemmatizer::IsHyphenPrefix | ( | const string & | Prefix | ) | const |
References m_HyphenPrefixes.
Referenced by CreateParadigmCollection() and GetLanguage().
bool CLemmatizer::initIconv | ( | const string & | enc_internal = "", |
const string & | enc_external = "UTF8" | ||
) | |
References ddcIconv::good(), m_enc_ext, m_enc_int, m_ic_ext2int, and m_ic_int2ext.
Referenced by CLemmatizer(), CLemmatizerGerman::CLemmatizerGerman(), and GetLanguage().
|
inline |
References ddcIconv::convert().
Referenced by TxMorph::expand() and GetWordForms().
|
inline |
References ddcIconv::convert(), CreateParadigmCollection(), GetAllAncodesAndLemmasQuick(), GetAllAncodesQuick(), and LoadDictionariesRegistry().
Referenced by TxMorph::expand() and GetWordForms().
bool CLemmatizer::LoadDictionariesRegistry | ( | string & | strError | ) |
References GetPath(), GetRmlVariable(), CStatistic::Load(), CPredictBase::Load(), CMorphDict::Load(), m_bLoaded, m_bUseStatistic, CMorphDict::m_FlexiaModels, CMorphDict::m_LemmaInfos, CPredictBase::m_ModelFreq, m_Predict, CMorphDict::m_Prefixes, m_PrefixesSet, m_Statistic, CExpc::m_strCause, MORPH_MAIN_FILES, OPTIONS_FILE, PREDICT_BIN_PATH, and ReadOptions().
Referenced by recode_int2ext().
bool CLemmatizer::CreateParadigmCollection | ( | bool | bNorm, |
string & | WordStr, | ||
bool | capital, | ||
bool | bUsePrediction, | ||
vector< CFormInfo > & | Result | ||
) | const |
References AssignWeightIfNeed(), CFormInfo::Create(), CreateDecartProduction(), FilterSrc(), IsFound(), IsHyphenPostfix(), IsHyphenPrefix(), LemmatizeWord(), and CAutomAnnotationInner::m_ItemNo.
Referenced by TxMorph::expand(), GetParadigmCollection(), GetWordForms(), and recode_int2ext().
void CLemmatizer::GetAllAncodesQuick | ( | const BYTE * | WordForm, |
bool | capital, | ||
BYTE * | OutBuffer, | ||
bool | bUsePrediction | ||
) | const |
References FilterSrc(), LemmatizeWord(), CLemmaInfo::m_CommonAncode, CFlexiaModel::m_Flexia, CMorphDict::m_FlexiaModels, CAutomAnnotationInner::m_ItemNo, CAutomAnnotationInner::m_LemmaInfoNo, CMorphDict::m_LemmaInfos, and CAutomAnnotationInner::m_ModelNo.
Referenced by GetGramInfosFromWord() and recode_int2ext().
bool CLemmatizer::GetAllAncodesAndLemmasQuick | ( | string & | InputWordStr, |
bool | capital, | ||
char * | OutBuffer, | ||
size_t | MaxBufferSize, | ||
bool | bUsePrediction | ||
) | const |
References FilterSrc(), LemmatizeWord(), CFlexiaModel::m_Flexia, CMorphDict::m_FlexiaModels, CMorphForm::m_FlexiaStr, CAutomAnnotationInner::m_ItemNo, CAutomAnnotationInner::m_ModelNo, and CMorphForm::m_PrefixStr.
Referenced by recode_int2ext().
|
protected |
|
protected |
Referenced by CLemmatizerRussian::CLemmatizerRussian() and IsHyphenPostfix().
|
protected |
Referenced by CLemmatizerRussian::CLemmatizerRussian() and IsHyphenPrefix().
|
protected |
Referenced by AssignWeightIfNeed(), GetStatistic(), and LoadDictionariesRegistry().
|
protected |
Referenced by CheckAbbreviation(), LoadDictionariesRegistry(), and PredictByDataBase().
|
protected |
Referenced by IsPrefix() and LoadDictionariesRegistry().
bool CLemmatizer::m_bLoaded |
Referenced by CLemmatizer() and LoadDictionariesRegistry().
bool CLemmatizer::m_bMaximalPrediction |
Referenced by CLemmatizer() and PredictByDataBase().
bool CLemmatizer::m_bUseStatistic |
Referenced by AssignWeightIfNeed(), CLemmatizer(), and LoadDictionariesRegistry().
bool CLemmatizer::m_bAllowRussianJo |
Referenced by CLemmatizer(), CLemmatizerRussian::FilterSrc(), and ReadOptions().
std::string CLemmatizer::m_enc_int |
internal encoding (default: "" = none)
Referenced by CLemmatizer() and initIconv().
std::string CLemmatizer::m_enc_ext |
external encoding (default: "" = none)
Referenced by CLemmatizer() and initIconv().
ddcIconv* CLemmatizer::m_ic_ext2int |
iconv converter from user encoding to morph-internal encoding
Referenced by initIconv() and ~CLemmatizer().
ddcIconv* CLemmatizer::m_ic_int2ext |
iconv converter from morph-internal encoding to user encoding
Referenced by initIconv() and ~CLemmatizer().