#include <Lemmatizers.h>
CLemmatizer::CLemmatizer | ( | MorphLanguageEnum | Language | ) |
References CMorphDict::InitAutomat(), initIconv(), m_bAllowRussianJo, m_bLoaded, m_bMaximalPrediction, m_bUseStatistic, m_enc_ext, m_enc_int, and MorphAnnotChar.
CLemmatizer::~CLemmatizer | ( | ) | [virtual] |
References m_ic_ext2int, and m_ic_int2ext.
virtual void CLemmatizer::FilterSrc | ( | string & | src | ) | const [protected, pure virtual] |
Implemented in CLemmatizerRussian, CLemmatizerEnglish, and CLemmatizerGerman.
Referenced by CreateParadigmCollection(), GetAllAncodesAndLemmasQuick(), and GetAllAncodesQuick().
string CLemmatizer::GetRegistryString | ( | ) | const [inline, protected] |
string CLemmatizer::GetPath | ( | ) | const [protected] |
References GetRegistryString().
Referenced by LoadDictionariesRegistry(), and LoadStatisticRegistry().
void CLemmatizer::ReadOptions | ( | string | FileName | ) | [protected] |
References LoadFileToString(), m_bAllowRussianJo, and Trim().
Referenced by LoadDictionariesRegistry().
bool CLemmatizer::LemmatizeWord(string &InputWordStr, const bool cap, const bool predict, vector<CAutomAnnotationInner> &results, bool bGetLemmaInfos) const [protected]
References CMorphAutomat::GetInnerMorphInfos(), GetLanguage(), CMorphDict::GetLemmaInfos(), IsPrefix(), CFlexiaModel::m_Flexia, CMorphDict::m_FlexiaModels, CMorphForm::m_FlexiaStr, CAutomAnnotationInner::m_ItemNo, CAutomAnnotationInner::m_LemmaInfoNo, CMorphDict::m_LemmaInfos, CAutomAnnotationInner::m_ModelNo, CMorphDict::m_NPSs, CMorphDict::m_pFormAutomat, PredictByDataBase(), CMorphDict::PredictBySuffix(), RmlMakeUpper(), and UnknownPartOfSpeech.
Referenced by CreateParadigmCollection(), GetAllAncodesAndLemmasQuick(), GetAllAncodesQuick(), and ProcessHyphenWords().
void CLemmatizer::AssignWeightIfNeed | ( | vector< CAutomAnnotationInner > & | FindResults | ) | const [protected] |
References CStatistic::get_HomoWeight(), CAutomAnnotationInner::GetParadigmId(), m_bUseStatistic, CAutomAnnotationInner::m_ItemNo, CAutomAnnotationInner::m_nWeight, and m_Statistic.
Referenced by CreateParadigmCollection().
bool CLemmatizer::CheckAbbreviation(string InputWordStr, vector<CAutomAnnotationInner> &FindResults, bool is_cap) const [protected]
References ConvertPredictTupleToAnnot(), CPredictBase::Find(), CABCEncoder::GetCriticalNounLetterPack(), GetLanguage(), is_upper_consonant(), CMorphDict::m_pFormAutomat, and m_Predict.
Referenced by PredictByDataBase().
CAutomAnnotationInner CLemmatizer::ConvertPredictTupleToAnnot | ( | const CPredictTuple & | input | ) | const [protected] |
References CPredictTuple::m_ItemNo, CAutomAnnotationInner::m_ItemNo, CPredictTuple::m_LemmaInfoNo, CAutomAnnotationInner::m_LemmaInfoNo, CMorphDict::m_LemmaInfos, CAutomAnnotationInner::m_ModelNo, CAutomAnnotationInner::m_nWeight, and CAutomAnnotationInner::m_PrefixNo.
Referenced by CheckAbbreviation(), and PredictByDataBase().
void CLemmatizer::PredictByDataBase(string InputWordStr, vector<CAutomAnnotationInner> &results, bool is_cap) const [protected]
References CheckAbbreviation(), CheckABC(), ConvertPredictTupleToAnnot(), CPredictBase::Find(), CABCEncoder::GetCriticalNounLetterPack(), GetLanguage(), m_bMaximalPrediction, CMorphDict::m_LemmaInfos, CPredictBase::m_ModelFreq, CMorphDict::m_pFormAutomat, m_Predict, and morphGerman.
Referenced by LemmatizeWord().
bool CLemmatizer::IsPrefix | ( | const string & | Prefix | ) | const [protected] |
References m_PrefixesSet.
Referenced by LemmatizeWord().
MorphLanguageEnum CLemmatizer::GetLanguage | ( | ) | const [inline] |
References CABCEncoder::m_Language, and CMorphDict::m_pFormAutomat.
Referenced by CheckAbbreviation(), CFormInfo::GetAccentedVowel(), LemmatizeWord(), PredictByDataBase(), ProcessHyphenWords(), and CPlmLineCollection::ProcessPlmLines().
const CStatistic & CLemmatizer::GetStatistic | ( | ) | const |
References m_Statistic.
Referenced by CFormInfo::GetHomonymWeightWithForm(), CFormInfo::GetWordWeight(), and CFormInfo::SetParadigmId().
bool CLemmatizer::CheckABC | ( | const string & | WordForm | ) | const |
References CABCEncoder::CheckABCWithoutAnnotator(), and CMorphDict::m_pFormAutomat.
Referenced by PredictByDataBase().
bool CLemmatizer::IsHyphenPostfix | ( | const string & | Postfix | ) | const |
References m_HyphenPostfixes.
Referenced by CreateParadigmCollection().
bool CLemmatizer::IsHyphenPrefix | ( | const string & | Prefix | ) | const |
References m_HyphenPrefixes.
Referenced by CreateParadigmCollection().
bool CLemmatizer::initIconv(const string &enc_internal = "", const string &enc_external = "UTF8")
References ddcIconv::good(), m_enc_ext, m_enc_int, m_ic_ext2int, and m_ic_int2ext.
Referenced by CLemmatizer(), and CLemmatizerGerman::CLemmatizerGerman().
std::string CLemmatizer::recode_ext2int | ( | const std::string | s_ext | ) | const [inline] |
References ddcIconv::convert(), and m_ic_ext2int.
Referenced by GetWordForms().
std::string CLemmatizer::recode_int2ext | ( | const std::string | s_int | ) | const [inline] |
References ddcIconv::convert(), and m_ic_int2ext.
Referenced by GetWordForms().
bool CLemmatizer::LoadDictionariesRegistry | ( | string & | strError | ) |
References GetPath(), GetRmlVariable(), CPredictBase::Load(), CStatistic::Load(), CMorphDict::Load(), m_bLoaded, m_bUseStatistic, CMorphDict::m_FlexiaModels, CMorphDict::m_LemmaInfos, CPredictBase::m_ModelFreq, m_Predict, CMorphDict::m_Prefixes, m_PrefixesSet, m_Statistic, CExpc::m_strCause, MORPH_MAIN_FILES, OPTIONS_FILE, PREDICT_BIN_PATH, and ReadOptions().
Referenced by CMorphologyHolder::LoadGraphanAndLemmatizer().
bool CLemmatizer::LoadStatisticRegistry | ( | SubjectEnum | subj | ) |
References GetPath(), CStatistic::Load(), m_Statistic, subjComputer, subjFinance, and subjLiterature.
bool CLemmatizer::CreateParadigmCollection(bool bNorm, string &WordStr, bool capital, bool bUsePrediction, vector<CFormInfo> &Result) const
References AssignWeightIfNeed(), CFormInfo::Create(), CreateDecartProduction(), FilterSrc(), IsFound(), IsHyphenPostfix(), IsHyphenPrefix(), LemmatizeWord(), and CAutomAnnotationInner::m_ItemNo.
Referenced by GetParadigmCollection(), CMorphologyHolder::GetParadigmIdsByNormAndAncode(), GetWordForms(), CPlmLineCollection::ProcessPlmLines(), and CMorphologyHolder::string_to_ids().
void CLemmatizer::GetAllAncodesQuick(const BYTE *WordForm, bool capital, BYTE *OutBuffer, bool bUsePrediction) const
References FilterSrc(), LemmatizeWord(), CLemmaInfo::m_CommonAncode, CFlexiaModel::m_Flexia, CMorphDict::m_FlexiaModels, CAutomAnnotationInner::m_ItemNo, CAutomAnnotationInner::m_LemmaInfoNo, CMorphDict::m_LemmaInfos, and CAutomAnnotationInner::m_ModelNo.
Referenced by GetGramInfosFromWord().
bool CLemmatizer::GetAllAncodesAndLemmasQuick(string &InputWordStr, bool capital, char *OutBuffer, size_t MaxBufferSize, bool bUsePrediction) const
References FilterSrc(), LemmatizeWord(), CFlexiaModel::m_Flexia, CMorphDict::m_FlexiaModels, CMorphForm::m_FlexiaStr, CAutomAnnotationInner::m_ItemNo, CAutomAnnotationInner::m_ModelNo, and CMorphForm::m_PrefixStr.
References CFormInfo::AttachLemmatizer(), and CFormInfo::SetParadigmId().
Referenced by CMorphologyHolder::id_to_paradigm(), and CMorphologyHolder::id_to_string().
bool CLemmatizer::ProcessHyphenWords | ( | CGraphmatFile * | piGraphmatFile | ) | const |
References GetLanguage(), CUnitHolder::GetToken(), CGraphmatFile::GetTokenLanguage(), CUnitHolder::GetTokensCount(), CUnitHolder::GetUnits(), CUnitHolder::HasDescr(), LemmatizeWord(), CUnitHolder::MakeOneWord(), OHyp, OLw, OSentEnd, CUnitHolder::PSoft(), and CGraphmatFile::StartsFixedOborot().
Referenced by CMorphologyHolder::GetMorphology().
string CLemmatizer::m_Registry [protected] |
set<string> CLemmatizer::m_HyphenPostfixes [protected] |
Referenced by CLemmatizerRussian::CLemmatizerRussian(), and IsHyphenPostfix().
set<string> CLemmatizer::m_HyphenPrefixes [protected] |
Referenced by CLemmatizerRussian::CLemmatizerRussian(), and IsHyphenPrefix().
CStatistic CLemmatizer::m_Statistic [protected] |
Referenced by AssignWeightIfNeed(), GetStatistic(), LoadDictionariesRegistry(), and LoadStatisticRegistry().
CPredictBase CLemmatizer::m_Predict [protected] |
Referenced by CheckAbbreviation(), LoadDictionariesRegistry(), and PredictByDataBase().
set<string> CLemmatizer::m_PrefixesSet [protected] |
Referenced by IsPrefix(), and LoadDictionariesRegistry().
CLemmatizer::m_bLoaded |
Referenced by CLemmatizer(), and LoadDictionariesRegistry().
CLemmatizer::m_bMaximalPrediction |
Referenced by CLemmatizer(), and PredictByDataBase().
CLemmatizer::m_bUseStatistic |
Referenced by AssignWeightIfNeed(), CLemmatizer(), and LoadDictionariesRegistry().
CLemmatizer::m_bAllowRussianJo |
Referenced by CLemmatizer(), CLemmatizerRussian::FilterSrc(), and ReadOptions().
std::string CLemmatizer::m_enc_int |
internal encoding (default=""=none)
Referenced by CLemmatizer(), and initIconv().
std::string CLemmatizer::m_enc_ext |
external encoding (default=""=none)
Referenced by CLemmatizer(), and initIconv().
CLemmatizer::m_ic_ext2int |
iconv converter from user encoding to morph-internal encoding
Referenced by initIconv(), recode_ext2int(), and ~CLemmatizer().
CLemmatizer::m_ic_int2ext |
iconv converter from morph-internal encoding to user encoding
Referenced by initIconv(), recode_int2ext(), and ~CLemmatizer().