ddc
Public Member Functions | Public Attributes | Protected Member Functions | Protected Attributes | List of all members
CLemmatizer Class Reference abstract

#include <Lemmatizers.h>

Inheritance diagram for CLemmatizer:
Inheritance graph
[legend]
Collaboration diagram for CLemmatizer:
Collaboration graph
[legend]

Public Member Functions

 CLemmatizer (MorphLanguageEnum Language)
 
virtual ~CLemmatizer ()
 
MorphLanguageEnum GetLanguage () const
 
const CStatistic & GetStatistic () const
 
bool CheckABC (const string &WordForm) const
 
bool IsHyphenPostfix (const string &Postfix) const
 
bool IsHyphenPrefix (const string &Prefix) const
 
bool initIconv (const string &enc_internal="", const string &enc_external="UTF8")
 
std::string recode_ext2int (const std::string s_ext) const
 
std::string recode_int2ext (const std::string s_int) const
 
bool LoadDictionariesRegistry (string &strError)
 
bool CreateParadigmCollection (bool bNorm, string &WordStr, bool capital, bool bUsePrediction, vector< CFormInfo > &Result) const
 
void GetAllAncodesQuick (const BYTE *WordForm, bool capital, BYTE *OutBuffer, bool bUsePrediction) const
 
bool GetAllAncodesAndLemmasQuick (string &InputWordStr, bool capital, char *OutBuffer, size_t MaxBufferSize, bool bUsePrediction) const
 
- Public Member Functions inherited from CMorphDict
 CMorphDict (MorphLanguageEnum Language)
 
virtual ~CMorphDict ()
 
void InitAutomat (CMorphAutomat *pFormAutomat)
 
bool Load (string GrammarFileName)
 
bool Save (string GrammarFileName) const
 
void PredictBySuffix (const string &Text, size_t &TextOffset, size_t MinimalPredictSuffixlen, vector< CAutomAnnotationInner > &Infos) const
 
string GetAllMorphInterpsStr (const string &Text, const size_t TextPos, bool bFullInterp) const
 

Public Attributes

bool m_bLoaded
 
bool m_bMaximalPrediction
 
bool m_bUseStatistic
 
bool m_bAllowRussianJo
 
std::string m_enc_int
 internal encoding (default=""=none) More...
 
std::string m_enc_ext
 external encoding (default=""=none) More...
 
ddcIconv * m_ic_ext2int
 iconv converter from user encoding to morph-internal encoding More...
 
ddcIconv * m_ic_int2ext
 iconv converter from morph-internal encoding to user encoding More...
 
- Public Attributes inherited from CMorphDict
vector< CFlexiaModel > m_FlexiaModels
 
vector< CAccentModel > m_AccentModels
 
CShortStringHolder m_Bases
 
vector< CLemmaInfoAndLemma > m_LemmaInfos
 
StringVector m_Prefixes
 
vector< BYTE > m_NPSs
 

Protected Member Functions

virtual void FilterSrc (string &src) const =0
 
string GetRegistryString () const
 
string GetPath () const
 
void ReadOptions (string FileName)
 
bool LemmatizeWord (string &InputWordStr, const bool cap, const bool predict, vector< CAutomAnnotationInner > &results, bool bGetLemmaInfos) const
 
void AssignWeightIfNeed (vector< CAutomAnnotationInner > &FindResults) const
 
bool CheckAbbreviation (string InputWordStr, vector< CAutomAnnotationInner > &FindResults, bool is_cap) const
 
CAutomAnnotationInner ConvertPredictTupleToAnnot (const CPredictTuple &input) const
 
void PredictByDataBase (string InputWordStr, vector< CAutomAnnotationInner > &results, bool is_cap) const
 
bool IsPrefix (const string &Prefix) const
 
- Protected Member Functions inherited from CMorphDict
void GetLemmaInfos (const string &Text, size_t TextPos, vector< CAutomAnnotationInner > &Infos) const
 
void CreateModelsIndex ()
 

Protected Attributes

string m_Registry
 
set< string > m_HyphenPostfixes
 
set< string > m_HyphenPrefixes
 
CStatistic m_Statistic
 
CPredictBase m_Predict
 
set< string > m_PrefixesSet
 
- Protected Attributes inherited from CMorphDict
CMorphAutomat * m_pFormAutomat
 
vector< int > m_ModelsIndex
 
IsLessMorphInterp m_SearchInfoLess
 

Constructor & Destructor Documentation

◆ CLemmatizer()

CLemmatizer::CLemmatizer ( MorphLanguageEnum  Language)

◆ ~CLemmatizer()

CLemmatizer::~CLemmatizer ( )
virtual

References m_ic_ext2int, and m_ic_int2ext.

Member Function Documentation

◆ FilterSrc()

virtual void CLemmatizer::FilterSrc ( string &  src) const
protected pure virtual

◆ GetRegistryString()

string CLemmatizer::GetRegistryString ( ) const
inline protected

References AssignWeightIfNeed(), CheckAbbreviation(), ConvertPredictTupleToAnnot(), GetPath(), IsPrefix(), LemmatizeWord(), m_Registry, PredictByDataBase(), and ReadOptions().

Referenced by CLemmatizerGerman::CLemmatizerGerman(), and GetPath().

Here is the call graph for this function:
Here is the caller graph for this function:

◆ GetPath()

string CLemmatizer::GetPath ( ) const
protected

References GetRegistryString().

Referenced by GetRegistryString(), and LoadDictionariesRegistry().

Here is the call graph for this function:
Here is the caller graph for this function:

◆ ReadOptions()

void CLemmatizer::ReadOptions ( string  FileName)
protected

References LoadFileToString(), m_bAllowRussianJo, Trim(), and StringTokenizer::val().

Referenced by GetRegistryString(), and LoadDictionariesRegistry().

Here is the call graph for this function:
Here is the caller graph for this function:

◆ LemmatizeWord()

bool CLemmatizer::LemmatizeWord ( string &  InputWordStr,
const bool  cap,
const bool  predict,
vector< CAutomAnnotationInner > &  results,
bool  bGetLemmaInfos 
) const
protected

◆ AssignWeightIfNeed()

void CLemmatizer::AssignWeightIfNeed ( vector< CAutomAnnotationInner > &  FindResults) const
protected

References CStatistic::get_HomoWeight(), CAutomAnnotationInner::GetParadigmId(), m_bUseStatistic, CAutomAnnotationInner::m_ItemNo, CAutomAnnotationInner::m_nWeight, and m_Statistic.

Referenced by CreateParadigmCollection(), and GetRegistryString().

Here is the call graph for this function:
Here is the caller graph for this function:

◆ CheckAbbreviation()

bool CLemmatizer::CheckAbbreviation ( string  InputWordStr,
vector< CAutomAnnotationInner > &  FindResults,
bool  is_cap 
) const
protected

References ConvertPredictTupleToAnnot(), CPredictBase::Find(), CABCEncoder::GetCriticalNounLetterPack(), GetLanguage(), is_upper_consonant(), CMorphDict::m_pFormAutomat, and m_Predict.

Referenced by GetRegistryString(), and PredictByDataBase().

Here is the call graph for this function:
Here is the caller graph for this function:

◆ ConvertPredictTupleToAnnot()

CAutomAnnotationInner CLemmatizer::ConvertPredictTupleToAnnot ( const CPredictTuple &  input) const
protected

◆ PredictByDataBase()

void CLemmatizer::PredictByDataBase ( string  InputWordStr,
vector< CAutomAnnotationInner > &  results,
bool  is_cap 
) const
protected

◆ IsPrefix()

bool CLemmatizer::IsPrefix ( const string &  Prefix) const
protected

References m_PrefixesSet.

Referenced by GetRegistryString(), and LemmatizeWord().

Here is the caller graph for this function:

◆ GetLanguage()

MorphLanguageEnum CLemmatizer::GetLanguage ( ) const
inline

References CheckABC(), GetStatistic(), initIconv(), IsHyphenPostfix(), IsHyphenPrefix(), CABCEncoder::m_Language, and CMorphDict::m_pFormAutomat.

Referenced by CheckAbbreviation(), CFormInfo::GetAccentedVowel(), LemmatizeWord(), and PredictByDataBase().

Here is the call graph for this function:
Here is the caller graph for this function:

◆ GetStatistic()

const CStatistic & CLemmatizer::GetStatistic ( ) const

References m_Statistic.

Referenced by CFormInfo::GetHomonymWeightWithForm(), GetLanguage(), CFormInfo::GetWordWeight(), and CFormInfo::SetParadigmId().

Here is the caller graph for this function:

◆ CheckABC()

bool CLemmatizer::CheckABC ( const string &  WordForm) const

References CABCEncoder::CheckABCWithoutAnnotator(), and CMorphDict::m_pFormAutomat.

Referenced by GetLanguage(), and PredictByDataBase().

Here is the call graph for this function:
Here is the caller graph for this function:

◆ IsHyphenPostfix()

bool CLemmatizer::IsHyphenPostfix ( const string &  Postfix) const

References m_HyphenPostfixes.

Referenced by CreateParadigmCollection(), and GetLanguage().

Here is the caller graph for this function:

◆ IsHyphenPrefix()

bool CLemmatizer::IsHyphenPrefix ( const string &  Prefix) const

References m_HyphenPrefixes.

Referenced by CreateParadigmCollection(), and GetLanguage().

Here is the caller graph for this function:

◆ initIconv()

bool CLemmatizer::initIconv ( const string &  enc_internal = "",
const string &  enc_external = "UTF8" 
)

References ddcIconv::good(), m_enc_ext, m_enc_int, m_ic_ext2int, and m_ic_int2ext.

Referenced by CLemmatizer(), CLemmatizerGerman::CLemmatizerGerman(), and GetLanguage().

Here is the call graph for this function:
Here is the caller graph for this function:

◆ recode_ext2int()

std::string CLemmatizer::recode_ext2int ( const std::string  s_ext) const
inline

References ddcIconv::convert().

Referenced by TxMorph::expand(), and GetWordForms().

Here is the call graph for this function:
Here is the caller graph for this function:

◆ recode_int2ext()

std::string CLemmatizer::recode_int2ext ( const std::string  s_int) const
inline

References ddcIconv::convert(), CreateParadigmCollection(), GetAllAncodesAndLemmasQuick(), GetAllAncodesQuick(), and LoadDictionariesRegistry().

Referenced by TxMorph::expand(), and GetWordForms().

Here is the call graph for this function:
Here is the caller graph for this function:

◆ LoadDictionariesRegistry()

bool CLemmatizer::LoadDictionariesRegistry ( string &  strError)

◆ CreateParadigmCollection()

bool CLemmatizer::CreateParadigmCollection ( bool  bNorm,
string &  WordStr,
bool  capital,
bool  bUsePrediction,
vector< CFormInfo > &  Result 
) const

◆ GetAllAncodesQuick()

void CLemmatizer::GetAllAncodesQuick ( const BYTE *  WordForm,
bool  capital,
BYTE *  OutBuffer,
bool  bUsePrediction 
) const

◆ GetAllAncodesAndLemmasQuick()

bool CLemmatizer::GetAllAncodesAndLemmasQuick ( string &  InputWordStr,
bool  capital,
char *  OutBuffer,
size_t  MaxBufferSize,
bool  bUsePrediction 
) const

References FilterSrc(), LemmatizeWord(), CFlexiaModel::m_Flexia, CMorphDict::m_FlexiaModels, CMorphForm::m_FlexiaStr, CAutomAnnotationInner::m_ItemNo, CAutomAnnotationInner::m_ModelNo, and CMorphForm::m_PrefixStr.

Referenced by recode_int2ext().

Here is the call graph for this function:
Here is the caller graph for this function:

Member Data Documentation

◆ m_Registry

string CLemmatizer::m_Registry
protected

◆ m_HyphenPostfixes

set<string> CLemmatizer::m_HyphenPostfixes
protected

◆ m_HyphenPrefixes

set<string> CLemmatizer::m_HyphenPrefixes
protected

◆ m_Statistic

CStatistic CLemmatizer::m_Statistic
protected

◆ m_Predict

CPredictBase CLemmatizer::m_Predict
protected

◆ m_PrefixesSet

set<string> CLemmatizer::m_PrefixesSet
protected

◆ m_bLoaded

bool CLemmatizer::m_bLoaded

◆ m_bMaximalPrediction

bool CLemmatizer::m_bMaximalPrediction

Referenced by CLemmatizer(), and PredictByDataBase().

◆ m_bUseStatistic

bool CLemmatizer::m_bUseStatistic

◆ m_bAllowRussianJo

bool CLemmatizer::m_bAllowRussianJo

◆ m_enc_int

std::string CLemmatizer::m_enc_int

internal encoding (default=""=none)

Referenced by CLemmatizer(), and initIconv().

◆ m_enc_ext

std::string CLemmatizer::m_enc_ext

external encoding (default=""=none)

Referenced by CLemmatizer(), and initIconv().

◆ m_ic_ext2int

ddcIconv* CLemmatizer::m_ic_ext2int

iconv converter from user encoding to morph-internal encoding

Referenced by initIconv(), and ~CLemmatizer().

◆ m_ic_int2ext

ddcIconv* CLemmatizer::m_ic_int2ext

iconv converter from morph-internal encoding to user encoding

Referenced by initIconv(), and ~CLemmatizer().


The documentation for this class was generated from the following files: