#include <IndexSet.h>
Class CStringIndexSet is the topmost implementation of an index set. Its main functions deal with searching for strings in the index and retrieving their occurrences. In addition, this class inherits from CIndexSetForLoadingStage and CIndexSetForQueryingStage, and therefore provides a connection between them during the load phase (for example, CStringIndexSet::ConvertLoadIndexToWorkingIndex).
CStringIndexSet::CStringIndexSet | ( | const CStringIndexator * | pParent | ) |
References m_StorageFile.
CStringIndexSet::~CStringIndexSet | ( | ) |
bool CStringIndexSet::ConvertLoadIndexToWorkingIndex | ( | ) | [private] |
convert a temporary index set to the working index
References CIndexSetForQueryingStage::AddOneIndexItem(), CItemIndexForLoading::FreeOccurs(), CIndexSetForQueryingStage::GetOccursFileName(), CStringIndexator::GetSearchPeriod(), CStringIndexator::GetSearchPeriodsCount(), CItemIndexForLoading::InitOccurs(), CIndexSetForQueryingStage::m_EndPeriodOffsets, CIndexSetForQueryingStage::m_Index, CIndexSetForLoadingStage::m_MainOccurTempFileName, CIndexSetForQueryingStage::m_pParent, CItemIndexForLoading::ReadFromTemporalFile(), and CIndexSetForQueryingStage::WritePeriodsDivision().
Referenced by WriteToFile().
string CStringIndexSet::GetStorageFileName | ( | ) | const [private] |
return file name for storage
References GetName(), CStringIndexator::m_Path, CIndexSetForQueryingStage::m_pParent, and MakeFName().
Referenced by ConvertTempStorageToPersistent(), CreateUnionTokenStorage(), DestroyIndexSet(), and OpenStorageFile().
string CStringIndexSet::GetLeftBigramsFileName | ( | ) | const [private] |
return file name for left bigrams
References GetName(), CStringIndexator::m_Path, CIndexSetForQueryingStage::m_pParent, and MakeFName().
bool CStringIndexSet::CreateUnionTokenStorage | ( | const CStringIndexSet & | I1, | |
const CStringIndexSet & | I2, | |||
const map< DWORD, DWORD > & | First2Result, | |||
const map< DWORD, DWORD > & | Second2Result | |||
) | [private] |
make concatenation of two storages
References CloseStorageFile(), GetStorageFileName(), CIndexSetForLoadingStage::m_bUseItemStorage, OpenStorageFile(), and SaveOnePartOfUnionTokenStorage().
Referenced by UnionIndexSet().
bool CStringIndexSet::SaveOnePartOfUnionTokenStorage | ( | FILE * | res_fp, | |
const map< DWORD, DWORD > & | Old2New | |||
) | const [private] |
save one part of token storage to the common file (called from CreateUnionTokenStorage)
References FSeek(), and m_StorageFile.
Referenced by CreateUnionTokenStorage().
bool CStringIndexSet::OpenStorageFile | ( | ) | [private] |
open storage file
References CloseStorageFile(), GetStorageFileName(), and m_StorageFile.
Referenced by CreateUnionTokenStorage(), ReadFromTheDisk(), and WriteToFile().
void CStringIndexSet::CloseStorageFile | ( | ) | [private] |
close storage file
References m_StorageFile.
Referenced by CreateUnionTokenStorage(), DestroyIndexSet(), OpenStorageFile(), and ~CStringIndexSet().
string CStringIndexSet::GetName | ( | ) | const [private, virtual] |
return m_Name (an implementation of the pure virtual member CIndexSetForLoadingStage::GetName)
Implements CIndexSetForQueryingStage.
References m_Name.
Referenced by FindChunkOccurrences(), FindOccurrences(), GetLeftBigramsFileName(), and GetStorageFileName().
bool CStringIndexSet::ConvertTempStorageToPersistent | ( | ) | [private] |
converts the temporary index storage to a persistent one (replacing each reference to m_StringBuffer with an index item number)
References GetStorageFileName(), CIndexSetForLoadingStage::m_bUseItemStorage, CIndexSetForQueryingStage::m_Index, CIndexSetForLoadingStage::m_TempStorageFile, and CIndexSetForLoadingStage::m_TempStorageFileName.
Referenced by WriteToFile().
bool CStringIndexSet::DumpBigramsOfOneDirection | ( | BigramDirectionEnum | bigram_direc | ) | const [private] |
References bdLeftBigram, GetIndexItemStr(), m_BigramsIndex, CIndexSetForLoadingStage::m_bUseItemStorage, CSecondWord::m_Distance, CSecondWord::m_EndOfSentenceFlag, CIndexBigram::m_FirstWords, CIndexSetForQueryingStage::m_Index, CIndexSetForBigrams::m_LeftBigrams, CIndexSetForBigrams::m_RightBigrams, CSecondWord::m_SecondWordId, CIndexSetForBigrams::ReadAllOccurrences(), and CIndexBigram::ReadSecondWords().
Referenced by DumpBigrams().
const char* CStringIndexSet::GetIndexItemStr | ( | const T & | W | ) | const [inline] |
this function returns the string (which was indexed by DDC) for an instance of CIndexItem or CItemIndexForLoading
References CIndexSetForLoadingStage::m_StringBuffer.
Referenced by CConcIndexator::CreateMorphIndex(), DumpBigramsOfOneDirection(), DumpStorage(), FindChunkOccurrences(), FindOccurrences(), CConcHolder::GetHitIds(), GetTokensFromStorage(), QueryTokenList(), QueryTokenListUsingRegExp(), QueryTokenListWithRightTruncation(), and UnionIndexSet().
void CStringIndexSet::InitIndexSet | ( | string | Name, | |
string | ShortName, | |||
bool | bCreateItemStorage, | |||
bool | bCompress | |||
) |
initialize all class slots
References CIndexSetForQueryingStage::m_bCompressOccurrences, CIndexSetForLoadingStage::m_bUseItemStorage, m_Name, and m_ShortName.
Referenced by CStringIndexator::RegisterChunkIndex(), and CStringIndexator::RegisterStringIndices().
bool CStringIndexSet::ReadFromTheDisk | ( | ) |
read index from the disk
References CIndexSetForQueryingStage::GetFileNameForInfos(), CIndexSetForBigrams::LoadIndexSet(), CIndexSetForQueryingStage::LoadIndexSet(), m_BigramsIndex, CIndexSetForLoadingStage::m_bUseItemStorage, CIndexSetForLoadingStage::m_StringBuffer, OpenStorageFile(), ReadVector(), and CIndexSetForLoadingStage::UseBigrams().
bool CStringIndexSet::DestroyIndexSet | ( | ) |
clears all vectors of the index and removes the index files
Reimplemented from CIndexSetForQueryingStage.
References ClearVector(), CloseStorageFile(), CIndexSetForBigrams::DestroyIndexSet(), CIndexSetForQueryingStage::DestroyIndexSet(), FileExists(), GetStorageFileName(), m_BigramsIndex, CIndexSetForLoadingStage::m_StringBuffer, and CIndexSetForLoadingStage::UseBigrams().
Referenced by CConcIndexator::CreateMorphIndex().
bool CStringIndexSet::WriteToFile | ( | bool | bAfterLoading | ) |
write index to file
References CIndexSetForLoadingStage::CloseTempBigramsFile(), ConvertLoadIndexToWorkingIndex(), CIndexSetForBigrams::ConvertTempBigramsToPersistent(), ConvertTempStorageToPersistent(), CIndexSetForLoadingStage::DeleteTempFiles(), CIndexSetForQueryingStage::GetFileNameForInfos(), CIndexSetForQueryingStage::GetOccHdrFileName(), CIndexSetForBigrams::LoadIndexSet(), m_BigramsIndex, CIndexSetForLoadingStage::m_bUseItemStorage, CIndexSetForQueryingStage::m_Index, CIndexSetForLoadingStage::m_StringBuffer, CIndexSetForLoadingStage::m_TempBigramsFileName, OpenStorageFile(), CIndexSetForLoadingStage::UseBigrams(), and WriteVector().
Referenced by CConcIndexator::CreateMorphIndex().
void CStringIndexSet::UnionIndexSet | ( | const CStringIndexSet & | I1, | |
const CStringIndexSet & | I2, | |||
const CTokenNo | EndToken1, | |||
const CTokenNo | EndToken2 | |||
) |
build the union of two indices; throws CExpc if it fails
References CIndexSetForLoadingStage::AddItemStrToBuffer(), CIndexSetForQueryingStage::AddOneIndexItem(), CreateUnionTokenStorage(), CItemIndexForLoading::FreeOccurs(), GetIndexItemStr(), CIndexItem::GetMaximalNumberOfRunningTokens(), CItemIndexForLoading::GetOccurs(), CIndexSetForQueryingStage::GetOccursFileName(), CItemIndexForLoading::GetOccursSize(), LessIndexString2< IndexType >::Greater(), CItemIndexForLoading::InitOccurs(), CIndexSetForLoadingStage::m_bUseItemStorage, CIndexSetForQueryingStage::m_Index, CIndexSetForLoadingStage::m_StringBuffer, CIndexSetForQueryingStage::ReadAllOccurrences(), CItemIndexForLoading::SetIndexItemOffset(), and CIndexSetForQueryingStage::WritePeriodsDivision().
bool CStringIndexSet::GetTokensFromStorage | ( | const size_t | start_offset, | |
const size_t | end_offset, | |||
vector< COutputToken > & | Tokens | |||
) | const |
return the sequence of tokens (strings) in the range [start_offset, end_offset]
References FSeek(), GetIndexItemStr(), CIndexSetForLoadingStage::m_bUseItemStorage, CIndexSetForQueryingStage::m_Index, and m_StorageFile.
Referenced by CConcHolder::GetFileSnippets(), CConcHolder::GetTokensFromStorageByBreak(), and CConcHolder::SaveOccurrences().
void CStringIndexSet::FindOccurrences | ( | const vector< DWORD > & | IndexItems, | |
const size_t | PeriodNo, | |||
vector< CTokenNo > & | occurrences, | |||
CMyTimeSpanHolder & | Profilerp, | |||
CShortOccurCacheMap * | pCaches, | |||
vector< int > & | CacheIds | |||
) | const |
find all occurrences of index items in the subcorpus PeriodNo, using cache pCaches
References CIndexSetForQueryingStage::AddOccurs(), CIndexItem::GetEndOccurOffset(), GetIndexItemStr(), GetName(), CIndexSetForQueryingStage::GetStartOccurNo(), CIndexItem::HasOneOccurrence(), CIndexSetForQueryingStage::m_Index, and SortOccurrences().
Referenced by CQueryTokenNode::EvaluateWithoutHits().
void CStringIndexSet::FindChunkOccurrences | ( | const vector< DWORD > & | IndexItems, | |
vector< CTokenNo > & | occurrences, | |||
vector< size_t > & | ChunkLengths, | |||
size_t | PeriodNo, | |||
CMyTimeSpanHolder & | Profilerp, | |||
CShortOccurCacheMap * | pCaches, | |||
vector< int > & | CacheIds | |||
) | const |
find all occurrences of index items in the subcorpus PeriodNo, using cache pCaches (if occurrences are written by chunks)
References CIndexSetForQueryingStage::AddOccurs(), GetIndexItemStr(), GetName(), CIndexSetForQueryingStage::GetStartOccurNo(), and CIndexSetForQueryingStage::m_Index.
Referenced by CQueryTokenNode::EvaluateWithoutHits().
void CStringIndexSet::QueryTokenList | ( | const string & | WordForm, | |
vector< DWORD > & | MatchWords | |||
) | const |
search for a string "WordForm", and add it to "MatchWords", if it is found
References GetIndexItemStr(), CIndexSetForQueryingStage::m_Index, CStringIndexator::m_MaxRegExpExpansionSize, CIndexSetForQueryingStage::m_pParent, and CIndexSetForLoadingStage::m_StringBuffer.
Referenced by CQueryTokenNode::CreateFileList(), CQueryTokenNode::CreateNodeByIndexName(), and CQueryTokenNode::CreateTokenPattern().
void CStringIndexSet::QueryTokenListWithRightTruncation | ( | const string & | WordForm, | |
vector< DWORD > & | MatchWords | |||
) | const |
search for all strings that start with "WordForm", and add them to "MatchWords"
References GetIndexItemStr(), CIndexSetForQueryingStage::m_Index, CStringIndexator::m_MaxRegExpExpansionSize, CIndexSetForQueryingStage::m_pParent, and CIndexSetForLoadingStage::m_StringBuffer.
Referenced by CQueryTokenNode::CreateTokenPattern().
void CStringIndexSet::QueryTokenListUsingRegExp | ( | RML_RE & | RegExp, | |
vector< DWORD > & | MatchWords | |||
) | const |
search for all index items that satisfy the regular expression "RegExp", and add them to "MatchWords"
References GetIndexItemStr(), CIndexSetForQueryingStage::m_Index, CStringIndexator::m_MaxRegExpExpansionSize, CIndexSetForQueryingStage::m_pParent, and RML_RE::PartialMatch().
Referenced by CQueryTokenNode::BuildRegExp().
bool CStringIndexSet::DumpStorage | ( | ) | const |
print the string representation of the whole storage to stdout
References FSeek(), GetIndexItemStr(), CIndexSetForLoadingStage::m_bUseItemStorage, CIndexSetForQueryingStage::m_Index, and m_StorageFile.
bool CStringIndexSet::DumpBigrams | ( | ) | const |
print bigrams
References bdLeftBigram, bdRightBigram, DumpBigramsOfOneDirection(), and CExpc::m_strCause.
FILE* CStringIndexSet::m_StorageFile [private] |
a file for index storage
Referenced by CloseStorageFile(), CStringIndexSet(), DumpStorage(), GetTokensFromStorage(), OpenStorageFile(), and SaveOnePartOfUnionTokenStorage().
string CStringIndexSet::m_Name |
the main name of the index set, for example "Token", "MorphPattern", "Thes", "Chunk"...
Referenced by CQueryTokenNode::CreateNodeByIndexName(), GetName(), and InitIndexSet().
string CStringIndexSet::m_ShortName |
a short name of the index set, for example "m", "w", "t", "c"
Referenced by CConcHolder::BuildJsonContextString(), and InitIndexSet().