#include <IndexSetForQueryingStage.h>
This class is the part of the CStringIndexSet class that is used only during querying. It contains the important serialization primitives.
CIndexSetForQueryingStage::CIndexSetForQueryingStage(const CStringIndexator *pParent)
References m_OccursFp, and m_pParent.
CIndexSetForQueryingStage::~CIndexSetForQueryingStage()
void CIndexSetForQueryingStage::CloseOccursFile() [private]
close m_OccursFp
References m_OccursFp.
Referenced by DestroyIndexSet(), LoadIndexSet(), and ~CIndexSetForQueryingStage().
bool CIndexSetForQueryingStage::LoadPeriodDevision() [private]
load all period divisions to m_EndPeriodOffsets
References FileSize(), GetPeriodsDevisionFileName(), CStringIndexator::GetSearchPeriodsCount(), m_EndPeriodOffsets, m_pParent, CExpc::m_strCause, and ReadVectorInner().
Referenced by LoadIndexSet().
void CIndexSetForQueryingStage::AssertHasPath() const [protected]
assert that the project path (CStringIndexator::m_Path) is initialized; the function is declared void, so it does not return a value
References ErrorMessage(), errUnknownPath, CStringIndexator::m_Path, and m_pParent.
Referenced by DestroyIndexSet(), GetFileNameForInfos(), GetOccHdrFileName(), GetOccurrsFileSize(), GetOccursFileName(), and GetPeriodsDevisionFileName().
void CIndexSetForQueryingStage::AddOccurs(
    size_t IndexItemNo,
    const bool bOneOccurrence,
    const size_t StartOccurNo,
    const size_t EndOccurNo,
    vector< CTokenNo > & Occurs,
    size_t PeriodNo,
    COccurrBuffer & OccursBuffer,
    CShortOccurCache * pCacheByIndexSet,
    int & CacheId
) const [protected]
a function for reading occurrences for one index item
References CShortOccurCache::AddNewIndexItemNoToCache(), CShortOccurCache::CouldContainMore(), DearchiveOccurrences(), CShortOccurCache::GetOccurrencesFromCache(), CStringIndexator::GetSearchPeriod(), CStringIndexator::GetSearchPeriodsCount(), m_bCompressOccurrences, m_EndPeriodOffsets, m_OccursFp, m_pParent, MaxCompressRatio, OccurBufferSize, and ReadOccurrences().
Referenced by CStringIndexSet::FindChunkOccurrences(), CStringIndexSet::FindOccurrences(), CIndexSetForBigrams::FindOccurrencesForBigrams(), ReadAllOccurrences(), and CIndexSetForBigrams::ReadAllOccurrences().
string CIndexSetForQueryingStage::GetOccursFileName() const [protected]
return the name of the occurrences file
References AssertHasPath(), GetName(), CStringIndexator::m_Path, m_pParent, and MakeFName().
Referenced by CStringIndexSet::ConvertLoadIndexToWorkingIndex(), CIndexSetForBigrams::ConvertTempBigramsToPersistent(), DestroyIndexSet(), GetOccurrsFileSize(), LoadIndexSet(), and CStringIndexSet::UnionIndexSet().
string CIndexSetForQueryingStage::GetOccHdrFileName() const [protected]
return the name of the file for m_Index
References AssertHasPath(), GetName(), CStringIndexator::m_Path, m_pParent, and MakeFName().
Referenced by DestroyIndexSet(), LoadIndexSet(), and CStringIndexSet::WriteToFile().
string CIndexSetForQueryingStage::GetPeriodsDevisionFileName() const [protected]
return the name of the file for the period division of the occurrences
References AssertHasPath(), GetName(), CStringIndexator::m_Path, m_pParent, and MakeFName().
Referenced by DestroyIndexSet(), LoadPeriodDevision(), and WritePeriodsDivision().
string CIndexSetForQueryingStage::GetFileNameForInfos() const [protected]
return the name of the file for CIndexSetForLoadingStage::m_StringBuffer
References AssertHasPath(), GetName(), CStringIndexator::m_Path, m_pParent, and MakeFName().
Referenced by DestroyIndexSet(), CStringIndexSet::ReadFromTheDisk(), and CStringIndexSet::WriteToFile().
file_off_t CIndexSetForQueryingStage::GetOccurrsFileSize() const [protected]
return the size of the file for occurrences
References AssertHasPath(), FileSize(), and GetOccursFileName().
size_t CIndexSetForQueryingStage::GetStartOccurNo(size_t IndexNo) const [protected]
get the offset of the first occurrence of index item number IndexNo in the occurrences file (m_OccursFp)
References m_Index.
Referenced by CStringIndexSet::FindChunkOccurrences(), CStringIndexSet::FindOccurrences(), and ReadAllOccurrences().
bool CIndexSetForQueryingStage::BuildPeriodsDivisionAndCompress(
    const DWORD TokenId,
    vector< CTokenNo > & InputTokens
) [protected]
build a period division for one index item
References CompressPortion(), GetName(), CStringIndexator::GetSearchPeriod(), CStringIndexator::GetSearchPeriodsCount(), m_bCompressOccurrences, m_EndPeriodOffsets, m_pParent, and OccurBufferSize.
Referenced by AddOneIndexItem(), and CIndexSetForBigrams::WriteBigramAndItsOccurs().
bool CIndexSetForQueryingStage::AddOneIndexItem(
    CItemIndexForLoading & M,
    FILE * res_fp,
    size_t & CurrPositionInResFile,
    const CTokenNo EndTokeNo
) [protected]
write one index item to result file
References CIndexItem::AddItemIndexFlags(), BuildPeriodsDivisionAndCompress(), CItemIndexForLoading::CheckOccurrences(), ErrorMessage(), Format(), CItemIndexForLoading::GetIndexItemOffset(), GetName(), CItemIndexForLoading::GetOccurs(), CItemIndexForLoading::GetOccursSize(), m_Index, CIndexItem::SetEndOccurOffset(), CIndexItem::SetIndexItemOffsetAndFlags(), TheOnlyOccurIsInEndOccurNo, and CItemIndexForLoading::WriteOccurrences().
Referenced by CStringIndexSet::ConvertLoadIndexToWorkingIndex(), and CStringIndexSet::UnionIndexSet().
bool CIndexSetForQueryingStage::WritePeriodsDivision() [protected]
write index item's period division to disk
References GetPeriodsDevisionFileName(), CStringIndexator::GetSearchPeriodsCount(), m_EndPeriodOffsets, m_pParent, and WriteVectorInner().
Referenced by CStringIndexSet::ConvertLoadIndexToWorkingIndex(), CIndexSetForBigrams::ConvertTempBigramsToPersistent(), and CStringIndexSet::UnionIndexSet().
virtual string CIndexSetForQueryingStage::GetName() const [pure virtual]
return the name of the index (CStringIndexSet::m_Name)
Implemented in CIndexSetForBigrams, and CStringIndexSet.
Referenced by AddOneIndexItem(), BuildPeriodsDivisionAndCompress(), GetFileNameForInfos(), GetOccHdrFileName(), GetOccursFileName(), and GetPeriodsDevisionFileName().
bool CIndexSetForQueryingStage::LoadIndexSet(bool bLoadHeaderOfOccurrences = true)
load index set from binaries
References CloseOccursFile(), GetOccHdrFileName(), GetOccursFileName(), LoadPeriodDevision(), m_Index, m_OccursFp, and ReadVector().
Referenced by CConcIndexator::CreateMorphIndex(), and CStringIndexSet::ReadFromTheDisk().
bool CIndexSetForQueryingStage::DestroyIndexSet()
destroy index set and remove index files
Reimplemented in CIndexSetForBigrams, and CStringIndexSet.
References AssertHasPath(), ClearVector(), CloseOccursFile(), FileExists(), GetFileNameForInfos(), GetOccHdrFileName(), GetOccursFileName(), GetPeriodsDevisionFileName(), m_EndPeriodOffsets, and m_Index.
Referenced by CStringIndexSet::DestroyIndexSet(), and CIndexSetForBigrams::DestroyIndexSet().
void CIndexSetForQueryingStage::ReadAllOccurrences(
    size_t IndexItemNo,
    vector< CTokenNo > & Occurs
) const
reads all occurrences of IndexItemNo (this function can allocate much memory; it should be used carefully)
References AddOccurs(), CStringIndexator::GetSearchPeriodsCount(), GetStartOccurNo(), m_Index, and m_pParent.
Referenced by CConcIndexator::CreateMorphIndex(), and CStringIndexSet::UnionIndexSet().
FILE* CIndexSetForQueryingStage::m_OccursFp [private]
the main file of occurrences
Referenced by AddOccurs(), CIndexSetForQueryingStage(), CloseOccursFile(), and LoadIndexSet().
m_Index: the main index (from strings to the ordered list of their occurrences)
Referenced by AddOneIndexItem(), CStringIndexSet::ConvertLoadIndexToWorkingIndex(), CStringIndexSet::ConvertTempStorageToPersistent(), CConcIndexator::CreateMorphIndex(), DestroyIndexSet(), CStringIndexSet::DumpBigramsOfOneDirection(), CStringIndexSet::DumpStorage(), CStringIndexSet::FindChunkOccurrences(), CStringIndexSet::FindOccurrences(), CConcHolder::GetHitIds(), GetStartOccurNo(), CStringIndexSet::GetTokensFromStorage(), LoadIndexSet(), CStringIndexSet::QueryTokenList(), CStringIndexSet::QueryTokenListUsingRegExp(), CStringIndexSet::QueryTokenListWithRightTruncation(), ReadAllOccurrences(), CStringIndexSet::UnionIndexSet(), and CStringIndexSet::WriteToFile().
m_EndPeriodOffsets: all corpus period divisions for the long occurrence lists
Referenced by AddOccurs(), BuildPeriodsDivisionAndCompress(), CStringIndexSet::ConvertLoadIndexToWorkingIndex(), CIndexSetForBigrams::CreateRightBigrams(), DestroyIndexSet(), LoadPeriodDevision(), and WritePeriodsDivision().
m_pParent: a pointer to the collection of indices, which contains a reference to this index
Referenced by AddOccurs(), AssertHasPath(), BuildPeriodsDivisionAndCompress(), CIndexSetForQueryingStage(), CStringIndexSet::ConvertLoadIndexToWorkingIndex(), GetFileNameForInfos(), CStringIndexSet::GetLeftBigramsFileName(), GetOccHdrFileName(), GetOccursFileName(), GetPeriodsDevisionFileName(), CIndexSetForBigrams::GetRightToLeftPerdiv(), CStringIndexSet::GetStorageFileName(), LoadPeriodDevision(), CStringIndexSet::QueryTokenList(), CStringIndexSet::QueryTokenListUsingRegExp(), CStringIndexSet::QueryTokenListWithRightTruncation(), ReadAllOccurrences(), CIndexSetForBigrams::ReadAllOccurrences(), and WritePeriodsDivision().
m_bCompressOccurrences: if true, then the occurrences should be compressed (up to 30% for huge corpora)
Referenced by AddOccurs(), BuildPeriodsDivisionAndCompress(), CIndexSetForBigrams::CIndexSetForBigrams(), and CStringIndexSet::InitIndexSet().