21 #ifndef IndexSetForQueryingStage_h 22 #define IndexSetForQueryingStage_h 25 #include "../CommonLib/bserialize.h" 26 #include "../CommonLib/ddcThread.h" 27 #include "../CommonLib/ddcMMap.h" 69 m_IndexItemOffsetAndFlags = Value;
78 return m_IndexItemOffsetAndFlags & ~
AllFlags;
83 m_IndexItemOffsetAndFlags &=
AllFlags;
84 m_IndexItemOffsetAndFlags |= Value & ~
AllFlags;
89 return m_IndexItemOffsetAndFlags &
AllFlags;
94 m_IndexItemOffsetAndFlags |= Value &
AllFlags;
104 m_EndOccurOffset = EndOccurOffset;
159 bool LoadPeriodDivision();
163 void ReadOccurrences(
CTokenNo *OutBuffer,
file_off_t FilePosition,
size_t Count)
const;
168 void AssertHasPath()
const;
171 void AddOccurs(
size_t IndexItemNo,
const bool bOneOccurrence,
const size_t StartOccurNo,
const size_t EndOccurNo,
172 vector<CTokenNo> &Occurs,
size_t PeriodNo,
COccurrBuffer &OccursBuffer,
176 string GetOccursFileName()
const;
179 string GetOccHdrFileName()
const;
182 string GetSuffixFileName()
const;
186 string GetPeriodsDivisionFileName()
const;
189 string GetFileNameForInfos()
const;
195 size_t GetStartOccurNo(
size_t IndexNo)
const;
198 bool BuildPeriodsDivisionAndCompress(
const DWORD TokenId, vector<CTokenNo> &InputTokens);
205 bool WritePeriodsDivision();
208 bool LoadIndexSet(
bool bLoadHeaderOfOccurrences =
true);
233 virtual string GetName()
const = 0;
236 bool DestroyIndexSet();
239 void ReadAllOccurrences(
size_t IndexItemNo, vector<CTokenNo> &Occurs)
const;
vector< CTokenNo > COccurrBuffer
a type for holding occurrences during reading from the disk
Definition: ConcCommon.h:480
map< DWORD, VecT > MapT
Definition: ddcMMap.h:591
void SetIndexItemOffset(DWORD Value)
sets a reference of this index item to CStringIndexSet::m_StringBuffer
Definition: IndexSetForQueryingStage.h:82
ddcVecFile< CIndexItem > m_Index
the main index (from strings to the ordered list of their occurrences)
Definition: IndexSetForQueryingStage.h:213
QWORD file_off_t
Definition: utilit.h:179
ddcVecFile< DWORD > CSuffixIndex
Definition: IndexSetForQueryingStage.h:140
Definition: StringIndexator.h:121
void SetEndOccurOffset(DWORD EndOccurOffset)
set the upper bound offset of occurrences in CIndexSetForQueryingStage::m_OccursFp ...
Definition: IndexSetForQueryingStage.h:103
Definition: ddcMMap.h:733
bool HasOneOccurrence() const
Definition: IndexSetForQueryingStage.h:107
static size_t GetMaximalNumberOfRunningTokens()
Definition: IndexSetForQueryingStage.h:72
size_t save_to_bytes(const CIndexItem &i, BYTE *buf)
Definition: IndexSetForQueryingStage.h:116
DWORD m_EndOccurOffset
Definition: IndexSetForQueryingStage.h:59
Definition: ConcCommon.h:85
DWORD m_IndexItemOffsetAndFlags
Definition: IndexSetForQueryingStage.h:56
void AddItemIndexFlags(DWORD Value)
set flags of this index item
Definition: IndexSetForQueryingStage.h:93
const CStringIndexator * m_pParent
a pointer to the collection of indices, which contains a reference to this index
Definition: IndexSetForQueryingStage.h:222
DWORD GetIndexItemOffset() const
returns a reference of this index item to CStringIndexSet::m_StringBuffer
Definition: IndexSetForQueryingStage.h:77
PeriodsDivisionMapT::MapT PeriodsDivisionMapR
low-level resident std::map<> implementation in PeriodsDivisionMap::m_map
Definition: IndexSetForQueryingStage.h:35
size_t restore_from_bytes(CIndexItem &i, const BYTE *buf)
Definition: IndexSetForQueryingStage.h:122
ddcMapFile< DWORD, DWORD > PeriodsDivisionMapT
a type for mapping an index item number to its period division
Definition: IndexSetForQueryingStage.h:34
const DWORD TheOnlyOccurIsInEndOccurNo
Definition: IndexSetForQueryingStage.h:43
DWORD GetItemIndexFlags() const
returns flags of this index item
Definition: IndexSetForQueryingStage.h:88
DWORD GetEndOccurOffset() const
return the upper bound offset of occurrences in CIndexSetForQueryingStage::m_OccursFp ...
Definition: IndexSetForQueryingStage.h:98
Definition: IndexSetForQueryingStage.h:54
DWORD GetIndexItemOffsetAndFlags() const
return m_IndexItemOffsetAndFlags for serialization
Definition: IndexSetForQueryingStage.h:63
void SetIndexItemOffsetAndFlags(DWORD Value)
set m_IndexItemOffsetAndFlags for serialization
Definition: IndexSetForQueryingStage.h:68
const DWORD AllFlags
Definition: IndexSetForQueryingStage.h:47
bool m_bCompressOccurrences
if true, then the occurrences should be compressed (up to 30% for huge corpora)
Definition: IndexSetForQueryingStage.h:225
PeriodsDivisionMapT m_EndPeriodOffsets
all corpus period divisions for the long occurrence lists
Definition: IndexSetForQueryingStage.h:219
unsigned char BYTE
Definition: utilit.h:94
size_t get_size_in_bytes(const CIndexItem &t)
Definition: IndexSetForQueryingStage.h:112
CItemIndexForLoading holds all occurrences of one index item (token, morph. pattern...) on the index stage.
Definition: IndexSetForLoadingStage.h:46
DWORD CTokenNo
integer type CTokenNo is used to refer to the index of a token in the corpus
Definition: ConcCommon.h:63
uint32_t DWORD
Definition: utilit.h:105
Definition: IndexSetForQueryingStage.h:154
Definition: ddcMMap.h:382
ddcFileOrMMap m_OccursFp
the main file of occurrences
Definition: IndexSetForQueryingStage.h:156
CSuffixIndex m_rIndex
optional auxiliary index for suffix-queries; ItemIds lexicographically sorted by reverse string-value...
Definition: IndexSetForQueryingStage.h:216
PeriodsDivisionMapT::RecFileT PeriodsDivisionMapV
low-level virtual mmap()-based implementation in PeriodsDivisionMap::m_rfile
Definition: IndexSetForQueryingStage.h:36