ddc
IndexSetForQueryingStage.h
Go to the documentation of this file.
1 //-*- Mode: C++ -*-
2 //
3 // DDC originally by Alexey Sokirko
4 // Changes and modifications 2011-2018 by Bryan Jurish
5 //
6 // This file is part of DDC.
7 //
8 // DDC is free software: you can redistribute it and/or modify
9 // it under the terms of the GNU Lesser General Public License as published by
10 // the Free Software Foundation, either version 3 of the License, or
11 // (at your option) any later version.
12 //
13 // DDC is distributed in the hope that it will be useful,
14 // but WITHOUT ANY WARRANTY; without even the implied warranty of
15 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 // GNU Lesser General Public License for more details.
17 //
18 // You should have received a copy of the GNU Lesser General Public License
19 // along with DDC. If not, see <http://www.gnu.org/licenses/>.
20 //
21 #ifndef IndexSetForQueryingStage_h
22 #define IndexSetForQueryingStage_h
23 
24 
25 #include "../CommonLib/bserialize.h"
26 #include "../CommonLib/ddcThread.h"
27 #include "../CommonLib/ddcMMap.h"
28 
30 // PeriodsDivisionMap : type for mapping an index item no to its \ref perdiv_def "period division"
31 
33 //typedef map<DWORD, vector<DWORD> > PeriodsDivisionMap__OLD
37 
38 
39 
41 // Flags for CIndexItem::m_IndexItemOffsetAndFlags
42 
44 /*-- 2018-08-15: EndOfSentenceForBigrams flag is obsolete --*/
45 //const DWORD EndOfSentenceForBigrams = 1<<30;
46 //const DWORD AllFlags = TheOnlyOccurIsInEndOccurNo | EndOfSentenceForBigrams;
48 
// CIndexItem: one fixed-size index entry. One DWORD packs a string-buffer
// offset together with flag bits (the bits selected by AllFlags are flags,
// all remaining bits are the offset); a second DWORD holds the upper-bound
// occurrence offset. Member declarations and method headers that were
// missing from this listing are restored at their documented line numbers.
54 class CIndexItem {
55  // pointer to CStringIndexSet::m_StringBuffer and flags;
56  DWORD m_IndexItemOffsetAndFlags;
57 
58  // upper bound offset of occurrences in CIndexSetForQueryingStage::m_OccursFp
59  DWORD m_EndOccurOffset;
60 
61 public:
62  // return m_IndexItemOffsetAndFlags for serialization
63  DWORD GetIndexItemOffsetAndFlags() const {
64  return m_IndexItemOffsetAndFlags;
65  };
66 
67  // set m_IndexItemOffsetAndFlags for serialization
68  void SetIndexItemOffsetAndFlags(DWORD Value) {
69  m_IndexItemOffsetAndFlags = Value;
70  };
71 
72  static size_t GetMaximalNumberOfRunningTokens() { // every bit outside the AllFlags mask set
73  return ~0 & ~AllFlags;
74  };
75 
76  // returns a reference of this index item to CStringIndexSet::m_StringBuffer
77  DWORD GetIndexItemOffset() const {
78  return m_IndexItemOffsetAndFlags & ~AllFlags;
79  };
80 
81  // sets a reference of this index item to CStringIndexSet::m_StringBuffer
82  void SetIndexItemOffset(DWORD Value) {
83  m_IndexItemOffsetAndFlags &= AllFlags; // keep the flag bits, clear the old offset
84  m_IndexItemOffsetAndFlags |= Value & ~AllFlags; // bits of Value inside the flag mask are dropped
85  };
86 
87  // returns flags of this index item
88  DWORD GetItemIndexFlags() const {
89  return m_IndexItemOffsetAndFlags & AllFlags;
90  };
91 
92  // set flags of this index item
93  void AddItemIndexFlags(DWORD Value) {
94  m_IndexItemOffsetAndFlags |= Value & AllFlags; // only ORs bits in; never clears a flag
95  };
96 
97  // return the upper bound offset of occurrences in CIndexSetForQueryingStage::m_OccursFp
98  DWORD GetEndOccurOffset() const {
99  return m_EndOccurOffset;
100  };
101 
102  // set the upper bound offset of occurrences in CIndexSetForQueryingStage::m_OccursFp
103  void SetEndOccurOffset(DWORD EndOccurOffset) {
104  m_EndOccurOffset = EndOccurOffset;
105  };
106 
107  bool HasOneOccurrence() const { // true when the TheOnlyOccurIsInEndOccurNo flag bit is set
108  return (m_IndexItemOffsetAndFlags & TheOnlyOccurIsInEndOccurNo) != 0;
109  }
110 };
111 
// Serialized size of a CIndexItem in bytes: always 8; the argument is not
// inspected. Presumably this is the two DWORD words that save_to_bytes()
// writes (DWORD is uint32_t per utilit.h) -- TODO confirm against bserialize.h.
112 inline size_t get_size_in_bytes(const CIndexItem &t) {
113  return 8;
114 };
115 
// Serializes a CIndexItem into buf: first the combined offset-and-flags
// word, then the end-occurrence offset. Each word is written through the
// save_to_bytes() overload for the getter's return type (declared outside
// this listing), and buf is advanced by whatever that overload returns.
// Returns get_size_in_bytes(i) rather than a sum of the bytes written.
116 inline size_t save_to_bytes(const CIndexItem &i, BYTE *buf) {
117  buf += save_to_bytes(i.GetIndexItemOffsetAndFlags(), buf);
118  buf += save_to_bytes(i.GetEndOccurOffset(), buf); // buf is a by-value parameter, so this last advance is local only
119  return get_size_in_bytes(i);
120 }
121 
// Deserializes a CIndexItem from buf, mirroring save_to_bytes(): the first
// DWORD is the combined offset-and-flags word, the second is the
// end-occurrence offset. Each word is decoded through the restore_from_bytes()
// overload for DWORD (declared outside this listing).
// Returns get_size_in_bytes(i).
122 inline size_t restore_from_bytes(CIndexItem &i, const BYTE *buf) {
123  DWORD d;
124 
125  buf += restore_from_bytes(d, buf);
126  i.SetIndexItemOffsetAndFlags(d); // store the first word before d is reused; otherwise it is read and then overwritten
127 
128  buf += restore_from_bytes(d, buf);
129  i.SetEndOccurOffset(d);
130 
131  return get_size_in_bytes(i);
132 }
133 
135 // suffix-index
136 
141 
142 
144 // forward decls
145 
146 class CStringIndexator;
147 
149 
151 
157 
159  bool LoadPeriodDivision();
160 
163  void ReadOccurrences(CTokenNo *OutBuffer, file_off_t FilePosition, size_t Count) const;
164 
165 protected:
166 
168  void AssertHasPath() const;
169 
171  void AddOccurs(size_t IndexItemNo, const bool bOneOccurrence, const size_t StartOccurNo, const size_t EndOccurNo,
172  vector<CTokenNo> &Occurs, size_t PeriodNo, COccurrBuffer &OccursBuffer,
173  CShortOccurCache *pCacheByIndexSet, int &CacheId) const;
174 
176  string GetOccursFileName() const;
177 
179  string GetOccHdrFileName() const;
180 
182  string GetSuffixFileName() const;
183 
184 
186  string GetPeriodsDivisionFileName() const;
187 
189  string GetFileNameForInfos() const;
190 
192  file_off_t GetOccurrsFileSize() const;
193 
195  size_t GetStartOccurNo(size_t IndexNo) const;
196 
198  bool BuildPeriodsDivisionAndCompress(const DWORD TokenId, vector<CTokenNo> &InputTokens);
199 
201  bool
202  AddOneIndexItem(CItemIndexForLoading &M, FILE *res_fp, size_t &CurrPositionInResFile, const CTokenNo EndTokeNo);
203 
205  bool WritePeriodsDivision();
206 
208  bool LoadIndexSet(bool bLoadHeaderOfOccurrences = true);
209 
210 
211 public:
214 
217 
220 
223 
226 
227 
229 
230  virtual ~CIndexSetForQueryingStage();
231 
233  virtual string GetName() const = 0;
234 
236  bool DestroyIndexSet();
237 
239  void ReadAllOccurrences(size_t IndexItemNo, vector<CTokenNo> &Occurs) const;
240 
241 };
242 
243 
244 #endif
245 
246 /*--- emacs style variables ---
247  * Local Variables:
248  * mode: C++
249  * c-file-style: "ellemtel"
250  * c-basic-offset: 4
251  * tab-width: 8
252  * indent-tabs-mode: nil
253  * End:
254  */
vector< CTokenNo > COccurrBuffer
a type for holding occurrences during reading from the disk
Definition: ConcCommon.h:480
map< DWORD, VecT > MapT
Definition: ddcMMap.h:591
void SetIndexItemOffset(DWORD Value)
sets a reference of this index item to CStringIndexSet::m_StringBuffer
Definition: IndexSetForQueryingStage.h:82
ddcVecFile< CIndexItem > m_Index
the main index (from strings to the ordered list of their occurrences)
Definition: IndexSetForQueryingStage.h:213
QWORD file_off_t
Definition: utilit.h:179
ddcVecFile< DWORD > CSuffixIndex
Definition: IndexSetForQueryingStage.h:140
Definition: StringIndexator.h:121
void SetEndOccurOffset(DWORD EndOccurOffset)
set the upper bound offset of occurrences in CIndexSetForQueryingStage::m_OccursFp ...
Definition: IndexSetForQueryingStage.h:103
Definition: ddcMMap.h:733
bool HasOneOccurrence() const
Definition: IndexSetForQueryingStage.h:107
static size_t GetMaximalNumberOfRunningTokens()
Definition: IndexSetForQueryingStage.h:72
size_t save_to_bytes(const CIndexItem &i, BYTE *buf)
Definition: IndexSetForQueryingStage.h:116
DWORD m_EndOccurOffset
Definition: IndexSetForQueryingStage.h:59
Definition: ConcCommon.h:85
DWORD m_IndexItemOffsetAndFlags
Definition: IndexSetForQueryingStage.h:56
void AddItemIndexFlags(DWORD Value)
set flags of this index item
Definition: IndexSetForQueryingStage.h:93
const CStringIndexator * m_pParent
a pointer to the collection of indices, which contains a reference to this index
Definition: IndexSetForQueryingStage.h:222
DWORD GetIndexItemOffset() const
returns a reference of this index item to CStringIndexSet::m_StringBuffer
Definition: IndexSetForQueryingStage.h:77
PeriodsDivisionMapT::MapT PeriodsDivisionMapR
low-level resident std::map<> implementation in PeriodsDivisionMap::m_map
Definition: IndexSetForQueryingStage.h:35
size_t restore_from_bytes(CIndexItem &i, const BYTE *buf)
Definition: IndexSetForQueryingStage.h:122
ddcMapFile< DWORD, DWORD > PeriodsDivisionMapT
a type for mapping an index item no to its period division
Definition: IndexSetForQueryingStage.h:34
const DWORD TheOnlyOccurIsInEndOccurNo
Definition: IndexSetForQueryingStage.h:43
DWORD GetItemIndexFlags() const
returns flags of this index item
Definition: IndexSetForQueryingStage.h:88
DWORD GetEndOccurOffset() const
return the upper bound offset of occurrences in CIndexSetForQueryingStage::m_OccursFp ...
Definition: IndexSetForQueryingStage.h:98
Definition: IndexSetForQueryingStage.h:54
DWORD GetIndexItemOffsetAndFlags() const
return m_IndexItemOffsetAndFlags for serialization
Definition: IndexSetForQueryingStage.h:63
void SetIndexItemOffsetAndFlags(DWORD Value)
set m_IndexItemOffsetAndFlags for serialization
Definition: IndexSetForQueryingStage.h:68
const DWORD AllFlags
Definition: IndexSetForQueryingStage.h:47
bool m_bCompressOccurrences
if true, then the occurrences should be compressed (up to 30% for huge corpora)
Definition: IndexSetForQueryingStage.h:225
PeriodsDivisionMapT m_EndPeriodOffsets
all corpus period divisions for the long occurrence lists
Definition: IndexSetForQueryingStage.h:219
unsigned char BYTE
Definition: utilit.h:94
size_t get_size_in_bytes(const CIndexItem &t)
Definition: IndexSetForQueryingStage.h:112
CItemIndexForLoading holds all occurrences of one index item (token, morph. pattern...) on the index stage.
Definition: IndexSetForLoadingStage.h:46
DWORD CTokenNo
integer type CTokenNo is used to refer to the index of a token in the corpus
Definition: ConcCommon.h:63
uint32_t DWORD
Definition: utilit.h:105
Definition: IndexSetForQueryingStage.h:154
Definition: ddcMMap.h:382
ddcFileOrMMap m_OccursFp
the main file of occurrences
Definition: IndexSetForQueryingStage.h:156
CSuffixIndex m_rIndex
optional auxiliary index for suffix-queries; ItemIds lexicographically sorted by reverse string-value...
Definition: IndexSetForQueryingStage.h:216
PeriodsDivisionMapT::RecFileT PeriodsDivisionMapV
low-level virtual mmap()-based implementation in PeriodsDivisionMap::m_rfile
Definition: IndexSetForQueryingStage.h:36