ddc
IndexSet.h
Go to the documentation of this file.
1 //*-*- Mode: C++ -*-*/
2 
3 // DDC originally by Alexey Sokirko
4 // Changes and modifications 2011-2018 by Bryan Jurish
5 //
6 // This file is part of DDC.
7 //
8 // DDC is free software: you can redistribute it and/or modify
9 // it under the terms of the GNU Lesser General Public License as published by
10 // the Free Software Foundation, either version 3 of the License, or
11 // (at your option) any later version.
12 //
13 // DDC is distributed in the hope that it will be useful,
14 // but WITHOUT ANY WARRANTY; without even the implied warranty of
15 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 // GNU Lesser General Public License for more details.
17 //
18 // You should have received a copy of the GNU Lesser General Public License
19 // along with DDC. If not, see <http://www.gnu.org/licenses/>.
20 //
21 
22 #ifndef IndexSet_h
23 #define IndexSet_h
24 
25 #include "../ConcordLib/IndexSetForLoadingStage.h"
26 #include "../ConcordLib/IndexSetForQueryingStage.h"
27 #include "../PCRE/pcre_rml.h"
28 
31 struct COutputToken {
33  string m_TokenStr;
35  string m_InterpStr;
38 
39  COutputToken() : m_Highlight(0) {};
40 
41  COutputToken(string WordStr, BYTE Highlight) : m_TokenStr(WordStr), m_Highlight(Highlight) {};
42 
43  bool operator==(const COutputToken &X) const {
44  return m_TokenStr == X.m_TokenStr;
45  };
46 };
47 
48 
// Forward declaration only: this header refers to CStringIndexator through a
// pointer (the constructor parameter of CStringIndexSet), so the full class
// definition is not required here.
48 class CStringIndexator;
50 
51 
60 
61 
// NOTE(review): only declarations are visible in this header, so the notes on
// the members below are inferred from names and signatures -- confirm against
// the implementation file.

// Presumably converts the index built while loading into the form used for
// querying; the bool result presumably reports success.
63  bool ConvertLoadIndexToWorkingIndex();
64 
65 
// Takes the index sets to combine, a vector of token counts (presumably one
// entry per index set) and xlateItemId, a vector of DWORD vectors (presumably
// one item-id translation table per index set -- TODO confirm).
67  void CreateUnionTokenStorages(const vector<CStringIndexSet *> &IndexSets, const vector<size_t> &TokenCounts,
68  const vector<vector<DWORD> > &xlateItemId);
69 
// Presumably opens the storage file of this index set (see GetStorageFileName).
71  void OpenStorageFile();
72 
73 
// Returns a name for this index set as a string by value; does not modify the
// object (const). How it relates to m_Name / m_ShortName is not visible here.
75  string GetName() const;
76 
// Presumably turns a temporary token storage into the persistent one; the bool
// result presumably reports success -- TODO confirm.
78  bool ConvertTempStorageToPersistent();
79 
80 public:
81 
// the main name of the index set, for example "Token", "MorphPattern", "Thes", "Chunk"...
83  string m_Name;
84 
// a short name of the index set, for example "m", "w", "t", "c"
86  string m_ShortName;
87 
89  template<class T>
90  inline const char *GetIndexItemStr(const T &W) const {
91  return m_StringBuffer.m_data + W.GetIndexItemOffset();
92  };
93 
// NOTE(review): only declarations are visible in this header, so the notes on
// the members below are inferred from names and signatures -- confirm against
// the implementation file.

// Constructs an index set for the given parent indexator; the parent is passed
// as a pointer to const.
94  CStringIndexSet(const CStringIndexator *pParent);
95 
// Destructor; what it releases is defined in the implementation.
96  ~CStringIndexSet();
97 
// Returns the storage file name as a string by value; does not modify the
// object (const).
99  string GetStorageFileName() const;
100 
// Presumably closes the storage file of this index set.
102  void CloseStorageFile();
103 
// Initialises the set with its name and short name (both passed by value) and
// two flags: bCreateItemStorage and bCompress. Presumably the names end up in
// m_Name / m_ShortName -- TODO confirm.
105  void InitIndexSet(string Name, string ShortName, bool bCreateItemStorage, bool bCompress);
106 
// Presumably loads the index set from its files on disk.
108  void ReadFromTheDisk();
109 
// Presumably removes the data of this index set; the bool result presumably
// reports success.
111  bool DestroyIndexSet();
112 
// Presumably writes the index set to its file(s); bAfterLoading presumably
// tells whether the call happens right after the loading stage. The bool
// result presumably reports success.
114  bool WriteToFile(bool bAfterLoading);
115 
// NOTE(review): declarations only; semantics below are inferred from names and
// signatures -- confirm against the implementation file.

// Takes the index sets to combine and a vector of token counts (presumably one
// entry per index set); presumably makes this object the union of IndexSets.
117  void UnionIndexSets(const vector<CStringIndexSet *> &IndexSets, const vector<size_t> &TokenCounts);
118 
// Const member that receives the target partitions by non-const reference and
// a vector of CTokenNo values (EndTokenNo); presumably each entry marks where
// one partition ends -- TODO confirm whether the bound is inclusive.
120  void CreateSplitPartitions(vector<CStringIndexSet *> &Partitions, const vector<CTokenNo> &EndTokenNo) const;
121 
// Presumably makes sure a suffix index exists before it is used (for example by
// QueryTokenListWithLeftTruncation) -- TODO confirm.
123  void EnsureSuffixIndex();
124 
// NOTE(review): declarations only; semantics below are inferred from names and
// signatures -- confirm against the implementation file.

// Fills Tokens (output parameter) from the storage range given by start_offset
// and end_offset; whether end_offset is inclusive is not visible here. The bool
// result presumably reports success.
126  bool GetTokensFromStorage(const size_t start_offset, const size_t end_offset, vector<COutputToken> &Tokens) const;
127 
// Returns a DWORD id for the token at corpus position TokenNo (CTokenNo is the
// integer type used to refer to a token in the corpus).
132  DWORD GetTokenIndexId(CTokenNo TokenNo) const;
133 
// Returns a const iterator into a ddcVecFile<CIndexItem> for ValueStr;
// presumably lower-bound semantics (first item not ordered before ValueStr).
138  ddcVecFile<CIndexItem>::const_iterator GetTypeIndexIdLowerBoundIter(const string &ValueStr) const;
139 
// Returns a const iterator into a ddcVecFile<CIndexItem> for ValueStr;
// presumably upper-bound semantics (first item ordered after ValueStr).
144  ddcVecFile<CIndexItem>::const_iterator GetTypeIndexIdUpperBoundIter(const string &ValueStr) const;
145 
// Same lookup as GetTypeIndexIdLowerBoundIter, but presumably returned as a
// DWORD id instead of an iterator.
149  DWORD GetTypeIndexIdLowerBound(const string &ValueStr) const;
150 
// Returns a DWORD id for ValueStr; the value returned when ValueStr is not in
// the index is not visible here -- check the implementation before relying on it.
152  DWORD GetTypeIndexId(const string &ValueStr) const;
153 
// Updates Filter (passed by non-const reference) from the value range
// LoValue..HiValue; the bool result presumably reports whether bounds were found.
155  bool GetContextBounds(CDDCFilterWithBounds &Filter, const string &LoValue, const string &HiValue) const;
156 
// NOTE(review): declarations only; semantics below are inferred from names and
// signatures -- confirm against the implementation file.

// Collects into `occurrences` (output) the token numbers for the given index
// item ids within period PeriodNo. pCaches points to a CShortOccurCacheMap
// (index string -> cached occurrences); whether it may be null is not visible
// here. CacheIds is passed by non-const reference and so may be modified.
158  void FindOccurrences(const vector<DWORD> &IndexItems, const size_t PeriodNo, vector<CTokenNo> &occurrences,
159  CShortOccurCacheMap *pCaches, vector<int> &CacheIds) const;
160 
// Variant of FindOccurrences for chunks: additionally fills ChunkLengths
// (output), presumably with one length per reported occurrence -- TODO confirm.
162  void
163  FindChunkOccurrences(const vector<DWORD> &IndexItems, vector<CTokenNo> &occurrences, vector<DWORD> &ChunkLengths,
164  size_t PeriodNo, CShortOccurCacheMap *pCaches, vector<int> &CacheIds) const;
165 
// Writes into MatchWords (output) the DWORD ids of index items matching
// WordForm; presumably an exact match.
167  void QueryTokenList(const string &WordForm, vector<DWORD> &MatchWords) const;
168 
// Writes into MatchWords the ids of index items that presumably start with Prefix.
170  void QueryTokenListWithRightTruncation(const string &Prefix, vector<DWORD> &MatchWords) const;
171 
// Writes into MatchWords the ids of index items that presumably end with Suffix.
173  void QueryTokenListWithLeftTruncation(const string &Suffix, vector<DWORD> &MatchWords) const;
174 
// Writes into MatchWords the ids of index items selected by RegExp. Both flags
// default to false: `negated` presumably inverts the selection and
// `ignore_diacritics` presumably makes matching diacritic-insensitive.
176  void QueryTokenListUsingRegExp(RML_RE &RegExp, vector<DWORD> &MatchWords, bool negated = false,
177  bool ignore_diacritics = false) const;
178 
// Takes no search pattern; presumably writes the ids of all index items into
// MatchWords.
180  void QueryTokenListUniversal(vector<DWORD> &MatchWords) const;
181 
// Presumably writes a dump of the storage to the given C stream.
183  void DumpStorage(FILE *output) const;
184 };
185 
186 
187 #endif
188 
189 /*--- emacs style variables ---
190  * Local Variables:
191  * mode: C++
192  * c-file-style: "ellemtel"
193  * c-basic-offset: 4
194  * tab-width: 8
195  * indent-tabs-mode: nil
196  * End:
197  */
bool operator==(const COutputToken &X) const
Definition: IndexSet.h:43
Definition: ddcMMap.h:226
const char * GetIndexItemStr(const T &W) const
this function returns the string (as indexed by DDC) for an instance of CIndexItem or CItemIndexFo...
Definition: IndexSet.h:90
Definition: StringIndexator.h:121
string m_InterpStr
the interpretation of the token
Definition: IndexSet.h:35
Definition: ddcMMap.h:733
COutputToken()
Definition: IndexSet.h:39
map< string, CShortOccurCache > CShortOccurCacheMap
a type that maps an index string to its occurrences
Definition: ConcCommon.h:477
ddcFileOrMMap m_StorageFile
a file for index storage
Definition: IndexSet.h:59
COutputToken(string WordStr, BYTE Highlight)
Definition: IndexSet.h:41
Definition: morph_const.h:107
string m_Name
the main name of the index set, for example "Token", "MorphPattern", "Thes", "Chunk"...
Definition: IndexSet.h:83
unsigned char BYTE
Definition: utilit.h:94
CIndexSetForLoadingStage is a part of DDC which is used only on the loading stage.
Definition: IndexSetForLoadingStage.h:126
Definition: ConcCommon.h:318
Definition: pcre_rml.h:41
string m_TokenStr
the token itself
Definition: IndexSet.h:33
DWORD CTokenNo
integer type CTokenNo is used to refer to the index of a token in the corpus
Definition: ConcCommon.h:63
string m_ShortName
a short name of the index set, for example "m", "w", "t", "c"
Definition: IndexSet.h:86
uint32_t DWORD
Definition: utilit.h:105
Definition: IndexSetForQueryingStage.h:154
Definition: IndexSet.h:31
Definition: IndexSet.h:57
BYTE m_Highlight
how DDC should highlight this token in the output hit
Definition: IndexSet.h:37