ddc
StringIndexator.h
Go to the documentation of this file.
1 //-*- Mode: C++ -*-
2 //
3 // DDC originally by Alexey Sokirko
4 // Changes and modifications 2011-2020 by Bryan Jurish
5 //
6 // This file is part of DDC.
7 //
8 // DDC is free software: you can redistribute it and/or modify
9 // it under the terms of the GNU Lesser General Public License as published by
10 // the Free Software Foundation, either version 3 of the License, or
11 // (at your option) any later version.
12 //
13 // DDC is distributed in the hope that it will be useful,
14 // but WITHOUT ANY WARRANTY; without even the implied warranty of
15 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 // GNU Lesser General Public License for more details.
17 //
18 // You should have received a copy of the GNU Lesser General Public License
19 // along with DDC. If not, see <http://www.gnu.org/licenses/>.
20 //
21 #ifndef StringIndexator_h
22 #define StringIndexator_h
23 
24 #include "../ConcordLib/IndexSet.h"
25 
108 //const char ddc_archive_stub[] = "ddc_archive_stub";
109 const char ddc_archive_stub[] = "";
110 
112  vector<CStringIndexSet*> ColumnMap;
113  vector<CStringIndexSet*> AbsentIndices;
115 };
116 
122 
123 protected:
124 
126  vector<CTokenNo> m_SearchPeriods;
127 
129  bool RegisterChunkIndex();
131  string GetSearchPeriodsFileName() const;
133  bool DestroyIndices();
135  void ReadIndicesFromTheDisk();
137  void ClearStringIndices();
139  void IndexOneToken (CTokenIndexator* document, const char* Line, bool tryFixErrors=true);
140 
142  // \param MaxLen maximum length in bytes of \a Out, including terminating NUL byte
143  // \param nCols number of TAB-separated columns to write to \a Out ("globalFieldDelimeter" (sic))
144  // \param InputLine TAB-separated input token line string with length greater than or equal to \a MaxLen
145  // \param Out output string, should already be allocated with at least \a MaxLen bytes
146  // \details
147  // \li called implicitly by IndexOneToken() when it encounters a too-long line and \a tryFixErrors is true
148  // \li truncates only "greedy" columns (those using more than MaxLen/nCols bytes)
149  // \li truncates each "greedy" column independently, in proportion to its "greediness" (the number of bytes by which its length exceeds MaxLen/nCols)
150  void IndexTokenFixLongColumns(const size_t MaxLen, const size_t nCols, const char *InputLine, char *Out);
151 
152 public:
154  typedef map<string,string> IndexAliasMap;
155 
157  typedef map<string,CStringIndexSet*> IndexMap;
158 
159 public:
161  string m_Path;
162 
165 
167  vector<CStringIndexSet*> m_Indices;
168 
170  IndexAliasMap m_IndexAlias;
171 
173  IndexMap m_IndexMap;
174 
177 
180 
182  ~CStringIndexator();
183 
185  bool RegisterStringIndices(const string& IndicesStr);
186 
188  bool RegisterIndexAliases(const string& IndexAliasStr);
189 
191  bool RegisterIndexAlias(const string& AliasFrom, const string& AliasTo);
192 
194  void RegisterIndexAlias(const string& AliasFrom, CStringIndexSet* idx);
195 
197  void SetPath(string Path);
199  string GetIndicesString() const;
201  string GetIndexAliasString() const;
202 
204  size_t GetSearchPeriodsCount() const;
206  const CTokenNo& GetSearchPeriod(size_t i) const { return m_SearchPeriods[i]; }; // returns a const reference to element i of m_SearchPeriods; i is not bounds-checked, so callers should keep it below GetSearchPeriodsCount() (presumably the vector size -- TODO confirm)
207 
209  bool StartIndexing(string Path);
211  void TerminateIndexing();
213  bool FinalSaveAllIndices(bool bAfterLoading);
215  bool AddInputLoadIndexToMemoryLoadIndex();
217  bool AddMemoryLoadIndexToMainLoadIndex();
219  bool SaveMemoryLoadIndex();
221  CStringIndexSet* GetIndexByName(const string& Name);
223  CStringIndexSet* GetIndexByNameOrShortName(const string& Name);
225  CStringIndexSet* GetIndexByAlias(const string &Alias) const;
227  CStringIndexSet* GetTokenIndex();
229  const CStringIndexSet* GetTokenIndex() const;
230 
231 };
232 
233 
234 
235 
236 
237 #endif
238 
239 /*--- emacs style variables ---
240  * Local Variables:
241  * mode: C++
242  * c-file-style: "ellemtel"
243  * c-basic-offset: 4
244  * tab-width: 8
245  * indent-tabs-mode: nil
246  * End:
247  */
const CTokenNo & GetSearchPeriod(size_t i) const
get a corpus period by an index
Definition: StringIndexator.h:206
void IndexOneToken(const size_t MaxIndexLineLength, size_t ncols, const char *InputLine)
Definition: toktrim.cc:63
Definition: StringIndexator.h:111
bool m_bMemoryMap
whether to directly mmap() index file data (default=false)
Definition: StringIndexator.h:164
Definition: StringIndexator.h:121
vector< CStringIndexSet * > ColumnMap
Definition: StringIndexator.h:112
string m_Path
where all indices are stored
Definition: StringIndexator.h:161
size_t m_MaxRegExpExpansionSize
the maximal number of index items which can be included in an expansion set of one regular expression...
Definition: StringIndexator.h:176
const char ddc_archive_stub[]
Definition: StringIndexator.h:109
vector< CStringIndexSet * > m_Indices
the registered indices, by positional index
Definition: StringIndexator.h:167
IndexAliasMap m_IndexAlias
declared index aliases (FROM -> TO); not really used at runtime
Definition: StringIndexator.h:170
vector< CTokenNo > m_SearchPeriods
search periods of the corpus
Definition: StringIndexator.h:126
map< string, CStringIndexSet * > IndexMap
typedef for index symbol table
Definition: StringIndexator.h:157
Definition: morph_const.h:107
DWORD CTokenNo
integer type CTokenNo is used to refer to the index of a token in the corpus
Definition: ConcCommon.h:63
IndexMap m_IndexMap
all registered indices, keyed by long-name, short-name, or label (LABEL -> INDEX) ...
Definition: StringIndexator.h:173
CTokenNo CorpusEndTokenNo
Definition: StringIndexator.h:114
vector< CStringIndexSet * > AbsentIndices
Definition: StringIndexator.h:113
Definition: IndexSet.h:57
map< string, string > IndexAliasMap
typedef for index alias maps
Definition: StringIndexator.h:154
CStringIndexSet * m_pChunkIndex
a quick reference to a chunk index, if CConcIndexator::m_bIndexChunks is on, otherwise null ...
Definition: StringIndexator.h:179