ddc
IndexSetForLoadingStage.h
Go to the documentation of this file.
1 // DDC originally by Alexey Sokirko
2 // Changes and modifications 2011-2018 by Bryan Jurish
3 //
4 // This file is part of DDC.
5 //
6 // DDC is free software: you can redistribute it and/or modify
7 // it under the terms of the GNU Lesser General Public License as published by
8 // the Free Software Foundation, either version 3 of the License, or
9 // (at your option) any later version.
10 //
11 // DDC is distributed in the hope that it will be useful,
12 // but WITHOUT ANY WARRANTY; without even the implied warranty of
13 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 // GNU Lesser General Public License for more details.
15 //
16 // You should have received a copy of the GNU Lesser General Public License
17 // along with DDC. If not, see <http://www.gnu.org/licenses/>.
18 //
19 #ifndef IndexSetForIndexingStage_h
20 #define IndexSetForIndexingStage_h
21 
22 
23 #include "DDCLessOperators.h"
24 #include "../CommonLib/ddcMMap.h"
25 
26 //===========================================================
27 // globals
28 
35 extern bool ddcEnableAnonymousTokens;
36 
37 
39 
47 {
49  vector< CTokenNo >* m_pCurrOccurs;
50 
53 
54 public:
55 
57  size_t GetIndexItemOffset() const
58  {
59  return m_IndexItemOffset;
60  };
62  void SetIndexItemOffset(size_t Value)
63  {
64  m_IndexItemOffset = Value;
65  };
66 
68  vector<CTokenNo>* GetOccurs()
69  {
70  assert (m_pCurrOccurs);
71  return m_pCurrOccurs;
72  };
73 
75  const vector<CTokenNo>* GetOccurs() const
76  {
77  assert (m_pCurrOccurs);
78  return m_pCurrOccurs;
79  };
80  // return the number of occurrences
81  size_t GetOccursSize() const
82  {
83  return m_pCurrOccurs->size();
84  };
86  bool InitOccurs();
87 
89  void FreeOccurs();
90 
92  bool WriteOccurrences(FILE* fp) const;
93 
95  bool CheckOccurrences(CTokenNo EndTokenNo) const;
96 
98  void ClearOccurrences();
99 
101  bool ReadFromTemporalFile (FILE* fp);
102 
104  void WriteToTemporalFile (FILE* fp) const;
105 };
106 
107 
108 
109 
110 
112 
127 {
130 
133 
134 
137 
139  virtual string GetName() const = 0;
140 
142  vector<CItemIndexForLoading > m_MemoryLoadIndexHash[256] ;
143 
145  vector<CItemIndexForLoading > m_InputLoadIndexHash[256] ;
146 
148  bool FindIndexItemInVector (const char* Item, vector<CItemIndexForLoading>::iterator& it, vector<CItemIndexForLoading>& V);
149 
150 // finds an item in the swap index set; if it is not found there, searches for it in the file index set
151  bool FindIndexItem (const char* Item, vector<CItemIndexForLoading>::iterator& it);
152 
153  bool AddToMemoryLoadIndexAndClear(vector<CItemIndexForLoading>& Body, vector<CItemIndexForLoading>& FileIndexSet);
154  int GetHashNo(const char* Str) const;
155 
156 protected:
163 
165  size_t AddItemStrToBuffer(const char* Str, size_t StrLen);
166 
167 
168 public:
171 
174 
175 
177  virtual ~CIndexSetForLoadingStage ();
178 
180  bool CreateTempFiles (string Path);
182  bool DeleteTempFiles();
184  size_t GetMemoryLoadIndexItemsCount() const;
186  bool SaveMemoryLoadIndex();
188  bool AddInputLoadIndexToMemoryLoadIndex();
190  void SortInputAndMemoryIndices();
192  bool AddMemoryLoadIndexToMainLoadIndex();
194  void InsertToInputLoadIndex(const char* Str, size_t StrLen, const vector<CTokenNo>& occurrences);
195 
197  void RollbackLoadIndex(CTokenNo startTrimTokenNo);
198 
200  void PrintLoadIndexStats(FILE *f=stderr) const;
201 };
202 
203 
204 
205 #endif
206 
207 /*--- emacs style variables ---
208  * Local Variables:
209  * mode: C++
210  * c-file-style: "ellemtel"
211  * c-basic-offset: 4
212  * tab-width: 8
213  * indent-tabs-mode: nil
214  * End:
215  */
void WriteToTemporalFile(FILE *fp) const
writes the vector of occurrences to a temporary file
Definition: IndexSetForLoadingStage.cpp:136
string m_CurrOccurTempFileName
a temporary file, where the memory index set is stored
Definition: IndexSetForLoadingStage.h:136
bool ddcEnableAnonymousTokens
Definition: IndexSetForLoadingStage.cpp:36
size_t GetIndexItemOffset() const
gets the reference to the index item
Definition: IndexSetForLoadingStage.h:57
FILE * m_TempStorageFile
a temporary file for index storage
Definition: IndexSetForLoadingStage.h:158
bool CheckOccurrences(CTokenNo EndTokenNo) const
checks the order of occurrences
Definition: IndexSetForLoadingStage.cpp:78
bool ReadFromTemporalFile(FILE *fp)
reads the vector of occurrences from a temporary file
Definition: IndexSetForLoadingStage.cpp:115
vector< CTokenNo > * m_pCurrOccurs
current vector of occurrences
Definition: IndexSetForLoadingStage.h:49
vector< CTokenNo > * GetOccurs()
gets vector of occurrences
Definition: IndexSetForLoadingStage.h:68
bool InitOccurs()
initializes vector of occurrences
Definition: IndexSetForLoadingStage.cpp:44
string m_MainOccurTempFileName
a temporary file, where the main index is stored
Definition: IndexSetForLoadingStage.h:162
bool WriteOccurrences(FILE *fp) const
writes vector of occurrences to a file
Definition: IndexSetForLoadingStage.cpp:72
const vector< CTokenNo > * GetOccurs() const
gets vector of occurrences (const)
Definition: IndexSetForLoadingStage.h:75
bool m_bUseItemStorage
if true, then the program creates and uses a storage for this index
Definition: IndexSetForLoadingStage.h:170
size_t m_IndexItemOffset
a reference to CStringIndexSet::m_StringBuffer
Definition: IndexSetForLoadingStage.h:52
size_t GetOccursSize() const
Definition: IndexSetForLoadingStage.h:81
void FreeOccurs()
deletes vector of occurrences
Definition: IndexSetForLoadingStage.cpp:56
CIndexSetForLoadingStage is a part of DDC that is used only during the loading stage.
Definition: IndexSetForLoadingStage.h:126
CItemIndexForLoading holds all occurrences of one index item (token, morphological pattern, ...) during the indexing stage.
Definition: IndexSetForLoadingStage.h:46
void ClearOccurrences()
clears the vector of occurrences
Definition: IndexSetForLoadingStage.cpp:107
DWORD CTokenNo
integer type CTokenNo is used to refer to the index of a token in the corpus
Definition: ConcCommon.h:63
void SetIndexItemOffset(size_t Value)
sets the reference to the index item
Definition: IndexSetForLoadingStage.h:62
string m_TempStorageFileName
a temporary file, where the index storage is stored
Definition: IndexSetForLoadingStage.h:160
LessIndexString2< CItemIndexForLoading > m_LoadLess2
a less operator for two buffer pointers
Definition: IndexSetForLoadingStage.h:129
ddcVecFile< char > m_StringBuffer
a buffer for storing index strings (compile-time)
Definition: IndexSetForLoadingStage.h:173
LessIndexString1< CItemIndexForLoading > m_LoadLess1
a less operator for a buffer pointer and a const char*
Definition: IndexSetForLoadingStage.h:132