ddc
ddcCorpusList.h
Go to the documentation of this file.
1 //-*- Mode: C++ -*-
2 //
3 // Copyright 2018 by Bryan Jurish
4 //
5 // This file is part of DDC.
6 // DDC originally by Alexey Sokirko
7 //
8 // DDC is free software: you can redistribute it and/or modify
9 // it under the terms of the GNU Lesser General Public License as published by
10 // the Free Software Foundation, either version 3 of the License, or
11 // (at your option) any later version.
12 //
13 // DDC is distributed in the hope that it will be useful,
14 // but WITHOUT ANY WARRANTY; without even the implied warranty of
15 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 // GNU Lesser General Public License for more details.
17 //
18 // You should have received a copy of the GNU Lesser General Public License
19 // along with DDC. If not, see <http://www.gnu.org/licenses/>.
20 //
21 #ifndef DDC_CORPUS_FILES_H
22 #define DDC_CORPUS_FILES_H
23 
24 #include "ddcMMap.h"
25 #include "ddcVersion.h"
26 
27 //======================================================================
28 // ddcCorpusList
30 template <typename OffT_=DWORD>
32 {
33 public:
34  //----------------------------------------------------------
35  // typedefs
36  typedef OffT_ OffT;
37  typedef DWORD IdT;
38 
39 public:
40  //----------------------------------------------------------
41  // data
44  vector<string> m_strings;
45  string m_verstr;
46 
47  size_t m_bufsize;
48 
49 public:
50  //----------------------------------------------------------
51  // constructors etc.
52 
55  : m_bufsize(20000)
56  {};
57 
59  ddcCorpusList(const string& conFile, const string& cdataFile="", const string& offsetsFile="", bool useMMap=false)
60  : m_bufsize(20000)
61  { open(conFile,cdataFile,offsetsFile,useMMap); };
62 
65  { close(); };
66 
68  inline void clear()
69  { close(); };
70 
71 public:
72  //----------------------------------------------------------
73  // accessors & properties
74 
76  inline size_t size() const
77  { return m_offsets ? m_offsets.size() : m_strings.size(); };
78 
80  inline bool empty() const
81  { return m_offsets ? m_offsets.empty() : m_strings.empty(); };
82 
84  inline bool opened() const
85  { return m_offsets.opened() || !m_strings.empty(); };
86 
88  const string& VersionString() const
89  { return m_verstr; };
90 
92  inline OffT itemStartOffset(IdT idx) const
93  { return m_offsets[idx]; };
94 
96  inline OffT itemEndOffset(IdT idx) const
97  { return ((idx+1)>=size() ? m_cdata.size() : m_offsets[idx+1]) - 1; };
98 
100  inline void itemGet(IdT idx, string& str) const
101  {
102  OffT i = itemStartOffset(idx);
103  OffT j = itemEndOffset(idx);
104  str.assign( &m_cdata[i], j-i );
105  };
106 
107 
108 public:
109  //----------------------------------------------------------
110  // operators
111 
113  inline operator bool() const
114  { return opened(); };
115 
117  inline string operator[](IdT idx) const
118  {
119  if (!m_offsets) return m_strings[idx];
120  OffT i = itemStartOffset(idx);
121  OffT j = itemEndOffset(idx);
122  return string( &m_cdata[i], j-i );
123  };
124 
125  //----------------------------------------------------------
126  // read/write access (auto-demote)
127 
129  inline void push_back(const string& value)
130  {
131  ensureVec();
132  m_strings.push_back(value);
133  };
134 
136  void append_vector(vector<string>& v) const
137  {
138  v.reserve(v.size()+size());
139  if (m_offsets) {
140  for (IdT idx=0; idx < size(); ++idx)
141  v.push_back(operator[](idx));
142  } else {
143  v.insert(v.end(), m_strings.begin(), m_strings.end());
144  }
145  };
146 
148  inline void ensureVec()
149  {
150  if (m_offsets) {
151  ddcLogWarn(Format("WARNING: ddcCorpusList(\"%s\") demoting to std::vector<> implementation", m_cdata.filename().c_str()));
152  ClearVector(m_strings);
153  append_vector(m_strings);
154  m_offsets.close();
155  m_cdata.close();
156  }
157  };
158 
159 public:
160  //----------------------------------------------------------
161  // search
162 
164  //-- re numeric_limits<>::max(): see NO_ID comments in ddcStringEnum.h
165  //static const IdT NO_ID = std::numeric_limits<IdT>::max();
166  static const IdT NO_ID = DWORD_MAX;
167 
169  inline IdT find(const string& key) const
170  {
171  if (m_offsets) {
172  string tmp;
173  for (IdT idx=0; idx < size(); ++idx) {
174  itemGet(idx,tmp);
175  if (tmp == key)
176  return idx;
177  }
178  }
179  else {
180  vector<string>::const_iterator it = std::find(m_strings.begin(), m_strings.end(), key);
181  if (it != m_strings.end())
182  return (it - m_strings.begin());
183  }
184  return NO_ID;
185  };
186 
188  inline IdT rfind(const string& key) const
189  {
190  if (m_offsets) {
191  if (empty()) return NO_ID;
192  string tmp;
193  IdT idx = size()-1;
194  do {
195  itemGet(idx,tmp);
196  if (tmp == key)
197  return idx;
198  --idx;
199  } while (idx != 0);
200  }
201  else {
202  vector<string>::const_reverse_iterator it = std::find(m_strings.rbegin(), m_strings.rend(), key);
203  if (it != m_strings.rend())
204  return (m_strings.size()-1) - (it - m_strings.rbegin());
205  }
206  return NO_ID;
207  };
208 
209 public:
210  //----------------------------------------------------------
211  // guts
212 
214  ddcCorpusList& open(const string& conFile, const string& cdataFile="", const string& offsetsFile="", bool useMMap=false)
215  {
216  if (opened()) close();
217 
218  if (!cdataFile.empty() && !offsetsFile.empty() && FileExists(cdataFile) && FileExists(offsetsFile)) {
219  openCompat(conFile,false);
220  m_cdata.open(cdataFile, useMMap);
221  m_offsets.open(offsetsFile, useMMap);
222  ddcLogDebug(Format("ddcCorpusList[%s]::open(): read %zi item(s) from %s + %s", (useMMap ? "mmap" : "slurp"), size(), cdataFile.c_str(), offsetsFile.c_str()));
223  }
224  else {
225  ddcLogWarn("WARNING: ddcCorpusList::open(): loading corpus list '"+conFile+"' in compatibility mode: please upgrade your index!");
226  openCompat(conFile,true);
227  ddcLogDebug(Format("ddcCorpusList[compat]::open(): read %zi item(s) from %s", size(), conFile.c_str()));
228  }
229  //ddc_format_version_check(m_verstr.c_str(), cdataFile.c_str());
230  return *this;
231  };
232 
234  void openCompat(const string& filename, bool slurpList=true)
235  {
236  if (opened()) close();
237 
238  FILE * fp = fopen(filename.c_str(), "rb");
239  if (!fp)
240  throw invalid_argument(Format("ddcCorpusList::openCompat(): open failed for '%s': %s'", filename.c_str(), strerror(errno)));
241 
242  char buf[m_bufsize+1];
243  if (!fgets(buf, m_bufsize, fp))
244  throw invalid_argument("ddcCorpusList::openCompat(): failed to read version information from '"+filename+"'");
245  m_verstr = buf;
246  Trim(m_verstr);
247 
248  if (slurpList) {
249  string tmp;
250  while (fgets(buf, m_bufsize, fp)) {
251  tmp = buf;
252  Trim(tmp);
253  m_strings.push_back(tmp);
254  }
255  }
256 
257  fclose(fp);
258  };
259 
261  void close()
262  {
263  m_cdata.close();
264  m_offsets.close();
265  ClearVector(m_strings);
266  ClearString(m_verstr);
267  };
268 
269 };
270 
271 
272 #endif /* DDC_CORPUS_FILES_H */
273 
274 /*--- emacs style variables ---
275  * Local Variables:
276  * mode: C++
277  * c-file-style: "ellemtel"
278  * c-basic-offset: 4
279  * tab-width: 8
280  * indent-tabs-mode: nil
281  * End:
282  */
Definition: ddcCorpusList.h:31
void append_vector(vector< string > &v) const
populate v with an equivalent vector-implementation (for compile-time compatibility) ...
Definition: ddcCorpusList.h:136
ddcCorpusList & open(const string &conFile, const string &cdataFile="", const string &offsetsFile="", bool useMMap=false)
open the corpus list: uses cdataFile + offsetsFile if both exist, otherwise loads conFile in compatibility mode
Definition: ddcCorpusList.h:214
void clear()
clear() method wraps close()
Definition: ddcCorpusList.h:68
void close()
unmap current file(s), if any
Definition: ddcCorpusList.h:261
string Format(const char *format,...)
Definition: ddcString.cpp:393
string & Trim(string &str)
Definition: utilit.cpp:1762
void push_back(const string &value)
simulate vector::push_back(): used by indexing routines
Definition: ddcCorpusList.h:129
void ensureVec()
force vector-implementation
Definition: ddcCorpusList.h:148
vector< string > m_strings
backwards-compatible vector<string> implementation, for read/write access
Definition: ddcCorpusList.h:44
bool empty() const
STL-esque wrapper.
Definition: ddcMMap.h:268
DWORD IdT
typedef for integer IDs
Definition: ddcCorpusList.h:37
const string & VersionString() const
version string
Definition: ddcCorpusList.h:88
ddcVecFile & open(const std::string &filename, bool useMMap=false)
map a named file filename
Definition: ddcMMap.h:312
ddcVecFile< char > m_cdata
underlying character data, including initial version string
Definition: ddcCorpusList.h:42
~ddcCorpusList()
default destructor calls close()
Definition: ddcCorpusList.h:64
IdT find(const string &key) const
returns first id of key, or NO_ID if not found (linear search)
Definition: ddcCorpusList.h:169
bool FileExists(const char *FName)
Definition: utilit.cpp:335
OffT itemEndOffset(IdT idx) const
get end-offset of string by index (mmap variant only)
Definition: ddcCorpusList.h:96
OffT itemStartOffset(IdT idx) const
get start-offset of string by index (mmap variant only)
Definition: ddcCorpusList.h:92
size_t size() const
returns the number of enumerated strings
Definition: ddcCorpusList.h:76
ddcCorpusList(const string &conFile, const string &cdataFile="", const string &offsetsFile="", bool useMMap=false)
construct and open
Definition: ddcCorpusList.h:59
void openCompat(const string &filename, bool slurpList=true)
open in compatibility-mode; lifted from CConcIndexator::LoadCorpusFiles()
Definition: ddcCorpusList.h:234
void ClearVector(vector< T > &V)
Definition: utilit.h:493
#define ddcLogWarn(Msg)
Definition: ddcLog.h:106
size_t size() const
returns the number of objects of type T in this vector, or 0
Definition: ddcMMap.h:265
void ClearString(string &S)
Definition: utilit.h:497
size_t m_bufsize
buffer size for fallback (default is old value for MaxBiblStringLen from Bibliography.cpp)
Definition: ddcCorpusList.h:47
bool opened() const
returns true iff m_data is non-NULL
Definition: ddcMMap.h:271
IdT rfind(const string &key) const
returns last id of key, or NO_ID if not found (linear search)
Definition: ddcCorpusList.h:188
bool empty() const
STL-esque wrapper.
Definition: ddcCorpusList.h:80
OffT_ OffT
typedef for offset values
Definition: ddcCorpusList.h:36
bool opened() const
check whether the object is opened
Definition: ddcCorpusList.h:84
void close()
unmap current file (if any)
Definition: ddcMMap.h:330
static const IdT NO_ID
constant returned by string-to-id search methods indicating no match was found
Definition: ddcCorpusList.h:166
string operator[](IdT idx) const
indexing operator (copies)
Definition: ddcCorpusList.h:117
string m_verstr
version string (1st line of *._con file)
Definition: ddcCorpusList.h:45
uint32_t DWORD
Definition: utilit.h:105
const string & filename() const
returns filename of last open(), or NULL
Definition: ddcMMap.h:259
ddcVecFile< OffT > m_offsets
offsets in m_cdata[] of "\n"-terminated filenames (in corpus order)
Definition: ddcCorpusList.h:43
ddcCorpusList()
default constructor
Definition: ddcCorpusList.h:54
#define ddcLogDebug(Msg)
Definition: ddcLog.h:112
void itemGet(IdT idx, string &str) const
pseudo-indexing operator
Definition: ddcCorpusList.h:100