ddc
ddcStringEnum.h
Go to the documentation of this file.
1 //-*- Mode: C++ -*-
2 //
3 // Copyright 2018 by Bryan Jurish
4 //
5 // This file is part of DDC.
6 // DDC originally by Alexey Sokirko
7 //
8 // DDC is free software: you can redistribute it and/or modify
9 // it under the terms of the GNU Lesser General Public License as published by
10 // the Free Software Foundation, either version 3 of the License, or
11 // (at your option) any later version.
12 //
13 // DDC is distributed in the hope that it will be useful,
14 // but WITHOUT ANY WARRANTY; without even the implied warranty of
15 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 // GNU Lesser General Public License for more details.
17 //
18 // You should have received a copy of the GNU Lesser General Public License
19 // along with DDC. If not, see <http://www.gnu.org/licenses/>.
20 //
21 #ifndef DDC_STRING_ENUM_H
22 #define DDC_STRING_ENUM_H
23 
24 #include "ddcMMap.h"
25 #include <limits>
26 
27 //======================================================================
28 // ddcStringEnum
30 template <typename OffT_=DWORD>
32 {
33 public:
34  //----------------------------------------------------------
35  // typedefs
36  typedef OffT_ OffT;
37  typedef DWORD IdT;
38 
39 public:
40  //----------------------------------------------------------
41  // data
44  vector<string> m_strings;
45 
46  size_t m_bufsize;
47 
48 public:
49  //----------------------------------------------------------
50  // constructors etc.
51 
54  : m_bufsize(20000)
55  {};
56 
58  ddcStringEnum(const string& cdataFile, const string& offsetsFile, const string& fallbackFile="", bool useMMap=false)
59  : m_bufsize(20000)
60  { open(cdataFile,offsetsFile,fallbackFile,useMMap); };
61 
64  { close(); };
65 
67  inline void clear()
68  { close(); };
69 
70 public:
71  //----------------------------------------------------------
72  // accessors & properties
73 
75  inline size_t size() const
76  { return m_offsets ? m_offsets.size() : m_strings.size(); };
77 
79  inline bool empty() const
80  { return m_offsets ? m_offsets.empty() : m_strings.empty(); };
81 
83  inline bool opened() const
84  { return m_offsets.opened() || !m_strings.empty(); };
85 
86 public:
87  //----------------------------------------------------------
88  // typecast operators
89 
91  inline operator bool() const
92  { return opened(); };
93 
95  inline const char* operator[](size_t idx) const
96  { return m_offsets ? (m_cdata.m_data+m_offsets.m_data[idx]) : m_strings[idx].c_str(); };
97 
98  //----------------------------------------------------------
99  // read/write access (auto-demote)
100 
102  inline void push_back(const string& value)
103  {
104  ensureVec();
105  m_strings.push_back(value);
106  };
107 
109  void to_vector(vector<string>& v) const
110  {
111  ClearVector(v);
112  v.reserve(size());
113  if (m_offsets) {
114  for (const OffT* oi=m_offsets.begin(); oi != m_offsets.end(); ++oi)
115  v.push_back( m_cdata.m_data + *oi );
116  } else {
117  v = m_strings;
118  }
119  };
120 
122  inline void ensureVec()
123  {
124  if (m_offsets) {
125  ddcLogWarn(Format("WARNING: ddcStringEnum(\"%s\") demoting to std::vector<> implementation", m_cdata.filename().c_str()));
126  to_vector(m_strings);
127  m_offsets.close();
128  m_cdata.close();
129  }
130  };
131 
132 public:
133  //----------------------------------------------------------
134  // search
135 
136 
138  //-- numeric_limits<>::max():
139  // + g++ 5.4.0 (ubuntu 16.04.3 / kira) chokes on initialization via std::numeric_limits<IdT>::max() with:
140  // "error: a function call cannot appear in a constant-expression"
141  // + we don't have IdT parameterized yet anyway (it's all 32-bit at this point), so we can just use DWORD_MAX
142  //static const IdT NO_ID = std::numeric_limits<IdT>::max();
143  static const IdT NO_ID = DWORD_MAX;
144 
146  struct IdKeyLess {
147  const char *m_buf;
148 
149  IdKeyLess(const char *buf)
150  : m_buf(buf)
151  {};
153  : m_buf(se.m_cdata.m_data)
154  {};
155 
156 
157  inline bool operator()(const OffT off, const string& key)
158  { return (m_buf+off) < key; };
159  inline bool operator()(const string& key, const OffT off)
160  { return key < (m_buf+off); };
161  };
162 
164  IdT lower_bound(const string& key) const
165  {
166  if (m_offsets) {
167  //-- ddc-v2.1.12 version
168  typename ddcVecFile<OffT>::const_iterator lb = std::lower_bound(m_offsets.begin(), m_offsets.end(), key, IdKeyLess(*this));
169  if (lb != m_offsets.end()) return (lb - m_offsets.begin());
170  } else {
171  //-- backwards-compatible implementation using m_strings
172  vector<string>::const_iterator lb = std::lower_bound(m_strings.begin(), m_strings.end(), key);
173  if (lb != m_strings.end()) return (lb - m_strings.begin());
174  }
175  return NO_ID;
176  };
177 
179  inline IdT upper_bound(const string& key) const
180  {
181  if (m_offsets) {
182  //-- ddc-v2.1.12 version
183  typename ddcVecFile<OffT>::const_iterator ub = std::upper_bound(m_offsets.begin(), m_offsets.end(), key, IdKeyLess(*this));
184  if (ub != m_offsets.end()) return (ub - m_offsets.begin());
185  } else {
186  //-- backwards-compatible implementation using m_strings
187  vector<string>::const_iterator ub = std::upper_bound(m_strings.begin(), m_strings.end(), key);
188  if (ub != m_strings.end()) return (ub - m_strings.begin());
189  }
190  return NO_ID;
191  };
192 
194  inline IdT find(const string& key) const
195  {
196  IdT lbid = lower_bound(key);
197  return (lbid != NO_ID && operator[](lbid) == key) ? lbid : NO_ID;
198  };
199 
200 public:
201  //----------------------------------------------------------
202  // guts
203 
205  ddcStringEnum& open(const string& cdataFile, const string& offsetsFile, const string& fallbackFile, bool useMMap=false)
206  {
207  if (opened()) close();
208 
209  if (FileExists(cdataFile.c_str()) && FileExists(offsetsFile.c_str())) {
210  m_cdata.open(cdataFile, useMMap);
211  m_offsets.open(offsetsFile, useMMap);
212  ddcLogDebug(Format("ddcStringEnum[%s]::open(): read %zi item(s) from %s + %s", (useMMap ? "mmap" : "slurp"), size(), cdataFile.c_str(), offsetsFile.c_str()));
213  }
214  else if (!fallbackFile.empty() && FileExists(fallbackFile.c_str())) {
215  ddcLogWarn("WARNING: ddcStringEnum::open(): loading fallback file '"+fallbackFile+"' in slurp mode: please upgrade your index!");
216  openCompat(fallbackFile);
217  ddcLogDebug(Format("ddcStringEnum[compat]::open(): read %zi item(s) from %s", size(), fallbackFile.c_str()));
218  } else {
219  throw invalid_argument("ddcStringEnum::open(\""+cdataFile+"\",\""+offsetsFile+"\",\""+fallbackFile+"\"): no file(s) found!");
220  }
221  return *this;
222  };
223 
225  void openCompat(const string& filename)
226  {
227  if (opened()) close();
228 
229  FILE * fp = fopen(filename.c_str(), "rb");
230  if (!fp)
231  throw invalid_argument(Format("ddcStringEnum::openCompat(): open failed for '%s': %s'", filename.c_str(), strerror(errno)));
232 
233  char buf[m_bufsize+1];
234  while (fgets(buf, m_bufsize, fp)) {
235  string q(buf);
236  Trim(q);
237  m_strings.push_back(q);
238  }
239 
240  fclose(fp);
241  };
242 
244  void close()
245  {
246  m_cdata.close();
247  m_offsets.close();
248  ClearVector(m_strings);
249  };
250 
251 };
252 
253 
254 
255 #endif /* DDC_STRING_ENUM_H */
256 
257 /*--- emacs style variables ---
258  * Local Variables:
259  * mode: C++
260  * c-file-style: "ellemtel"
261  * c-basic-offset: 4
262  * tab-width: 8
263  * indent-tabs-mode: nil
264  * End:
265  */
ddcVecFile< char > m_cdata
underlying character data
Definition: ddcStringEnum.h:42
bool opened() const
check whether the object is opened
Definition: ddcStringEnum.h:83
size_t m_bufsize
buffer size for fallback loading (default=20000: old value for MaxBiblStringLen from Bibliography...
Definition: ddcStringEnum.h:46
void openCompat(const string &filename)
open in compatibility-mode; lifted from CConcXml::CFreeBiblStringIndex::ReadBiblStringItems (vector<s...
Definition: ddcStringEnum.h:225
string Format(const char *format,...)
Definition: ddcString.cpp:393
string & Trim(string &str)
Definition: utilit.cpp:1762
void push_back(const string &value)
simulate vector::push_back(): used by indexing routines
Definition: ddcStringEnum.h:102
void to_vector(vector< string > &v) const
populate v with an equivalent vector-implementation (for compile-time compatibility) ...
Definition: ddcStringEnum.h:109
IdT upper_bound(const string &key) const
returns id of upper-bound for key, or NO_ID if not found
Definition: ddcStringEnum.h:179
bool empty() const
STL-esque wrapper.
Definition: ddcMMap.h:268
ddcStringEnum(const string &cdataFile, const string &offsetsFile, const string &fallbackFile="", bool useMMap=false)
construct and open
Definition: ddcStringEnum.h:58
ddcVecFile & open(const std::string &filename, bool useMMap=false)
map a named file filename
Definition: ddcMMap.h:312
void close()
unmap current file(s), if any
Definition: ddcStringEnum.h:244
bool FileExists(const char *FName)
Definition: utilit.cpp:335
size_t size() const
returns the number of enumerated strings
Definition: ddcStringEnum.h:75
IdKeyLess(const ddcStringEnum &se)
Definition: ddcStringEnum.h:152
ddcVecFile< OffT > m_offsets
offsets in m_cdata[] of NUL-terminated item strings; associated strings must be sorted in lexicograph...
Definition: ddcStringEnum.h:43
IdT lower_bound(const string &key) const
returns id of lower-bound for key, or NO_ID if not found
Definition: ddcStringEnum.h:164
void ensureVec()
force vector-implementation
Definition: ddcStringEnum.h:122
bool operator()(const OffT off, const string &key)
Definition: ddcStringEnum.h:157
void ClearVector(vector< T > &V)
Definition: utilit.h:493
#define ddcLogWarn(Msg)
Definition: ddcLog.h:106
size_t size() const
returns the number of objects of type T in this vector, or 0
Definition: ddcMMap.h:265
bool operator()(const string &key, const OffT off)
Definition: ddcStringEnum.h:159
bool opened() const
returns true iff m_data is non-NULL
Definition: ddcMMap.h:271
const char * m_buf
Definition: ddcStringEnum.h:147
comparison helper struct for string-to-id search
Definition: ddcStringEnum.h:146
~ddcStringEnum()
default destructor calls close()
Definition: ddcStringEnum.h:63
bool empty() const
STL-esque wrapper.
Definition: ddcStringEnum.h:79
T * m_data
data (pointer to m_mmap or m_vec data)
Definition: ddcMMap.h:236
void close()
unmap current file (if any)
Definition: ddcMMap.h:330
vector< string > m_strings
backwards-compatible vector<string> implementation, for read/write access
Definition: ddcStringEnum.h:44
OffT_ OffT
typedef for offset values
Definition: ddcStringEnum.h:36
Definition: ddcStringEnum.h:31
ddcStringEnum & open(const string &cdataFile, const string &offsetsFile, const string &fallbackFile, bool useMMap=false)
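open from cdataFile + offsetsFile if both exist, otherwise from fallbackFile in compatibility mode; throws invalid_argument if no file is found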
Definition: ddcStringEnum.h:205
IdT find(const string &key) const
returns id of key, or NO_ID if not found; wraps lower_bound()
Definition: ddcStringEnum.h:194
DWORD IdT
typedef for integer IDs
Definition: ddcStringEnum.h:37
uint32_t DWORD
Definition: utilit.h:105
IdKeyLess(const char *buf)
Definition: ddcStringEnum.h:149
const string & filename() const
returns filename of last open(), or NULL
Definition: ddcMMap.h:259
static const IdT NO_ID
constant returned by string-to-id search methods indicating no match was found
Definition: ddcStringEnum.h:143
iterator begin()
Definition: ddcMMap.h:300
void clear()
clear() method wraps close()
Definition: ddcStringEnum.h:67
#define ddcLogDebug(Msg)
Definition: ddcLog.h:112
ddcStringEnum()
default constructor
Definition: ddcStringEnum.h:53
iterator end()
Definition: ddcMMap.h:302
const char * operator[](size_t idx) const
indexing operator
Definition: ddcStringEnum.h:95