21 #ifndef DDC_CORPUS_FILES_H 22 #define DDC_CORPUS_FILES_H 30 template <
typename OffT_=DWORD>
59 ddcCorpusList(
const string& conFile,
const string& cdataFile=
"",
const string& offsetsFile=
"",
bool useMMap=
false)
61 {
open(conFile,cdataFile,offsetsFile,useMMap); };
77 {
return m_offsets ? m_offsets.
size() : m_strings.size(); };
81 {
return m_offsets ? m_offsets.
empty() : m_strings.empty(); };
85 {
return m_offsets.
opened() || !m_strings.empty(); };
93 {
return m_offsets[idx]; };
97 {
return ((idx+1)>=
size() ? m_cdata.
size() : m_offsets[idx+1]) - 1; };
100 inline void itemGet(IdT idx,
string& str)
const 104 str.assign( &m_cdata[i], j-i );
113 inline operator bool()
const 119 if (!m_offsets)
return m_strings[idx];
122 return string( &m_cdata[i], j-i );
132 m_strings.push_back(value);
138 v.reserve(v.size()+
size());
140 for (IdT idx=0; idx <
size(); ++idx)
141 v.push_back(
operator[](idx));
143 v.insert(v.end(), m_strings.begin(), m_strings.end());
151 ddcLogWarn(
Format(
"WARNING: ddcCorpusList(\"%s\") demoting to std::vector<> implementation", m_cdata.
filename().c_str()));
169 inline IdT
find(
const string& key)
const 173 for (IdT idx=0; idx <
size(); ++idx) {
180 vector<string>::const_iterator it = std::find(m_strings.begin(), m_strings.end(), key);
181 if (it != m_strings.end())
182 return (it - m_strings.begin());
188 inline IdT
rfind(
const string& key)
const 202 vector<string>::const_reverse_iterator it = std::find(m_strings.rbegin(), m_strings.rend(), key);
203 if (it != m_strings.rend())
204 return (m_strings.size()-1) - (it - m_strings.rbegin());
214 ddcCorpusList&
open(
const string& conFile,
const string& cdataFile=
"",
const string& offsetsFile=
"",
bool useMMap=
false)
218 if (!cdataFile.empty() && !offsetsFile.empty() &&
FileExists(cdataFile) &&
FileExists(offsetsFile)) {
220 m_cdata.
open(cdataFile, useMMap);
221 m_offsets.
open(offsetsFile, useMMap);
222 ddcLogDebug(
Format(
"ddcCorpusList[%s]::open(): read %zi item(s) from %s + %s", (useMMap ?
"mmap" :
"slurp"),
size(), cdataFile.c_str(), offsetsFile.c_str()));
225 ddcLogWarn(
"WARNING: ddcCorpusList::open(): loading corpus list '"+conFile+
"' in compatibility mode: please upgrade your index!");
227 ddcLogDebug(
Format(
"ddcCorpusList[compat]::open(): read %zi item(s) from %s",
size(), conFile.c_str()));
238 FILE * fp = fopen(filename.c_str(),
"rb");
240 throw invalid_argument(
Format(
"ddcCorpusList::openCompat(): open failed for '%s': %s'", filename.c_str(), strerror(errno)));
242 char buf[m_bufsize+1];
243 if (!fgets(buf, m_bufsize, fp))
244 throw invalid_argument(
"ddcCorpusList::openCompat(): failed to read version information from '"+filename+
"'");
250 while (fgets(buf, m_bufsize, fp)) {
253 m_strings.push_back(tmp);
Definition: ddcCorpusList.h:31
void append_vector(vector< string > &v) const
populate v with an equivalent vector-implementation (for compile-time compatibility) ...
Definition: ddcCorpusList.h:136
ddcCorpusList & open(const string &conFile, const string &cdataFile="", const string &offsetsFile="", bool useMMap=false)
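open a corpus list: loads cdataFile + offsetsFile (mmap'd or slurped, depending on useMMap) when both are given and exist; otherwise reads conFile in compatibility mode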
Definition: ddcCorpusList.h:214
void clear()
clear() method wraps close()
Definition: ddcCorpusList.h:68
void close()
unmap current file(s), if any
Definition: ddcCorpusList.h:261
string Format(const char *format,...)
Definition: ddcString.cpp:393
string & Trim(string &str)
Definition: utilit.cpp:1762
void push_back(const string &value)
simulate vector::push_back(): used by indexing routines
Definition: ddcCorpusList.h:129
void ensureVec()
force vector-implementation
Definition: ddcCorpusList.h:148
vector< string > m_strings
backwards-compatible vector<string> implementation, for read/write access
Definition: ddcCorpusList.h:44
bool empty() const
STL-esque wrapper.
Definition: ddcMMap.h:268
DWORD IdT
typedef for integer IDs
Definition: ddcCorpusList.h:37
const string & VersionString() const
version string
Definition: ddcCorpusList.h:88
ddcVecFile & open(const std::string &filename, bool useMMap=false)
open the file named by the filename argument (memory-mapped if useMMap is true)
Definition: ddcMMap.h:312
ddcVecFile< char > m_cdata
underlying character data, including initial version string
Definition: ddcCorpusList.h:42
~ddcCorpusList()
default destructor calls close()
Definition: ddcCorpusList.h:64
IdT find(const string &key) const
returns first id of key, or NO_ID if not found (linear search)
Definition: ddcCorpusList.h:169
bool FileExists(const char *FName)
Definition: utilit.cpp:335
OffT itemEndOffset(IdT idx) const
get end-offset of string by index (mmap variant only)
Definition: ddcCorpusList.h:96
OffT itemStartOffset(IdT idx) const
get start-offset of string by index (mmap variant only)
Definition: ddcCorpusList.h:92
size_t size() const
returns the number of enumerated strings
Definition: ddcCorpusList.h:76
ddcCorpusList(const string &conFile, const string &cdataFile="", const string &offsetsFile="", bool useMMap=false)
construct and open
Definition: ddcCorpusList.h:59
void openCompat(const string &filename, bool slurpList=true)
open in compatibility-mode; lifted from CConcIndexator::LoadCorpusFiles()
Definition: ddcCorpusList.h:234
void ClearVector(vector< T > &V)
Definition: utilit.h:493
#define ddcLogWarn(Msg)
Definition: ddcLog.h:106
size_t size() const
returns the number of objects of type T in this vector, or 0
Definition: ddcMMap.h:265
void ClearString(string &S)
Definition: utilit.h:497
size_t m_bufsize
buffer size for fallback (default is old value for MaxBiblStringLen from Bibliography.cpp)
Definition: ddcCorpusList.h:47
bool opened() const
returns true iff m_data is non-NULL
Definition: ddcMMap.h:271
IdT rfind(const string &key) const
returns last id of key, or NO_ID if not found (linear search)
Definition: ddcCorpusList.h:188
bool empty() const
STL-esque wrapper.
Definition: ddcCorpusList.h:80
OffT_ OffT
typedef for offset values
Definition: ddcCorpusList.h:36
bool opened() const
check whether the object is opened
Definition: ddcCorpusList.h:84
void close()
unmap current file (if any)
Definition: ddcMMap.h:330
static const IdT NO_ID
constant returned by string-to-id search methods indicating no match was found
Definition: ddcCorpusList.h:166
string operator[](IdT idx) const
indexing operator (copies)
Definition: ddcCorpusList.h:117
string m_verstr
version string (1st line of *._con file)
Definition: ddcCorpusList.h:45
uint32_t DWORD
Definition: utilit.h:105
const string & filename() const
returns filename of last open(), or NULL
Definition: ddcMMap.h:259
ddcVecFile< OffT > m_offsets
offsets in m_cdata[] of "\n"-terminated filenames (in corpus order)
Definition: ddcCorpusList.h:43
ddcCorpusList()
default constructor
Definition: ddcCorpusList.h:54
#define ddcLogDebug(Msg)
Definition: ddcLog.h:112
void itemGet(IdT idx, string &str) const
pseudo-indexing operator
Definition: ddcCorpusList.h:100