ddc
ConcCommon.h
Go to the documentation of this file.
1 //-*- Mode: C++ -*-
2 //
3 // This file is part of DDC.
4 //
5 // DDC is free software: you can redistribute it and/or modify
6 // it under the terms of the GNU Lesser General Public License as published by
7 // the Free Software Foundation, either version 3 of the License, or
8 // (at your option) any later version.
9 //
10 // DDC is distributed in the hope that it will be useful,
11 // but WITHOUT ANY WARRANTY; without even the implied warranty of
12 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 // GNU Lesser General Public License for more details.
14 //
15 // You should have received a copy of the GNU Lesser General Public License
16 // along with DDC. If not, see <http://www.gnu.org/licenses/>.
17 //
18 #ifndef __ConcCommon_H_
19 #define __ConcCommon_H_
20 
27 #include "../CommonLib/utilit.h"
28 #include "../CommonLib/ddcLog.h"
29 #include "list"
30 #include "limits.h"
31 
32 
33 #include "../GraphanLib/GraphmatFile.h"
34 #include "../LemmatizerLib/Lemmatizers.h"
35 
36 #include "../AgramtabLib/EngGramTab.h"
37 #include "../AgramtabLib/RusGramTab.h"
38 #include "../AgramtabLib/GerGramTab.h"
39 #include "../CommonLib/DDC_common.h"
40 #include "../CommonLib/ddcObject.h"
41 #include "../tinyxml/tinyxml.h"
42 
43 #include "../ConcordLib/CCurl.h"
44 
/// Global initialization: calls ddcInitLocale() and then ddcInitCurl().
46 inline void ddcInitGlobal(void) {
47  ddcInitLocale();
48  ddcInitCurl();
49 }
50 
/// Globally defined delimiter used to separate the fields of one record (a tab).
52 const char globalFieldDelimeter = '\t';
/// Globally defined XML tag used to separate records for one of the
/// CConcIndexator::m_IndexType settings (generated description is truncated — confirm which).
54 const string PredefinedTableLineTag = "l";
/// Globally defined index name for chunks.
56 const string ChunkIndexName = "chunk";
/// Globally defined break collection name for corpus files.
58 const string PredefinedFileBreakName = "file";
/// Globally defined break collection name for text areas.
60 const string PredefinedTextAreaBreakName = "textarea";
61 
/// Integer type used to refer to the index of a token in the corpus.
63 typedef DWORD CTokenNo;
64 
/// Integer type used to refer to a single document (file) in the corpus.
66 typedef DWORD CFileNo;
67 
/// Upper bound on the size of a CShortOccurCache.
/// NOTE(review): the generated docs phrase this as CShortOccurCache::m_Data.size(),
/// but no m_Data member is visible in this listing — confirm which container is meant.
69 const size_t MaxShortOccurCacheSize = 1000000;
70 
// NOTE(review): presumably the maximum length of a sort key; its use is not visible here — TODO confirm.
73 #define DDC_SORTKEY_MAXLEN 256
74 
// NOTE(review): presumably the maximum length of a bibliographic string — TODO confirm against callers.
75 const size_t MaxBiblStringLen = 20000;
76 
86 {
89  {
90  // the start index of subvector in m_OccurrencesBody
92  // the length of subvector in m_OccurrencesBody
94 
95 
// Records a subvector of m_OccurrencesBody by its start index and its length.
// VectorStartOffset: start index of the subvector; VectorLength: number of elements in it.
96  CDataReference (size_t VectorStartOffset, size_t VectorLength)
97  {
98  m_VectorStartOffset = VectorStartOffset;
99  m_VectorLength = VectorLength;
100  };
101  };
103  //map<CTokenNo, CDataReference> m_IndexItemNo2Occurrences;
104  vector <CDataReference> m_IndexItemNo2Occurrences;
105 
107  vector<CTokenNo> m_OccurrencesBody;
108 public:
109  void Clear();
110  size_t AddNewIndexItemNoToCache(const CTokenNo* pStart, const CTokenNo* pEnd );
111  const CTokenNo* GetOccurrencesFromCache(const int CacheId, DWORD& Length) const;
112  bool CouldContainMore() const;
113 
114 };
115 
121  NoSort = 0,
166 };
167 
169 { return e==LessByPruneKey || e==GreaterByPruneKey; }
170 
174  hsoNone = 0,
180 };
181 
183 { return e==hsoNone; }
184 
186 { return e >= hsoAscendingCountKeys && e <= hsoDescendingCountValues; }
187 
190 
193 
// Table of HitSortsCount symbolic names, one per sort/filter type.
// NOTE(review): presumably indexed by HitSortEnum value, so the order here must
// match the enum declaration (not visible in this listing) — TODO confirm.
// Being `static` in a header, every translation unit gets its own copy.
195 static const char *HitSortEnumNames[HitSortsCount] =
196 {
197  "NoSort",
198  "LessByDate",
199  "GreaterByDate",
200  "LessBySize",
201  "GreaterBySize",
202  "LessByFreeBiblField",
203  "GreaterByFreeBiblField",
204  "LessByRank",
205  "GreaterByRank",
206  "LessByMiddleContext",
207  "GreaterByMiddleContext",
208  "LessByLeftContext",
209  "GreaterByLeftContext",
210  "LessByRightContext",
211  "GreaterByRightContext",
212  "RandomSort",
213  "LessByCountKey",
214  "GreaterByCountKey",
215  "LessByCountValue",
216  "GreaterByCountValue",
217  "LessByPruneKey",
218  "GreaterByPruneKey"
219 };
220 
// Table of HitSortsCount short string forms, parallel to HitSortEnumNames
// (same number of entries, same order of sort/filter types).
// NOTE(review): presumably the keyword used for each HitSortEnum value in
// queries or output — TODO confirm where these strings are consumed.
222 static const char *HitSortEnumStrings[HitSortsCount] = {
223  "no_sort",
224  "asc_by_date",
225  "desc_by_date",
226  "asc_by_size",
227  "desc_by_size",
228  "asc",
229  "desc",
230  "asc_by_rank",
231  "desc_by_rank",
232  "asc_middle",
233  "desc_middle",
234  "asc_left",
235  "desc_left",
236  "asc_right",
237  "desc_right",
238  "random",
239  "asc_by_key",
240  "desc_by_key",
241  "asc_by_count",
242  "desc_by_count",
243  "prune_asc",
244  "prune_desc"
245 };
246 
247 
249 public:
251  {};
252 
253  virtual string GetStringValue(DWORD FileNo) const = 0;
254 
257  virtual DWORD GetIntegerLowerBound(const string &Value) const = 0;
258 };
259 
/// Two-level sort key: a primary integer key plus a secondary string key.
/// Keys are ordered by the integer first; the string only breaks ties.
struct CHitSortKey {
    /// primary integer sort key
    int i;

    /// secondary (string) sort key
    std::string s;

    /// Default constructor; also the implicit int -> key conversion.
    /// Deliberately left non-explicit so existing conversions from int keep working.
    CHitSortKey(int i_=0) : i(i_)
    {}

    /// Constructor given integer and string.
    CHitSortKey(int i_, const std::string& s_) : i(i_), s(s_)
    {}

    /// Clear key: integer part becomes 0, string part becomes empty.
    void clear()
    { i=0; s.clear(); }

    /// Lexicographic "less than" on (i, s).
    bool operator< (const CHitSortKey &x) const
    { return (i < x.i) || (i==x.i && s < x.s); }

    /// Lexicographic "greater than" on (i, s).
    bool operator> (const CHitSortKey &x) const
    { return (i > x.i) || (i==x.i && s > x.s); }

    /// Assignment given integer; the string part is emptied.
    void assign(int i_)
    { i=i_; s.clear(); }

    /// Assignment given integer + string.
    void assign(int i_, const std::string &s_)
    { i=i_; s=s_; }

    /// Assignment from an (integer, string) pair.
    /// Returns *this so the result can be chained or used as an expression.
    CHitSortKey& operator= (const std::pair<int,std::string>& p)
    { i=p.first; s=p.second; return *this; }

    /// Assignment from an integer; the string part is emptied. Returns *this.
    CHitSortKey& operator= (int i_)
    { i=i_; s.clear(); return *this; }

    /// Assignment from a string; the integer part is reset to 0. Returns *this.
    CHitSortKey& operator= (const std::string &s_)
    { i=0; s=s_; return *this; }
};
310 
319 {
322 
325  string m_AttrName;
326 
330  bool m_bSet;
331 
333  bool m_bNegated;
334 
337 
340 
343 
346 
349 
352 
355 
// Empties m_SatisfiedValues; no other member is modified here.
356  inline void clear()
357  {
358  m_SatisfiedValues.clear();
359  };
360 
// Returns the sort direction implied by m_FilterType as a HitSortOrderEnum:
// hsoDescending for the Greater* types listed, hsoNone for NoSort, and
// hsoAscending for every other type (default branch).
// NOTE(review): the listing gutter skips 367-370, so some case labels of the
// descending group are not visible here — consult the real header before editing.
362  inline HitSortOrderEnum SortOrder() const {
363  switch (m_FilterType) {
364  case GreaterByDate:
365  case GreaterByRank:
366  case GreaterBySize:
371  case GreaterByCountKey:
372  case GreaterByCountValue:
373  case GreaterByPruneKey:
374  return hsoDescending;
375  case NoSort:
376  return hsoNone;
377  default:
378  return hsoAscending;
379  };
380  };
381 
// Returns true iff this is a file-based filter (used for optimized count(*) queries).
// Date, free-bibl-field, rank and random types (and NoSort) report true; prune,
// size and context types fall through to the default branch and report false.
// The prune cases have no statement of their own: their intended test is
// commented out because the types it needs are not declared in this header.
// NOTE(review): the listing gutter skips 391, 403, 405 and 407, so some case
// labels are not visible here — consult the real header before editing.
383  inline bool IsFileFilter() const
384  {
385  switch (m_FilterType) {
386  case NoSort:
387  case LessByDate:
388  case GreaterByDate:
389  return true; //-- special handling in CConcXml::FileMatches()
390  case LessByFreeBiblField:
392  return true; //-- usual case of metadata-based filtering
393  case LessByRank:
394  case GreaterByRank:
395  case RandomSort:
396  return true; //-- non-filter operators are ignored
397  case LessByPruneKey:
398  case GreaterByPruneKey:
399  //return !m_Parent || !((CQFPrune*)m_Parent)->m_keys || ((CQFPrune*)m_Parent)->m_keys->canCountByFile(); //-- not declared here
400  case LessBySize:
401  case GreaterBySize:
402  case LessByMiddleContext:
404  case LessByLeftContext:
406  case LessByRightContext:
408  default:
409  return false; //-- not file-based
410  }
411  };
412 
// Returns true iff this is a trivial-sort filter, i.e. m_FilterType is NoSort
// (such a filter does not change the original hit-sort order).
414  inline bool IsTrivialFilter(void) const
415  { return m_FilterType == NoSort; };
416 
// Returns true iff this is a pruning filter; delegates to IsPruneFilterType(),
// which accepts LessByPruneKey and GreaterByPruneKey.
418  inline bool IsPruneFilter(void) const
419  { return IsPruneFilterType(m_FilterType); };
420 
421 
423  : m_bSet(false),
424  m_bNegated(false),
425  m_FilterType(NoSort),
426  m_ContextMatchId(0),
427  m_ContextOffset(0),
428  m_KeyLo(INT_MIN),
429  m_KeyHi(INT_MAX),
430  m_BiblIndex(NULL),
431  m_Parent(NULL)
432  {};
433 };
434 
438 struct CHit
439 {
446 
447  union {
450 
452  size_t m_Count;
453  } m_Value;
454 
458 
// Constructs a hit for break number BreakNo (default 0): m_SortKey starts at 0
// and the m_Count member of the m_Value union is zeroed.
// NOTE(review): other members of CHit are not initialised in this constructor
// as shown; their declarations are not visible in this listing.
459  CHit(DWORD BreakNo=0) : m_BreakNo(BreakNo), m_SortKey(0) { m_Value.m_Count=0; };
460 };
461 
474 };
475 
/// Maps an index string to the cache of its occurrences.
477 typedef map<string,CShortOccurCache> CShortOccurCacheMap;
478 
/// Buffer type for holding occurrences while they are read from disk.
480 typedef vector<CTokenNo> COccurrBuffer;
481 
/// Delimiter between morphological annotations.
483 const string MorphAnnotationsDelim = "#";
/// Regular expression that matches everything within one morphological
/// annotation (any run of characters other than the '#' delimiter).
485 const string MorphAnnotationsDelimRegExp = "[^#]*";
486 
/// Initializes the morphology dictionaries (defined in InitDicts.cpp).
/// NOTE(review): the meaning of the bool result is not visible here — presumably true on success; confirm.
488 extern bool InitConcordDicts();
/// Deletes the morphology dictionaries (defined in InitDicts.cpp).
490 extern void FreeConcordDicts();
/// Returns the grammatical table for the given language identifier (defined in InitDicts.cpp).
494 extern const CAgramtab* GetGramtabByLanguage (MorphLanguageEnum Langua);
495 
498 {
500 
// Compares a hit against a bare break number: true iff the hit's m_BreakNo is
// strictly smaller than brkno. The (CHit, DWORD) argument order is the shape a
// lower_bound-style search over hits sorted by break number would use —
// NOTE(review): confirm against the call sites.
501  inline bool operator()(const CHit& h, const DWORD brkno) const
502  { return h.m_BreakNo < brkno; };
503 };
504 
/// Returns a string representation of a set of token properties, in the format
/// used in the index (defined in Concordance.cpp).
/// NOTE(review): bRegexp presumably selects a regular-expression form of the result — confirm in Concordance.cpp.
506 string GetIndexItemSetByVectorString(const vector<string> &TokenProperties, bool bRegexp);
507 
508 #endif
509 
510 /*--- emacs style variables ---
511  * Local Variables:
512  * mode: C++
513  * c-file-style: "ellemtel"
514  * c-basic-offset: 4
515  * tab-width: 8
516  * indent-tabs-mode: nil
517  * End:
518  */
bool CouldContainMore() const
Definition: ConcSession.cpp:244
vector< CTokenNo > COccurrBuffer
a type for holding occurrences during reading from the disk
Definition: ConcCommon.h:480
const string ChunkIndexName
a globally defined index name for chunks
Definition: ConcCommon.h:56
sort by the issue date (ascending)
Definition: ConcCommon.h:123
bool operator()(const CHit &h, const DWORD brkno) const
Definition: ConcCommon.h:501
DDCFormatTypeEnum
FormatTypeEnum defines the format of output hits:
Definition: ConcCommon.h:468
sort by central context (ascending)
Definition: ConcCommon.h:139
const CTokenNo * GetOccurrencesFromCache(const int CacheId, DWORD &Length) const
Definition: ConcSession.cpp:238
CHitSortKey(int i_=0)
default constructor
Definition: ConcCommon.h:271
void FreeConcordDicts()
deletes morphology dictionaries
Definition: InitDicts.cpp:151
CHitSortKey m_KeyHi
the upper bound of the filter (by default (INT_MAX,"")), valid only if m_bSet==false; formerly int m_...
Definition: ConcCommon.h:345
DWORD CFileNo
integer type CFileNo is used to refer to a single document (file) in the corpus
Definition: ConcCommon.h:66
vector< CTokenNo > m_OccurrencesBody
this vector contains all occurrences for this cache
Definition: ConcCommon.h:107
string s
secondary sort key, new for v2.0.19
Definition: ConcCommon.h:268
void assign(int i_, const string &s_)
assignment given integer + string
Definition: ConcCommon.h:295
Definition: ConcCommon.h:470
Definition: ConcCommon.h:472
const CLemmatizer * GetLemmatizerByLanguage(MorphLanguageEnum Langua)
return a morphology dictionary by a language identifier
Definition: InitDicts.cpp:75
const string PredefinedTableLineTag
a globally defined xml-tag, which is used to separate records if CConcIndexator::m_IndexType is Free_...
Definition: ConcCommon.h:54
sort by count()-key (ascending)
Definition: ConcCommon.h:153
bool IsNullSort(HitSortOrderEnum e)
Definition: ConcCommon.h:182
bool IsCountValueSort(HitSortOrderEnum e)
Definition: ConcCommon.h:191
bool IsPruneFilterType(HitSortEnum e)
Definition: ConcCommon.h:168
DWORD m_HighlightOccurrenceEnd
the end offset of token occurrences to be highlighted in CQueryNode::m_Occurrences and later in CConc...
Definition: ConcCommon.h:443
sort by count()-value (descending)
Definition: ConcCommon.h:159
class for global query filters aka "query operators"
Definition: QueryFilter.h:35
CHitSortKey m_KeyLo
the lower bound of the filter (by default (INT_MIN,"")), valid only if m_bSet==false; formerly int m_...
Definition: ConcCommon.h:342
sort by the size of the hit in tokens (descending)
Definition: ConcCommon.h:129
Definition: ConcCommon.h:473
bool IsCountSort(HitSortOrderEnum e)
Definition: ConcCommon.h:185
sort by right context (ascending)
Definition: ConcCommon.h:147
end-of-enum sentinel
Definition: ConcCommon.h:165
size_t m_DebugRankNo
the string which displays how the rank was calculated (for rank-sorted queries)
Definition: ConcCommon.h:449
const string MorphAnnotationsDelimRegExp
a regular expression, which passes everything within one morphological annotation ...
Definition: ConcCommon.h:485
const CFreeBiblIndexInterface * m_BiblIndex
pointer to the CConcXml::CFreeBiblIndex responsible for populating this filter (only after compile) ...
Definition: ConcCommon.h:351
HitSortOrderEnum
Definition: ConcCommon.h:172
sort by left context (descending)
Definition: ConcCommon.h:145
Definition: ConcCommon.h:177
int m_ContextMatchId
match-id of reference token for context-sort operators (default=0:any)
Definition: ConcCommon.h:336
Definition: agramtab_.h:39
CHitSortKey(int i_, const string &s_)
constructor given integer and string
Definition: ConcCommon.h:275
static const char * HitSortEnumStrings[HitSortsCount]
Definition: ConcCommon.h:222
Definition: ConcCommon.h:248
sort by #prune[]-key (descending)
Definition: ConcCommon.h:163
size_t m_VectorLength
Definition: ConcCommon.h:93
sort by count()-value (ascending)
Definition: ConcCommon.h:157
void Clear()
Definition: ConcSession.cpp:225
bool m_bNegated
true iff this is a negated filter
Definition: ConcCommon.h:333
HitSortEnum m_FilterType
the type of the filter
Definition: ConcCommon.h:321
CHit(DWORD BreakNo=0)
Definition: ConcCommon.h:459
Definition: ConcCommon.h:176
size_t AddNewIndexItemNoToCache(const CTokenNo *pStart, const CTokenNo *pEnd)
Definition: ConcSession.cpp:231
bool m_bSet
Definition: ConcCommon.h:330
Definition: ConcCommon.h:175
CHitCompareByBreak()
Definition: ConcCommon.h:499
Definition: ConcCommon.h:85
bool IsFileFilter() const
returns true iff this is a file-based filter (for optimized count(*) queries)
Definition: ConcCommon.h:383
map< string, CShortOccurCache > CShortOccurCacheMap
a type for index string to its occurrences
Definition: ConcCommon.h:477
const size_t MaxShortOccurCacheSize
MaxShortOccurCacheSize is the upper bound of CShortOccurCache::m_Data.size() It is introduced to rest...
Definition: ConcCommon.h:69
Definition: ConcCommon.h:174
compare hits by break-number (for query evaluation, e.g. CQueryBinaryOperationNode::hits_and_position...
Definition: ConcCommon.h:497
const string PredefinedTextAreaBreakName
a globally defined break collection name for text areas
Definition: ConcCommon.h:60
bool IsPruneFilter(void) const
returns true iff this is a pruning filter
Definition: ConcCommon.h:418
sort by document rank (descending)
Definition: ConcCommon.h:137
CDataReference(size_t VectorStartOffset, size_t VectorLength)
Definition: ConcCommon.h:96
Definition: ConcCommon.h:263
void ddcInitLocale(void)
initialize the locale from current environment if not already initialized
Definition: ddcLocale.cpp:31
sort by document (ascending)
Definition: ConcCommon.h:135
const string PredefinedFileBreakName
a globally defined break collection name for corpus files
Definition: ConcCommon.h:58
sort by right context (descending)
Definition: ConcCommon.h:149
sort by a free bibliographical field (ascending)
Definition: ConcCommon.h:131
const string MorphAnnotationsDelim
a delimiter between morphological annotations
Definition: ConcCommon.h:483
vector< CDataReference > m_IndexItemNo2Occurrences
this map contains a relation from index item No to the address of its occurrences ...
Definition: ConcCommon.h:104
virtual ~CFreeBiblIndexInterface()
Definition: ConcCommon.h:250
sort by count()-key (descending)
Definition: ConcCommon.h:155
DWORD m_BreakNo
the index of the break, which this hit represents (in the break collection CConcHolder::GetBreaks) ...
Definition: ConcCommon.h:441
sort by #prune[]-key (ascending)
Definition: ConcCommon.h:161
CDDCFilterWithBounds()
Definition: ConcCommon.h:422
class CQFilter * m_Parent
pointer to parent CQFilter (if any)
Definition: ConcCommon.h:354
DWORD m_FileNo
the index of the corpus file where this hit is found; it is equal to m_BreakNo if user searches within f...
Definition: ConcCommon.h:445
string m_AttrName
Definition: ConcCommon.h:325
HitSortOrderEnum SortOrder() const
returns integer sort order as a HitSortOrderEnum (-1:descending, 0:none, 1:ascending, 2:count_keys, 3:count_values)
Definition: ConcCommon.h:362
sort by the size of the hit in tokens (ascending)
Definition: ConcCommon.h:127
Definition: ConcCommon.h:178
Definition: ConcCommon.h:438
void clear()
clear key
Definition: ConcCommon.h:279
const size_t MaxBiblStringLen
Definition: ConcCommon.h:75
bool IsCountKeySort(HitSortOrderEnum e)
Definition: ConcCommon.h:188
Definition: ConcCommon.h:318
bool IsTrivialFilter(void) const
returns true iff this is a trivial-sort filter (i.e. does not change original hit-sort order) ...
Definition: ConcCommon.h:414
const CAgramtab * GetGramtabByLanguage(MorphLanguageEnum Langua)
return a grammatical table by a language identifier
Definition: InitDicts.cpp:101
sort by match context (descending)
Definition: ConcCommon.h:141
bool InitConcordDicts()
initializes morphology dictionaries
Definition: InitDicts.cpp:125
CHitSortKey m_SortKey
Definition: ConcCommon.h:457
const char globalFieldDelimeter
a globally defined delimiter, which is used to delimit fields in one record (the first field is alway...
Definition: ConcCommon.h:52
size_t m_Count
count for this item (for count-queries)
Definition: ConcCommon.h:452
int i
primary integer sort key; formerly CHit::m_OrderId, CDDCFilterWithBounds.m_LevelStart|m_LevelEnd ...
Definition: ConcCommon.h:265
sort by a free bibliographical field (descending)
Definition: ConcCommon.h:133
DWORD CTokenNo
integer type CTokenNo is used to refer to the index of a token in the corpus
Definition: ConcCommon.h:63
MorphLanguageEnum
Definition: utilit.h:162
Definition: Lemmatizers.h:37
Definition: ConcCommon.h:471
static const char * HitSortEnumNames[HitSortsCount]
Definition: ConcCommon.h:195
void clear()
Definition: ConcCommon.h:356
size_t m_VectorStartOffset
Definition: ConcCommon.h:91
uint32_t DWORD
Definition: utilit.h:105
sort by left context (ascending)
Definition: ConcCommon.h:143
sort by the issue date (descending)
Definition: ConcCommon.h:125
int m_ContextOffset
offset from matched token for context-sort operators
Definition: ConcCommon.h:339
Definition: ConcCommon.h:179
HitSortEnum
Definition: ConcCommon.h:119
sort by random key
Definition: ConcCommon.h:151
Definition: ConcCommon.h:469
the structure holds a pointer to a vector of occurrences and its size
Definition: ConcCommon.h:88
void assign(int i_)
assignment given integer
Definition: ConcCommon.h:291
string GetIndexItemSetByVectorString(const vector< string > &TokenProperties, bool bRegexp)
return a string representation of a set of token properties (in the format which is used in the index...
Definition: Concordance.cpp:25
Definition: ConcCommon.h:173
no sort operators, only filtering (used by #has_field[])
Definition: ConcCommon.h:121
void ddcInitCurl(void)
global initialization function (multiple calls should be safe)
Definition: CCurl.cpp:83
void ddcInitGlobal(void)
global initialization
Definition: ConcCommon.h:46
set< int > m_SatisfiedValues
the possible (integer) values for this bibliographical field, valid only if m_bSet==true ...
Definition: ConcCommon.h:348