ddc
Concordance.h
Go to the documentation of this file.
1 //-*- Mode: C++ -*-
2 
3 // DDC originally by Alexey Sokirko
4 // Changes and modifications 2011-2019 by Bryan Jurish
5 //
6 // This file is part of DDC.
7 //
8 // DDC is free software: you can redistribute it and/or modify
9 // it under the terms of the GNU Lesser General Public License as published by
10 // the Free Software Foundation, either version 3 of the License, or
11 // (at your option) any later version.
12 //
13 // DDC is distributed in the hope that it will be useful,
14 // but WITHOUT ANY WARRANTY; without even the implied warranty of
15 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 // GNU Lesser General Public License for more details.
17 //
18 // You should have received a copy of the GNU Lesser General Public License
19 // along with DDC. If not, see <http://www.gnu.org/licenses/>.
20 //
21 
29 #ifndef Concordance_h
30 #define Concordance_h
31 
32 
33 #include "ConcCommon.h"
34 #include "ConcXml.h"
35 #include "StringIndexator.h"
36 #include "HitBorder.h"
37 #include "TermExpander.h"
38 #include "../CommonLib/ddcCorpusList.h"
39 
40 
45  string m_FirstOpener;
47  string m_FirstCloser;
49  string m_RestOpener;
51  string m_RestCloser;
52 
54 
55  bool ReadFromString(const string &s);
56 
57  string ToString() const;
58 };
59 
60 
66 protected:
68  vector<string> m_SourceFiles;
70 
71 public:
73 
75  bool SaveSourceFileList(string FileName);
76 
78  void DeleteSourceFile(long ItemNo);
79 
81  void AddSourceFile(const char *FileName);
82 
84  void DeleteAllSourceFiles();
85 
89  size_t GetSourceFilesCount() const;
90 
92  string GetSourceFile(size_t FileNo) const;
93 
94  void AddSourceFilesFrom(const CSourceFileHolder &X);
95 
96  void ReadSourceFileList(string FileName);
97 
99  int FoundNotExistedFile() const;
100 
101 
102  bool IsModified() const;
103 
104 };
105 
106 
107 class CDwdsThesaurus;
108 
109 /* CIndexDumpFormat : enum for index-dump formats */
110 typedef enum CIndexDumpFormatE {
116 
117 /* CMaskedFileSet : type for CConcIndexator::m_MaskedFiles */
118 typedef set<CFileNo> CMaskedFileSet;
119 
127 
128 
130  vector<BYTE> m_PcreCharacterTables;
131 
140 
144 
145 
146 
147 
148  string GetBiblIndexFileName() const;
149 
150  string GetBiblFileName() const;
151 
152 
153 
155  bool HasEqualOptions(const CConcordance &X) const;
156 
158  const char *GetIndexTypeStr() const;
159 
161  bool ReadIndexTypeFromStr(const string &s);
162 
164  void DumpFileIndexTabs(DWORD FileNo, FILE *f) const;
165 
166 protected:
191  TabFormat_Index
192 
193  };
210 
211  void AssertHasPath() const;
212 
214  void LoadOptionsFromString(string Options);
215 
216  void InitDefaultOptions();
218  string SaveOptionsToString() const;
219 
220 public:
237 
240 
242 
245 
248 
251 
254 
264  double m_TfIdfRank;
266  double m_NearRank;
273 
275  bool m_Utf8;
276 
279 
282 
285 
287  vector<size_t> m_IndicesToShow;
288 
290  time_t m_Timestamp;
291 
294 
297 
300 
314  map<string, string> m_OpDefaultIndexNames;
315 
320  map<string, pair<bool, string> > m_ServerInfo;
321 
322  CConcordance();
323 
324  ~CConcordance();
325 
327  void RegisterIndicesToShow(const string &IndexListStr);
328 
330  string GetIndicesToShowStr(bool Positional = false) const;
331 
332  /*
334  //const vector<BYTE>& GetRegExpTables() const { if (m_Utf8) return NULL; return m_PcreCharacterTables; };
335  const BYTE* GetRegexTables() const { if (m_Utf8) return NULL; return m_PcreCharacterTables.data(); };
336 
338  const pcrecpp::RE_Options GetRegexOptions() const { if (m_Utf8) return pcrecpp::UTF8(); return pcrecpp::RE_Options(); }
339  */
342  if (m_Utf8)
343  return RML_RE::Options(pcrecpp::UTF8(), NULL);
344  else
345  return RML_RE::Options(pcrecpp::RE_Options(), m_PcreCharacterTables.data());
346  };
347 
349  bool IsDwdsCorpusInterface() const { return m_bDwdsCorpusInterface; };
350 
352  bool IsGutenbergInterface() const { return m_bGutenbergInterface; };
353 
355  bool HasContextOperator() const { return !m_bNoContextOperator; };
356 
358  bool UseDwdsThesaurus() const { return m_bUseDwdsThesaurus; };
359 
361  bool OutputBibliographyOfHits() const { return m_bOutputBibliographyOfHits; };
362 
364  bool IndexPunctuation() const { return m_bIndexPunctuation; };
365 
367  string GetHtmlReference(size_t posFile) const;
368 
370  string GetShortFilename(size_t posFile) const;
371 
373  string GetFileNameForCorpusFileNames() const;
374 
376  string GetFileNameForMaskedFiles() const;
377 
379  string GetFileNameForMaskedFileIds() const;
380 
382  vector<string> GetTokenFields(const COutputToken &tok);
383 
385  inline size_t GetCorpusFilesCount() const { return m_CorpusFiles.size(); };
386 
388  inline string GetCorpusFile(CFileNo FileNo) const { return m_CorpusFiles[FileNo]; };
389 
391  inline size_t GetMaskedFilesCount() const { return m_MaskedFiles.size(); };
392 
394  void LoadSourceFilesAndOptions(string FileName, bool reallyReadSourceFiles = true);
395 
397  void LoadCorpusFiles();
398 
400  string GetCommonFilePrefix() const;
401 
403  void LoadMaskedFiles();
404 
406  void LoadOptionsFromFile(const string &OptFile);
407 
409  void LoadProject(string FileName, bool includeSourceFiles = true);
410 
412  time_t UpdateTimestamp(const char *filename);
413 
415  void DumpBibliography(FILE *f = stdout) const;
416 
418  string DumpFileBibliography(DWORD FileNo) const;
419 
421  void DumpFileIndexJson(DWORD FileNo, FILE *f = stdout) const;
422 
423 
425  void DumpIndex(string dirname, CIndexDumpFormat fmt = idfJson) const;
426 
428  void DumpIndexToSingleTabFile(FILE *outfp) const;
429 
431  bool SaveOptions(string FileName) const;
432 
434  bool GetAllOccurrences(vector<CTokenNo> &occurrences, size_t searchPeriodNo) const;
435 
437  bool GetOccurrencesByPosition(const string &BreakName, int anchor, vector<CTokenNo> &occurrences,
438  size_t searchPeriodNo) const;
439  bool UseTabFormatForLoading () const {return m_IndexType == TabFormat_Index;}
440 };
441 
442 
443 #endif
444 
445 /*--- emacs style variables ---
446  * Local Variables:
447  * mode: C++
448  * c-file-style: "ellemtel"
449  * c-basic-offset: 4
450  * tab-width: 8
451  * indent-tabs-mode: nil
452  * End:
453  */
string m_RestCloser
the end tag which should highlight the next found occurrences in a hit (except the first occurrence) ...
Definition: Concordance.h:51
bool m_bShowNumberOfRelevantDocuments
if true, then DDC always calculates the number of documents in which at least one hit is found ...
Definition: Concordance.h:230
size_t GetMaskedFilesCount() const
get count of masked files
Definition: Concordance.h:391
Definition: ddcCorpusList.h:31
DWORD m_UserMaxInputLoadIndexSize
The maximal number of occurrences in the input load index, by default 400000.
Definition: Concordance.h:207
bool m_bDwdsCorpusInterface
if m_bDwdsCorpusInterface is on, the program outputs results in DWDS format
Definition: Concordance.h:133
CHighlightTags()
Definition: Concordance.cpp:128
DWORD CFileNo
integer type CFileNo is used to refer to a single document (file) in the corpus
Definition: ConcCommon.h:66
map< string, string > m_OpDefaultIndexNames
maps token-query operators to default index names; keys are as returned by CQToken::OperatorKey() ...
Definition: Concordance.h:314
Definition: Concordance.h:41
size_t GetCorpusFilesCount() const
get the number of indexed corpus files
Definition: Concordance.h:385
Definition: pcre_rml.h:46
CHighlightTags m_TextHighlighting
highlighting delimiters for CConcHolder::m_ResultFormat == DDC_ResultText
Definition: Concordance.h:250
size_t m_MaxQueryCacheSize
moo: maximum number of queries to be cached by an associated CConcHolder (default=512) ...
Definition: Concordance.h:296
bool m_bUseDwdsThesaurus
Enables indexing and querying using DWDS Thesaurus.
Definition: Concordance.h:209
Definition: HitBorder.h:67
CHighlightTags m_HtmlHighlighting
highlighting tags for CConcHolder::m_ResultFormat == DDC_ResultHTML
Definition: Concordance.h:247
bool m_bUseParagraphTagToDivide
Enables using "<p>" tag as a paragraph delimiter.
Definition: Concordance.h:199
CHighlightTags m_TableHighlighting
highlighting delimiters for CConcHolder::m_ResultFormat == DDC_ResultTable
Definition: Concordance.h:253
Definition: StringIndexator.h:121
Definition: Concordance.h:111
string ToString() const
Definition: Concordance.cpp:151
A file for globally defined constants and classes.
string m_InternetPathPrefix
Definition: Concordance.h:141
bool m_bWasReadFromString
true, if members were initialized from string (via CHighlightTags::ReadFromString) ...
Definition: Concordance.h:43
bool m_bQueryOnlyFiles
prohibits sentence break collection under DWDS_Index or MorphXML_Index
Definition: Concordance.h:232
A type for corpus without annotations, which are written for each word. For example, the input text can...
Definition: Concordance.h:175
string GetCorpusFile(CFileNo FileNo) const
get corpus file by index
Definition: Concordance.h:388
bool m_bIndexPunctuation
Enables indexing all punctuation marks.
Definition: Concordance.h:197
bool m_bLemmaQueryUsesMorphPattern
interpret "%foo" queries using MorphPattern? (default=true)
Definition: Concordance.h:284
bool m_bGutenbergInterface
if m_bGutenbergInterface is on, the program outputs results in a format of Gutenberg project ...
Definition: Concordance.h:135
This index type is free and therefore it should be defined in the options file (fields "Indices" and ...
Definition: Concordance.h:187
bool UseTabFormatForLoading() const
Definition: Concordance.h:439
bool m_bEmptyLineIsSentenceDelim
if m_bEmptyLineIsSentenceDelim is on, every empty line in the input file is considered to be the end ...
Definition: Concordance.h:201
bool OutputBibliographyOfHits() const
return true, if DDC should output bibliographical information for hits instead of corpus file names ...
Definition: Concordance.h:361
A type for xml-texts, if their words have predefined and written annotations. DDC always builds a tok...
Definition: Concordance.h:179
string m_FirstCloser
the end tag which should highlight the first found occurrence in a hit
Definition: Concordance.h:47
vector< string > m_SourceFiles
Source files
Definition: Concordance.h:68
CConcXml m_BiblIndex
a member which holds an index for bibliographical information
Definition: Concordance.h:244
bool m_bAllowCountByTokenAttributes
using any token attribute as a count-key will throw an exception unless this is true (default=true) ...
Definition: Concordance.h:281
bool m_bOutputBibliographyOfHits
Should we show bibliography of the hits instead of filename.
Definition: Concordance.h:139
map< string, pair< bool, string > > m_ServerInfo
maps symbolic keys to string constants to be included in corpus 'info' response as info...
Definition: Concordance.h:320
double m_TfIdfRank
the parameter for TfIdf ranking
Definition: Concordance.h:264
bool m_bDisableDefaultQueryLexicalExpansion
if true, then no default lexical expansion of query words occurs
Definition: Concordance.h:256
Definition: Concordance.h:126
Definition: Concordance.h:113
RML_RE::Options GetRegexOptions() const
return default pcre regex options
Definition: Concordance.h:341
string m_LocalPathPrefix
Definition: Concordance.h:142
CIndexDumpFormatE
Definition: Concordance.h:110
size_t size() const
returns the number of enumerated strings
Definition: ddcCorpusList.h:76
string m_RestOpener
the start tag which should highlight the next found occurrences in a hit (except the first occurrence...
Definition: Concordance.h:49
Definition: Concordance.h:112
ddcCorpusList m_CorpusFiles
Corpus files
Definition: Concordance.h:239
bool IndexPunctuation() const
wrapper for m_bIndexPunctuation
Definition: Concordance.h:364
DWORD m_UserMaxTokenCountInOnePeriod
The maximal number of occurrences in one subcorpus (defined by user)
Definition: Concordance.h:205
size_t m_MaxCachedHitsCount
moo: maximum number of hits in a CConcHolder cache entry – query results with more than MaxCachedHit...
Definition: Concordance.h:293
bool IsDwdsCorpusInterface() const
return true, if DDC outputs results in DWDS format
Definition: Concordance.h:349
DDCIndexTypeEnum m_IndexType
the type of index
Definition: Concordance.h:195
bool ReadFromString(const string &s)
Definition: Concordance.cpp:132
MorphLanguageEnum m_Language
the language of the corpus
Definition: Concordance.h:222
int m_LeftKwicContextSize
the size of the left context of the highlighted words in document search
Definition: Concordance.h:258
bool UseDwdsThesaurus() const
return true, if DWDS thesaurus is enabled (index "Thes")
Definition: Concordance.h:358
bool m_bUseIndention
if m_bUseIndention is on, the program tries to find paragraphs using indentation
Definition: Concordance.h:203
string m_InterpDelimiter
delimiter to use between token index fields in output
Definition: Concordance.h:270
vector< size_t > m_IndicesToShow
indices to show for Free_Index
Definition: Concordance.h:287
Definition: Concordance.h:114
CMaskedFileSet m_MaskedFiles
Definition: Concordance.h:241
bool m_bAllowUnsafeQueries
potentially unsafe queries will throw an exception unless this is true (default=false) ...
Definition: Concordance.h:278
double m_NearRank
the parameter for Near ranking
Definition: Concordance.h:266
int m_RightKwicContextSize
the size of the right context of the highlighted words in document search
Definition: Concordance.h:260
bool HasContextOperator() const
return true, if the query context operator (#Cntxt) is enabled, i.e. not switched off via m_bNoContextOperator
Definition: Concordance.h:355
bool m_bResumeOnIndexErrors
if true, CConcIndexatorInvoker skips source documents with errors
Definition: Concordance.h:236
bool IsGutenbergInterface() const
return true, if DDC outputs results in Gutenberg project format
Definition: Concordance.h:352
enum CIndexDumpFormatE CIndexDumpFormat
bool m_bIndexChunks
Enables indexing and querying using chunks.
Definition: Concordance.h:226
bool m_bCaseSensitive
if true, then the default search is case sensitive
Definition: Concordance.h:228
string m_CommonFilePrefix
Definition: Concordance.h:143
int m_NumberOfKwicLinesInSnippets
the maximal number of kwic lines in file snippets
Definition: Concordance.h:262
bool m_Utf8
whether to assume indexed data is utf8 encoded (default=no)
Definition: Concordance.h:275
Definition: DwdsThesaurus.h:25
Definition: Concordance.h:65
MorphLanguageEnum
Definition: utilit.h:162
time_t m_Timestamp
moo: timestamp of project *._con file
Definition: Concordance.h:290
bool m_bIndexMorphPatterns
Enables the index of morph patterns.
Definition: Concordance.h:224
double m_PositionRank
the parameter for Position ranking
Definition: Concordance.h:268
uint32_t DWORD
Definition: utilit.h:105
Definition: IndexSet.h:31
set< CFileNo > CMaskedFileSet
Definition: Concordance.h:118
TxDispatcher: name-based expansion dispatcher.
Definition: TermExpander.h:325
DDCIndexTypeEnum
enum DDCIndexTypeEnum contains index types. Each index type determines DDC indices and break collecti...
Definition: Concordance.h:170
bool m_bArchiveIndex
sets that index should be archived under DWDS_Index or MorphXML_Index
Definition: Concordance.h:234
Definition: ConcXml.h:39
bool m_bNoContextOperator
should we switch off the context operator (#Cntxt) due to copyright
Definition: Concordance.h:137
string m_TokenDelimiter
delimiter to use between tokens in output
Definition: Concordance.h:272
vector< BYTE > m_PcreCharacterTables
a table of character properties for regular expressions which depend on CConcIndexator::m_Language ...
Definition: Concordance.h:130
string m_FirstOpener
the start tag which should highlight the first found occurrence in a hit
Definition: Concordance.h:45
bool m_bModifiedListOfFiles
Definition: Concordance.h:69
TxDispatcher m_Txd
term expansion dispatcher; should define at least an entry for "default"
Definition: Concordance.h:299