ddc
ConcIndexator.h
Go to the documentation of this file.
1 //-*- Mode: C++ -*-
2 
3 // DDC originally by Alexey Sokirko
4 // Changes and modifications 2011-2020 by Bryan Jurish
5 //
6 // This file is part of DDC.
7 //
8 // DDC is free software: you can redistribute it and/or modify
9 // it under the terms of the GNU Lesser General Public License as published by
10 // the Free Software Foundation, either version 3 of the License, or
11 // (at your option) any later version.
12 //
13 // DDC is distributed in the hope that it will be useful,
14 // but WITHOUT ANY WARRANTY; without even the implied warranty of
15 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 // GNU Lesser General Public License for more details.
17 //
18 // You should have received a copy of the GNU Lesser General Public License
19 // along with DDC. If not, see <http://www.gnu.org/licenses/>.
20 //
21 
29 #ifndef ConcIndexator_h
30 #define ConcIndexator_h
31 
32 
33 #include "TabFormatIndexator.h"
34 
37 
40 
45 
48 
51 
52 
53  void IndexTextOrHtmlFile(CIndexDocument* document);
54 
55  void IndexMorphXml(CIndexDocument* document);
56 
57  void IndexFreeIndex(CIndexDocument* document);
58 
59  void IndexOneTableTextArea(const string &Text, const CPageNumber &StartPageFromHeader, size_t &page_breaks_count,
60  CIndexDocument* document);
61 
63  bool IsDWDSToken(long GraLine) const;
64 
65  void LoadXmlFile(string FileName, const char *pFileBuffer, CBibliography &Bibl);
66 
67  void LoadFileIntoGraphan(string FileName, const char *pFileBuffer, CBibliography &Bibl);
68 
69 
70 public:
72 
74 
76  void InitGraphan() ;
77 
78 
80  bool SaveCorpusFileList() const;
81 
83  bool SaveMaskedFileIds() const;
84 
86  void StartIndexing();
87 
89  void DestroyIndex();
90 
92  void NormalEndIndexing();
93 
95  void TerminateIndexing();
96 
98  void IndexOneFile(CIndexDocument* document);
99 
101  void RollbackIndexOneFile(CTokenNo startTrimTokenNo);
102 
104  void CalculateSearchPeriods(DWORD MaxTokenCountInOnePeriod);
105 
107  bool CreateAsUnion(vector<CConcIndexator* > &X, bool inheritOptions = true);
108 
110  bool CreateAsUnion(const vector<string> &Xfiles, bool inheritOptions = true);
111 
113  bool SplitProject(vector<CConcIndexator*> &Subs) const;
114 
116  bool SplitProject(vector<CConcIndexator*> &Subs, const string &SubOptions) const;
117 
120 
123 
126 
127 };
128 
129 
130 #endif
131 
132 /*--- emacs style variables ---
133  * Local Variables:
134  * mode: C++
135  * c-file-style: "ellemtel"
136  * c-basic-offset: 4
137  * tab-width: 8
138  * indent-tabs-mode: nil
139  * End:
140  */
const DWORD DefaultMaxTokenCountInOnePeriod
global default value (5000000) for CConcIndexator::m_UserMaxTokenCountInOnePeriod ...
Definition: ConcIndexator.cpp:29
Definition: HitBorder.h:47
void IndexTextOrHtmlFile(CIndexDocument *document)
Definition: ConcIndexator.cpp:473
void IndexFreeIndex(CIndexDocument *document)
Definition: ConcIndexator.cpp:408
Definition: DocumentIterator.h:15
void IndexOneFile(CIndexDocument *document)
indexes one file according to m_IndexType
Definition: ConcIndexator.cpp:631
Definition: Bibliography.h:13
CConcIndexator()
Definition: ConcIndexator.cpp:55
bool SaveCorpusFileList() const
saves corpus file list (*._con)
Definition: ConcIndexator.cpp:93
bool SaveMaskedFileIds() const
saves masked file-ids (*._masked_ids)
Definition: ConcIndexator.cpp:133
const DWORD DefaultMaxInputLoadIndexSize
global default value (400000) for CConcIndexator::m_UserMaxInputLoadIndexSize (must be <= DefaultMaxT...
Definition: ConcIndexator.cpp:31
void InitGraphan()
initializes graphematics using current options
Definition: ConcIndexator.cpp:76
void CalculateSearchPeriods(DWORD MaxTokenCountInOnePeriod)
finds all subcorpora
Definition: ConcIndexator.cpp:296
const CDwdsThesaurus * m_pDwdsThesaurus
a pointer to the DWDS thesaurus, if applicable
Definition: ConcIndexator.h:50
CGraphmatFile Graphmat
graphmat (tokenization parser)
Definition: ConcIndexator.h:47
void TerminateIndexing()
terminates indexing (for exceptions)
Definition: ConcIndexator.cpp:618
void StartIndexing()
begins indexing
Definition: ConcIndexator.cpp:606
void IndexMorphXml(CIndexDocument *document)
Definition: IndexMorphXml.cpp:25
DWORD GetMaxTokenCountInOnePeriod() const
returns the size of one subcorpus
Definition: ConcIndexator.cpp:168
void LoadXmlFile(string FileName, const char *pFileBuffer, CBibliography &Bibl)
Definition: ConcIndexator.cpp:249
void NormalEndIndexing()
finishes indexing (normal way)
Definition: ConcIndexator.cpp:624
void RollbackIndexOneFile(CTokenNo startTrimTokenNo)
rolls back data buffered by an immediately preceding failed IndexOneFile()
Definition: ConcIndexator.cpp:651
DWORD GetMaxInputLoadIndexSize() const
returns the max size of the input index in tokens; must be less than GetMaxTokenCountInOnePeriod() ...
Definition: ConcIndexator.cpp:175
void LoadFileIntoGraphan(string FileName, const char *pFileBuffer, CBibliography &Bibl)
Definition: ConcIndexator.cpp:446
bool CreateMorphIndexWrapper()
creates morphology index
Definition: ConcIndexator.cpp:647
Definition: DwdsThesaurus.h:25
bool SplitProject(vector< CConcIndexator *> &Subs) const
splits project uniformly into sub-projects (new; sub-projects inherit parent options) ...
Definition: ConcordSplit.cpp:24
Definition: ConcIndexator.h:44
bool IsDWDSToken(long GraLine) const
graphematical definition of a token for DWDSIndex
Definition: ConcIndexator.cpp:66
Definition: TabFormatIndexator.h:14
DWORD CTokenNo
integer type CTokenNo is used to refer to the index of a token in the corpus
Definition: ConcCommon.h:63
~CConcIndexator()
Definition: ConcIndexator.cpp:62
uint32_t DWORD
Definition: utilit.h:105
Definition: GraphmatFile.h:28
void DestroyIndex()
destroys all index files
Definition: ConcIndexator.cpp:139
void IndexOneTableTextArea(const string &Text, const CPageNumber &StartPageFromHeader, size_t &page_breaks_count, CIndexDocument *document)
Definition: ConcIndexator.cpp:316
bool CreateAsUnion(vector< CConcIndexator * > &X, bool inheritOptions=true)
creates new concordance as union of one or more concordances (new)
Definition: ConcordUnion.cpp:24