ddc
HitBorder.h
Go to the documentation of this file.
1 //-*- Mode: C++ -*-
2 //
3 // DDC originally by Alexey Sokirko
4 // Changes and modifications 2011-2018 by Bryan Jurish
5 //
6 // This file is part of DDC.
7 //
8 // DDC is free software: you can redistribute it and/or modify
9 // it under the terms of the GNU Lesser General Public License as published by
10 // the Free Software Foundation, either version 3 of the License, or
11 // (at your option) any later version.
12 //
13 // DDC is distributed in the hope that it will be useful,
14 // but WITHOUT ANY WARRANTY; without even the implied warranty of
15 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 // GNU Lesser General Public License for more details.
17 //
18 // You should have received a copy of the GNU Lesser General Public License
19 // along with DDC. If not, see <http://www.gnu.org/licenses/>.
20 //
21 #ifndef HitBorder_h
22 #define HitBorder_h
23 
24 #include "../CommonLib/ddcMMap.h"
25 
44 
// One page-break record: the token position at which a page begins, paired
// with that page's number as it was mentioned in the source text.
// NOTE(review): this rendered listing skips source lines 48-51, so the data
// member declarations (CTokenNo m_StartTokenNo, DWORD m_PageNumber) that the
// code below relies on are not visible here.
47 struct CPageNumber {
52 
// Builds a record from a start token position and a page number. Both
// arguments default to 0, so this also acts as the default constructor.
53  CPageNumber(CTokenNo StartTokenNo = 0, DWORD PageNumber = 0)
54  : m_StartTokenNo(StartTokenNo), m_PageNumber(PageNumber) {};
55 
// Strict ordering: compares m_StartTokenNo first; when the start positions
// are equal, the record with the smaller m_PageNumber sorts first.
56  inline bool operator<(const CPageNumber &X) const {
57  return ((m_StartTokenNo < X.m_StartTokenNo)
58  || (m_StartTokenNo == X.m_StartTokenNo && m_PageNumber < X.m_PageNumber));
59  };
60 };
61 
62 
// CHitBorders holds a set of named break collections (m_Breaks) together with
// lookup maps from short and long collection names to indices in m_Breaks, and
// declares the operations for registering, loading, saving and querying breaks.
// NOTE(review): this is a rendered listing with stripped doc comments; several
// source lines are elided (e.g. 69-73, 81-82, 111-115, 117-118), so some
// declarations referenced below are not visible here.
67 class CHitBorders {
68 public:
// NOTE(review): source line 72 is missing from this listing; it presumably
// opens the nested CBreakCollection type whose members follow and which is
// closed by the "};" at listing line 100 -- confirm against the real header.

// Short name of this break collection.
74  string m_ShortName;
// Long name of this break collection.
76  string m_LongName;
77 
// File used for temporarily storing breaks during indexing. Declared mutable,
// so it can be changed even through a const CBreakCollection.
79  mutable FILE *m_FileForIndexing;
80 
// NOTE(review): source line 82 (ddcBreakVector m_BreakOffsets, "the breaks
// themselves") is elided from this listing.
83 
// Constructs a collection from its short and long names (definition not shown).
84  CBreakCollection(const string &ShortName, const string &LongName);
85 
// Presumably derives the break file name for this collection from Path -- confirm.
87  string GetBreakFileName(string Path) const;
88 
// useMMap defaults to false; presumably selects memory-mapped access -- confirm.
89  void ReadFromDisk(string Path, bool useMMap = false);
90 
91  bool ClearAll(string Path);
92 
// Presumably closes m_FileForIndexing -- definition not shown, confirm.
93  void CloseFileForIndexing();
94 
// Offset defaults to 0; presumably added to each break taken from From -- confirm.
96  void AppendBreaks(const ddcBreakVector &From, CTokenNo Offset = 0);
97 
99  void SaveToFile(const string &FileName);
100  };
101 
102 protected:
// All break collections.
104  vector<CBreakCollection> m_Breaks;
105 
// Map from CBreakCollection.m_ShortName to the index in m_Breaks.
107  map<string, int> m_ShortName2BreakCollection;
108 
// Map from CBreakCollection.m_LongName to the index in m_Breaks.
110  map<string, int> m_LongName2BreakCollection;
111 
// NOTE(review): elided here are int m_FileBreakCollectionNo (source line 113,
// a quick reference to the file breaks, which are also stored in m_Breaks) and
// string m_DefaultBreakName (source line 115, the name of the default break
// collection as written in the options file).
116 
// NOTE(review): elided here is ddcVecFile<CPageNumber> m_PageBreaks
// (source line 118, the page number collection).
119 
120 
122  string GetPageBreaksFileName(string Path) const;
123 
125  string GetShortNameByName(const string &BreakName) const;
126 
// No description is available in this listing; by its name it presumably
// collects breaks for the text area currently being processed -- confirm.
128  vector<DWORD> m_LastTextAreaBreaks;
129 
// Second protected section: indexing and registration helpers (declarations
// only; their definitions are not part of this header listing).
130 protected:
132  bool StartIndexing(string Path);
133 
135  bool RemoveHitBordersFileAndClear(string Path);
136 
138  void AddPageBreak(const CPageNumber &P);
139 
141  void SavePageBreaks(const string &ProjectPath);
142 
// The int results of the next three functions are presumably indices into
// m_Breaks, matching the int values stored in the name maps -- confirm.
143  int RegisterBreak(string ShortName, string LongName);
144 
145  int EnsureRegisteredBreak(string ShortName, string LongName);
146 
147  int GetBreakCollectionIndexByName(string ShortName) const;
148 
149  void AddBreakByIndex(DWORD BreakCollectionNo, const CTokenNo &B);
150 
151 public:
// Returns a pointer rather than a reference; presumably null when no
// collection matches Name -- confirm.
153  const CBreakCollection *GetBreakCollectionByName(const string &Name) const;
154 
155 
// Returns a const reference to m_Breaks itself (no copy is made). The
// project's own note on this accessor labels it "dangerous".
157  const vector<CBreakCollection> &GetBreaks(void) const { return m_Breaks; };
158 
159 public:
160 
161  CHitBorders();
162 
164  string GetBorderIndicesString() const;
165 
169  string WithinBreakName(const vector<string> &Within) const;
170 
// Returns a pointer rather than a reference; presumably null for an
// unregistered ShortName -- confirm.
172  const ddcBreakVector *GetBreaksByName(const string &ShortName) const;
173 
175  CTokenNo GetCorpusEndTokenNo() const;
176 
178  const ddcBreakVector &GetFileBreaks() const;
179 
181  CTokenNo GetFileStartTokenNo(size_t FileNo) const;
182 
184  DWORD GetPageNumber(size_t No) const;
185 
187  bool IsRegisteredBreak(const string &ShortName) const;
188 
190  void RegisterBorderIndices(const char *IndicesStr);
191 
// useMMap defaults to false, mirroring CBreakCollection::ReadFromDisk.
193  bool LoadHitBorders(string Path, bool useMMap = false);
194 
// Takes the hit range [hits_begin, hits_end) and a break vector as inputs;
// PageBreaks is a non-const reference and therefore the output parameter.
196  void ConvertHitsToPageBreaks(vector<CHit>::const_iterator hits_begin, vector<CHit>::const_iterator hits_end,
197  const ddcBreakVector &Breaks, DwordVector &PageBreaks) const;
198 
// Returns an iterator over CPageNumber records (the element type of
// m_PageBreaks); which record is selected for tok is not shown here.
200  ddcVecFile<CPageNumber>::const_iterator GetTokenPageBreak(CTokenNo tok) const;
201 
203  void AddBreakByName(const string &ShortName, const CTokenNo &B);
204 
206  void BordersEndIndexing(string Path);
207 
209  void StartTextAreaBorders();
210 
212  void EndTextAreaBorders(DWORD TextAreaEndTokenNo);
213 
214 };
215 
218 
219 
220 #endif
221 
222 /*--- emacs style variables ---
223  * Local Variables:
224  * mode: C++
225  * c-file-style: "ellemtel"
226  * c-basic-offset: 4
227  * tab-width: 8
228  * indent-tabs-mode: nil
229  * End:
230  */
vector< DWORD > DwordVector
Definition: utilit.h:148
Definition: HitBorder.h:47
Definition: HitBorder.h:67
ddcBreakVector m_BreakOffsets
the breaks themselves
Definition: HitBorder.h:82
Definition: HitBorder.h:72
int m_FileBreakCollectionNo
a quick reference to file breaks (which are also stored in m_Breaks)
Definition: HitBorder.h:113
bool operator<(const CPageNumber &X) const
Definition: HitBorder.h:56
DWORD m_PageNumber
the page number itself (as it was mentioned in the source text)
Definition: HitBorder.h:51
map< string, int > m_ShortName2BreakCollection
the map from CBreakCollection.m_ShortName to the index in m_Breaks
Definition: HitBorder.h:107
string m_DefaultBreakName
The name of the default break collection (written in the options file)
Definition: HitBorder.h:115
map< string, int > m_LongName2BreakCollection
the map from CBreakCollection.m_LongName to the index in m_Breaks
Definition: HitBorder.h:110
Definition: morph_const.h:107
CTokenNo m_StartTokenNo
the starting position (in tokens) of the beginning of the page
Definition: HitBorder.h:49
ddcVecFile< CTokenNo > ddcBreakVector
Definition: HitBorder.h:43
vector< CBreakCollection > m_Breaks
all breaks
Definition: HitBorder.h:104
string m_ShortName
short name of this break collection
Definition: HitBorder.h:74
const vector< CBreakCollection > & GetBreaks(void) const
moo: get break collection map (dangerous)
Definition: HitBorder.h:157
DWORD CTokenNo
integer type CTokenNo is used to refer to the index of a token in the corpus
Definition: ConcCommon.h:63
FILE * m_FileForIndexing
a file for temporarily storing breaks during indexing
Definition: HitBorder.h:79
string m_LongName
long name of this break collection
Definition: HitBorder.h:76
uint32_t DWORD
Definition: utilit.h:105
ddcVecFile< CPageNumber > m_PageBreaks
page number collection
Definition: HitBorder.h:118
CPageNumber(CTokenNo StartTokenNo=0, DWORD PageNumber=0)
Definition: HitBorder.h:53
vector< DWORD > m_LastTextAreaBreaks
Definition: HitBorder.h:128