ddc
UnitHolder.h
Go to the documentation of this file.
1 //
2 // This file is part of DDC.
3 //
4 // DDC is free software: you can redistribute it and/or modify
5 // it under the terms of the GNU Lesser General Public License as published by
6 // the Free Software Foundation, either version 3 of the License, or
7 // (at your option) any later version.
8 //
9 // DDC is distributed in the hope that it will be useful,
10 // but WITHOUT ANY WARRANTY; without even the implied warranty of
11 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 // GNU Lesser General Public License for more details.
13 //
14 // You should have received a copy of the GNU Lesser General Public License
15 // along with DDC. If not, see <http://www.gnu.org/licenses/>.
16 //
17 // ========== Dialing Graphematical Module (www.aot.ru)
18 // ========== Copyright by Alexey Sokirko (1996-2001), Bryan Jurish (2011)
19 
20 #ifndef __UNITSHOLDER_H_
21 #define __UNITSHOLDER_H_
22 
23 #include "graline.h"
24 
25 
27 {
28  vector<CGraLine> m_Units;
29  vector<char> m_TokenBuf;
30  vector<char> m_UnitBufUpper;
31  vector<BYTE> m_InputBuffer;
32 
33  map<size_t, short> m_FoundOborots;
34  map<size_t, DWORD> m_FoundPageBreaks;
35 public:
36  const vector<CGraLine>& GetUnits() const { return m_Units; };
37  CGraLine& GetUnit(size_t UnitNo);
38  const vector<char>& GetUnitBuf() const;
39  const vector<BYTE>& GetInputBuffer() const;
40 
42 
43  CUnitHolder();
44  // graphematical descriptors
45  void SetState (size_t LB, size_t HB, WORD state);
46  void SetDes(size_t x, Descriptors des);
47  bool HasDescr (size_t i, int descr) const { return (m_Units[i].GetDescriptors() & _QM (descr)) > 0; }
48 
49 
50  // group graphematical descriptors
51  bool AreGrouped (size_t LB, size_t HB) const;
52  bool HasGrouped (size_t LB, size_t HB) const;
53  bool HasAbbreviation (size_t LB, size_t HB) const;
54 
55 
56  // moving inside the graphematical table
57  size_t PassSpace (size_t i, size_t HB) const;
58  size_t PPunctOrSoft(size_t i, size_t HB) const;
59  size_t PPunct(size_t i, size_t HB) const;
60  size_t BSpace (size_t i, size_t LB = 0) const;
61  size_t PSoft (size_t i, size_t HB) const;
62  size_t BSoft (size_t i) const;
63 
64  // simple predicates
65  bool IsHyphen(size_t x) const;
66  bool is_latin_alpha (int ch) const;
67  bool is_lowercase (int ch) const;
68  bool is_uppercase (int ch) const;
69  bool IsOneAlpha(size_t x) const;
70  bool IsOneChar(size_t x, int i) const;
71  bool IsOneULet(size_t x) const;
72  bool FirstUpper(size_t x) const;
73  bool IsBulletWord (size_t x) const;
74  bool IsOneFullStop (size_t i) const;
75  bool EmptyLineBeforeGraph (size_t i, size_t HB) const;
76  bool IsQuestionOrExclamationMarks (size_t i) const;
77  bool IsSentenceEndMark (size_t i) const;
78  bool IsOneCloseQuotationMark (size_t i) const;
79  bool IsOneOpenQuotationMark (size_t i) const;
80 
81 
82  void FreeTable();
83  void BuildUnitBufferUpper ();
84  void InitTokenBuffer();
85  bool InitInputBuffer(const string& S);
86  void ClearInputBuffer();
87  void AddUnit(const CGraLine& NewLine);
88  const char* GetUnitBufferStart() const;
89  const char* GetUnitUpperBufferStart() const;
90  const char* GetUppercaseToken(DWORD LineNo) const;
91  string GetToken(DWORD LineNo) const;
92  size_t GetTokensCount() const;
93  DWORD GetTokenInputOffset(DWORD LineNo) const;
94  BYTE GetTokenLength(DWORD LineNo) const;
95  void DeleteDescr(size_t LineNo, Descriptors d);
96  void SetOborotNo(size_t LineNo, short OborotNo);
97  short GetOborotNo(size_t LineNo) const;
98  void SetPageNumber(size_t LineNo, DWORD PageNumber);
99  DWORD GetPageNumber(size_t LineNo) const;
100 };
101 
102 
103 
104 #endif
105 
106 /*--- emacs style variables ---
107  * Local Variables:
108  * mode: C++
109  * c-file-style: "ellemtel"
110  * c-basic-offset: 4
111  * tab-width: 8
112  * indent-tabs-mode: nil
113  * End:
114  */
const char * GetUppercaseToken(DWORD LineNo) const
Definition: UnitHolder.cpp:277
bool IsOneOpenQuotationMark(size_t i) const
Definition: UnitHolder.cpp:148
CGraLine & GetUnit(size_t UnitNo)
Definition: UnitHolder.cpp:348
vector< char > m_TokenBuf
Definition: UnitHolder.h:29
bool HasDescr(size_t i, int descr) const
Definition: UnitHolder.h:47
DWORD GetTokenInputOffset(DWORD LineNo) const
Definition: UnitHolder.cpp:297
const char * GetUnitBufferStart() const
Definition: UnitHolder.cpp:267
map< size_t, DWORD > m_FoundPageBreaks
Definition: UnitHolder.h:34
const vector< BYTE > & GetInputBuffer() const
Definition: UnitHolder.cpp:358
void SetOborotNo(size_t LineNo, short OborotNo)
Definition: UnitHolder.cpp:369
void SetPageNumber(size_t LineNo, DWORD PageNumber)
Definition: UnitHolder.cpp:387
bool IsBulletWord(size_t x) const
Definition: UnitHolder.cpp:213
bool InitInputBuffer(const string &S)
Definition: UnitHolder.cpp:327
CGraLine
Definition: graline.h:66
bool is_latin_alpha(int ch) const
Definition: UnitHolder.cpp:170
BYTE GetTokenLength(DWORD LineNo) const
Definition: UnitHolder.cpp:302
void BuildUnitBufferUpper()
Definition: UnitHolder.cpp:252
size_t PassSpace(size_t i, size_t HB) const
Definition: UnitHolder.cpp:69
bool AreGrouped(size_t LB, size_t HB) const
Definition: UnitHolder.cpp:41
short GetOborotNo(size_t LineNo) const
Definition: UnitHolder.cpp:377
bool IsOneAlpha(size_t x) const
Definition: UnitHolder.cpp:196
bool IsQuestionOrExclamationMarks(size_t i) const
Definition: UnitHolder.cpp:115
uint16_t WORD
Definition: utilit.h:106
Descriptors
Definition: gra_descr.h:26
bool EmptyLineBeforeGraph(size_t i, size_t HB) const
Definition: UnitHolder.cpp:226
void DeleteDescr(size_t LineNo, Descriptors d)
Definition: UnitHolder.cpp:342
DWORD GetPageNumber(size_t LineNo) const
Definition: UnitHolder.cpp:396
const char * GetUnitUpperBufferStart() const
Definition: UnitHolder.cpp:272
size_t GetTokensCount() const
Definition: UnitHolder.cpp:292
bool IsHyphen(size_t x) const
Definition: UnitHolder.cpp:164
map< size_t, short > m_FoundOborots
Definition: UnitHolder.h:33
#define _QM(X)
Definition: utilit.h:616
bool IsOneFullStop(size_t i) const
Definition: UnitHolder.cpp:218
size_t BSoft(size_t i) const
Definition: UnitHolder.cpp:107
size_t PPunct(size_t i, size_t HB) const
Definition: UnitHolder.cpp:83
bool IsOneULet(size_t x) const
bool HasGrouped(size_t LB, size_t HB) const
Definition: UnitHolder.cpp:51
vector< char > m_UnitBufUpper
Definition: UnitHolder.h:30
bool is_lowercase(int ch) const
Definition: UnitHolder.cpp:178
const vector< char > & GetUnitBuf() const
Definition: UnitHolder.cpp:353
void SetDes(size_t x, Descriptors des)
Definition: UnitHolder.cpp:158
size_t PSoft(size_t i, size_t HB) const
Definition: UnitHolder.cpp:98
CUnitHolder
Definition: UnitHolder.h:26
string GetToken(DWORD LineNo) const
Definition: UnitHolder.cpp:283
unsigned char BYTE
Definition: utilit.h:94
void FreeTable()
Definition: UnitHolder.cpp:243
MorphLanguageEnum
Definition: utilit.h:162
bool is_uppercase(int ch) const
Definition: UnitHolder.cpp:187
bool IsSentenceEndMark(size_t i) const
Definition: UnitHolder.cpp:125
size_t PPunctOrSoft(size_t i, size_t HB) const
Definition: UnitHolder.cpp:76
uint32_t DWORD
Definition: utilit.h:105
const vector< CGraLine > & GetUnits() const
Definition: UnitHolder.h:36
bool FirstUpper(size_t x) const
Definition: UnitHolder.cpp:208
CUnitHolder()
Definition: UnitHolder.cpp:28
void SetState(size_t LB, size_t HB, WORD state)
Definition: UnitHolder.cpp:35
void AddUnit(const CGraLine &NewLine)
Definition: UnitHolder.cpp:363
void ClearInputBuffer()
Definition: UnitHolder.cpp:337
void InitTokenBuffer()
Definition: UnitHolder.cpp:310
bool HasAbbreviation(size_t LB, size_t HB) const
Definition: UnitHolder.cpp:60
vector< BYTE > m_InputBuffer
Definition: UnitHolder.h:31
bool IsOneChar(size_t x, int i) const
Definition: UnitHolder.cpp:202
bool IsOneCloseQuotationMark(size_t i) const
Definition: UnitHolder.cpp:138
MorphLanguageEnum m_Language
Definition: UnitHolder.h:41
vector< CGraLine > m_Units
Definition: UnitHolder.h:28
size_t BSpace(size_t i, size_t LB=0) const
Definition: UnitHolder.cpp:91