wasteLexer.h
Go to the documentation of this file.
1 /* -*- Mode: C++; coding: utf-8; c-basic-offset: 4; -*- */
2 /*
3  libmoot : moot part-of-speech tagging library
4  Copyright (C) 2013-2016 by Bryan Jurish <moocow@cpan.org> and Kay-Michael Würzner
5 
6  This library is free software; you can redistribute it and/or
7  modify it under the terms of the GNU Lesser General Public
8  License as published by the Free Software Foundation; either
9  version 3 of the License, or (at your option) any later version.
10 
11  This library is distributed in the hope that it will be useful,
12  but WITHOUT ANY WARRANTY; without even the implied warranty of
13  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14  Lesser General Public License for more details.
15 
16  You should have received a copy of the GNU Lesser General Public
17  License along with this library; if not, write to the Free Software
18  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
19 */
20 
35 #ifndef _WASTE_LEXER_H
36 #define _WASTE_LEXER_H
37 
38 #include <mootTokenIO.h>
39 #include <wasteTypes.h>
40 #include <wasteLexicon.h>
41 
42 #include <vector>
43 #include <list>
44 
45 namespace moot
46 {
47 
/**
 * \brief Bit flags for the lexer's state word.
 *
 * Every enumerator occupies its own bit, so any combination can be OR-ed
 * together into a single \c int; the \c ls_init, \c ls_head_hyph and
 * \c ls_head_hyph_nl masks below are built exactly that way.
 *
 * NOTE(review): the per-flag meanings given here are inferred from the
 * identifiers only -- confirm them against the lexer implementation.
 */
enum wasteLexer_state
{
  ls_flush   = 0x0001, ///< presumably: buffered tokens should be flushed
  ls_hyph    = 0x0002, ///< presumably: a hyphen has been seen
  ls_head    = 0x0004, ///< presumably: head part of a (hyphenated) word seen
  ls_tail    = 0x0008, ///< presumably: tail part of a (hyphenated) word seen
  ls_nl      = 0x0010, ///< presumably: a newline has been seen
  ls_sb_fw   = 0x0020, ///< presumably: sentence boundary pending for the following token
  ls_wb_fw   = 0x0040, ///< presumably: word boundary pending for the following token
  ls_blanked = 0x0080  ///< presumably: preceded by whitespace (no trailing comma: pre-C++11 compilers reject it)
};

/// Mask \c ls_wb_fw | \c ls_sb_fw | \c ls_blanked (== 0x00E0); by its name, the initial lexer state.
static const int ls_init = (ls_wb_fw | ls_sb_fw | ls_blanked);

/// Mask \c ls_head | \c ls_hyph (== 0x0006).
static const int ls_head_hyph = (ls_head | ls_hyph);

/// Mask \c ls_head | \c ls_hyph | \c ls_nl (== 0x0016).
static const int ls_head_hyph_nl = (ls_head_hyph | ls_nl);
64  /*============================================================================
65  * wasteLexerToken
66  */
67  class wasteLexerToken
68  {
69  public:
70  /*------------------------------------------------------------*/
73  mootToken wlt_token;
78  bool wlt_blanked;
79  bool s;
80  bool S;
81  bool w;
83 
84  /*--------------------------------------------------------------------
85  * wasteLexer: Methods
86  */
87  /*------------------------------------------------------------*/
91  wasteLexerToken(wasteLexerType type=wLexerTypeOther, bool blanked=true, bool bos=true, bool eos=false, bool bow=true)
92  : wlt_type(type),
93  wlt_blanked(blanked),
94  s(bos),
95  S(eos),
96  w(bow)
97  {}
99 
102 
104  inline void set_wlt_data(wasteLexerType lextype, bool blanked, bool s, bool S, bool w)
105  {
106  this->wlt_type = lextype;
107  this->wlt_blanked = blanked;
108  this->s = s;
109  this->S = S;
110  this->w = w;
111  }
113  };
114 
115  /*============================================================================
116  * wasteLexer
117  */
121  class wasteLexer
122  {
123  public:
124  //--------------------------------------------------------------------
126 
129  typedef std::vector<std::vector<std::vector<std::vector<std::vector<std::vector<std::string> > > > > > wasteTagset;
130 
132  typedef std::list<wasteLexerToken> wasteLexerBuffer;
134 
135  //--------------------------------------------------------------------
137 
138 
140  enum cls
141  {
142  stop = 0,
143  rom = 1,
144  alpha = 2,
145  num = 3,
146  dot = 4,
147  comma = 5,
148  colon = 6,
149  scolon = 7,
150  eos = 8,
151  lbr = 9,
152  rbr = 10,
153  hyphen = 11,
154  plus = 12,
155  slash = 13,
156  quote = 14,
157  apos = 15,
158  sc = 16,
159  other = 17,
160  n_cls = 18
161  };
164  enum cas
165  {
166  non = 0,
167  lo = 1,
168  up = 2,
169  cap = 3,
170  n_cas = 4
171  };
172 
174  enum binary
175  {
176  uk = 0,
177  kn = 1,
178  n_binary = 2
179  };
182  enum len
183  {
184  le_null = 0,
185  le_one = 1,
186  le_three = 2,
187  le_five = 3,
188  longer = 4,
189  n_len = 5
190  };
193  static const int n_hidden = 7;
194 
196 
197  /*--------------------------------------------------------------------
198  * wasteLexer: data
199  */
200  /*------------------------------------------------------------*/
203  wasteTagset wl_tagset;
204  int wl_state;
205  wasteLexerBuffer wl_lexbuf;
206  wasteLexerToken *wl_current_tok;
207  wasteLexerToken *wl_head_tok;
208  bool wl_dehyph_mode;
210 
211  /*------------------------------------------------------------*/
214  wasteLexicon wl_stopwords;
215  wasteLexicon wl_abbrevs;
216  wasteLexicon wl_conjunctions;
218 
219  /*--------------------------------------------------------------------
220  * wasteLexer: Methods
221  */
222  /*------------------------------------------------------------*/
226  wasteLexer();
227 
229  virtual ~wasteLexer();
231 
232  /*------------------------------------------------------------*/
235 
237  inline len length_attr(size_t length) const
238  {
239  switch (length)
240  {
241  case 0:
242  return le_null;
243  case 1:
244  return le_one;
245  case 2:
246  case 3:
247  return le_three;
248  case 4:
249  case 5:
250  return le_five;
251  default:
252  return longer;
253  }
254  }
257  void set_token(mootToken &token, const wasteLexerToken &lex_token);
263  void buffer_token(const mootToken& stok);
265  void reset(void);
268  /*------------------------------------------------------------*/
277  inline void lexbuf_pop_front(void)
278  {
279  wasteLexerToken *front = &(wl_lexbuf.front());
280  if (wl_current_tok==front) wl_current_tok=NULL;
281  if (wl_head_tok==front) wl_head_tok=NULL;
282  wl_lexbuf.pop_front();
283  };
285  };
287  /*============================================================================
288  * wasteLexerReader
289  */
291  class wasteLexerReader : public TokenReader {
292  public:
293  /*----------------------------------------
294  * wasteLexerReader: Data
295  */
297  wasteLexer lexer;
298 
300  TokenReader *scanner;
301 
303  mootToken wlr_token;
306  mootSentence wlr_sentence;
308  public:
309  //------------------------------------------------------------
319  wasteLexerReader(int fmt =tiofText,
320  const std::string &name ="wasteLexerReader");
321 
323  virtual ~wasteLexerReader(void);
325 
326  /*------------------------------------------------------------
327  * TokenReader: Input Selection
328  */
332  void from_reader(TokenReader *reader);
333 
335  virtual void from_mstream(mootio::mistream *mistreamp);
336 
341  virtual void close(void);
342 
344  inline void dehyph_mode(bool on)
345  {
346  lexer.wl_dehyph_mode = on;
347  };
349 
350  /*------------------------------------------------------------
351  * TokenReader: Token-Level Access
352  */
359  virtual mootTokenType get_token(void);
360 
366  virtual mootTokenType get_sentence(void);
368 
369  /*------------------------------------------------------------
370  * TokenReader: Diagnostics
371  */
374 
376  virtual size_t line_number(void) { return scanner ? scanner->line_number() : 0; };
379  virtual size_t line_number(size_t n) { return scanner ? scanner->line_number(n) : 0; };
380 
382  virtual size_t column_number(void) { return scanner ? scanner->column_number() : 0; };
383 
385  virtual size_t column_number(size_t n) { return scanner ? scanner->column_number(n) : 0; };
386 
388  virtual mootio::ByteOffset byte_number(void) { return scanner ? scanner->byte_number() : 0; };
389 
391  virtual mootio::ByteOffset byte_number(size_t n) { return scanner ? scanner->byte_number(n) : 0; };
393  };
394 
396 } // namespace moot
397 
398 
399 #endif /* _WASTE_LEXER_H */
Definition: mootAssocVector.h:39
wasteLexerType wlt_type
Definition: wasteLexer.h:177
wasteLexer_state
Definition: wasteLexer.h:149
wasteLexerToken(wasteLexerType type=wLexerTypeOther, bool blanked=true, bool bos=true, bool eos=false, bool bow=true)
Definition: wasteLexer.h:191
bool S
Definition: wasteLexer.h:180
wasteLexerTypeE wasteLexerType
Definition: wasteTypes.h:101
virtual mootio::ByteOffset byte_number(void)
Definition: mootTokenIO.h:500
void set_wlt_data(wasteLexerType lextype, bool blanked, bool s, bool S, bool w)
Definition: wasteLexer.h:204
Definition: wasteLexer.h:155
Definition: wasteLexer.h:151
virtual size_t column_number(void)
Definition: mootTokenIO.h:494
Abstract class for token input.
Definition: mootTokenIO.h:208
Mid-level scanner stage performs (optional) hyphenation normalization and text classification.
Definition: wasteLexer.h:221
bool s
Definition: wasteLexer.h:179
Mid-level scanner stage, wraps moot::wasteLexer in moot::TokenReader API.
Definition: wasteLexer.h:395
Common definitions for WASTE HMM-based tokenizer.
static const int ls_head_hyph_nl
Definition: wasteLexer.h:162
High-level token information object.
Definition: mootToken.h:96
literal token text included
Definition: mootTokenIO.h:57
Definition: wasteLexer.h:167
Definition: wasteLexer.h:158
simple hash_set<>-based lexicon class
Definition: wasteLexicon.h:43
virtual size_t line_number(void)
Definition: mootTokenIO.h:488
bool w
Definition: wasteLexer.h:181
Definition: wasteLexer.h:156
moot::OffsetT ByteOffset
typedef for (byte) offsets (may be unsigned)
Definition: mootIO.h:55
list< mootToken > mootSentence
Definition: mootToken.h:630
mootTokenTypeE
Definition: mootToken.h:71
mootToken wlt_token
Definition: wasteLexer.h:173
Abstract and native classes for I/O of moot::mootToken objects.
static const int ls_init
Definition: wasteLexer.h:160
bool wlt_blanked
Definition: wasteLexer.h:178
Definition: wasteLexer.h:153
Definition: wasteLexer.h:157
len
Definition: wasteLexer.h:282
simple hash_set<>-based lexicon class for moot::wasteLexer
Abstract base class for input stream wrappers.
Definition: mootIO.h:129
static const int ls_head_hyph
Definition: wasteLexer.h:161
Definition: wasteTypes.h:97
Definition: wasteLexer.h:152
Definition: wasteLexer.h:154