Mid-level scanner stage performs (optional) hyphenation normalization and text classification.
Public Types | |
Embedded Types | |
typedef std::vector< std::vector< std::vector< std::vector< std::vector< std::vector< std::string > > > > > > | wasteTagset |
typedef std::list< wasteLexerToken > | wasteLexerBuffer |
Public Member Functions | |
Constructors etc. | |
wasteLexer () | |
virtual | ~wasteLexer () |
Lexing functions | |
len | length_attr (size_t length) const |
void | set_token (mootToken &token, const wasteLexerToken &lex_token) |
void | buffer_token (const mootToken &stok) |
void | reset (void) |
low-level utilities | |
void | lexbuf_pop_front (void) |
Public Attributes | |
Low-level data | |
wasteTagset | wl_tagset |
int | wl_state |
wasteLexerBuffer | wl_lexbuf |
wasteLexerToken * | wl_current_tok |
wasteLexerToken * | wl_head_tok |
bool | wl_dehyph_mode |
Lexica | |
wasteLexicon | wl_stopwords |
wasteLexicon | wl_abbrevs |
wasteLexicon | wl_conjunctions |
token classification feature indices | |
enum | cls { stop = 0, rom = 1, alpha = 2, num = 3, dot = 4, comma = 5, colon = 6, scolon = 7, eos = 8, lbr = 9, rbr = 10, hyphen = 11, plus = 12, slash = 13, quote = 14, apos = 15, sc = 16, other = 17, n_cls = 18 } |
enum | cas { non = 0, lo = 1, up = 2, cap = 3, n_cas = 4 } |
enum | binary { uk = 0, kn = 1, n_binary = 2 } |
enum | len { le_null = 0, le_one = 1, le_three = 2, le_five = 3, longer = 4, n_len = 5 } |
static const int | n_hidden = 7 |
typedef std::vector<std::vector<std::vector<std::vector<std::vector<std::vector<std::string> > > > > > moot::wasteLexer::wasteTagset |
Multi-dimensional vector for constant access on feature bundles
typedef std::list<wasteLexerToken> moot::wasteLexer::wasteLexerBuffer |
List of wasteLexerToken for buffering while dehyphenating
moot::wasteLexer::wasteLexer | ( | ) |
Default constructor
|
virtual |
Destructor
|
inline |
Length to length attribute conversion
References mootTokenLexer::reset().
void moot::wasteLexer::set_token | ( | mootToken & | token, |
const wasteLexerToken & | lex_token | ||
) |
Set token features (token.tok_analyses) w.r.t. model features from source lex_token
void moot::wasteLexer::buffer_token | ( | const mootToken & | stok | ) |
copies stok to internal buffer. If wl_dehyph_mode is true, seeks and removes hyphenations.
void moot::wasteLexer::reset | ( | void | ) |
|
inline |
wrapper for wl_lexbuf.pop_front() which may clear wl_current_tok, wl_head_tok
|
static |
hidden features: s[01],S[01],w[01] (except s1,*,w0)
wasteTagset moot::wasteLexer::wl_tagset |
Token feature bundles
int moot::wasteLexer::wl_state |
Current state of the lexer
wasteLexerBuffer moot::wasteLexer::wl_lexbuf |
Buffer for dehyphenation: list of wasteLexerToken
wasteLexerToken* moot::wasteLexer::wl_current_tok |
current token under construction (NULL for none), pointer into wl_lexbuf
wasteLexerToken* moot::wasteLexer::wl_head_tok |
head of hyphenation sequence (NULL for none), pointer into wl_lexbuf
bool moot::wasteLexer::wl_dehyph_mode |
Dehyphenation switch
Referenced by moot::wasteLexerReader::dehyph_mode().
wasteLexicon moot::wasteLexer::wl_stopwords |
List of stopwords
wasteLexicon moot::wasteLexer::wl_abbrevs |
List of abbreviations
wasteLexicon moot::wasteLexer::wl_conjunctions |
List of conjunctions (for dehyphenating)