TokenWriter wrapper class for writing WASTE tokenizer 'well-done' training data from pre-tokenized input with leading whitespace. More...
Public Member Functions | |
Constructors etc. | |
wasteTrainWriter (int fmt=tiofUnknown, const std::string &myname="wasteTrainer") | |
virtual | ~wasteTrainWriter () |
TokenWriter API: Output Selection | |
virtual void | to_mstream (mootio::mostream *mostreamp) |
virtual void | close (void) |
TokenWriter API: Token Stream Access | |
virtual void | put_token (const mootToken &token) |
virtual void | put_sentence (const mootSentence &sentence) |
virtual void | put_raw_buffer (const char *buf, size_t len) |
local methods | |
void | to_writer (TokenWriter *writer) |
void | flush_buffer (bool force=false) |
Public Member Functions inherited from moot::TokenWriter | |
TokenWriter (int fmt=tiofWellDone, const std::string &name="TokenWriter") | |
virtual | ~TokenWriter (void) |
virtual void | to_mstream (mootio::mostream &mos) |
virtual void | to_filename (const char *filename) |
virtual void | to_file (FILE *file) |
virtual void | to_fd (int fd) |
virtual void | to_cxxstream (std::ostream &os) |
virtual bool | opened (void) |
virtual bool | flush (void) |
bool | autoflush (mootio::mostream *os) |
virtual void | put_tokens (const mootSentence &tokens) |
virtual void | put_comment_block_begin (void) |
virtual void | put_comment_block_end (void) |
virtual void | put_comment_buffer (const char *buf, size_t len) |
virtual void | put_comment (const char *s) |
virtual void | put_comment_buffer (const std::string &s) |
virtual void | printf_comment (const char *fmt,...) |
virtual void | put_raw (const char *s) |
virtual void | put_raw (const std::string &s) |
virtual void | printf_raw (const char *fmt,...) |
virtual void | writer_name (const std::string &myname) |
virtual void | carp (const char *fmt,...) |
Static Public Member Functions | |
local static methods | |
static void | rtt_unescape (std::string &s) |
Static Public Member Functions inherited from moot::TokenIO | |
static int | parse_format_string (const std::string &fmtString) |
static int | guess_filename_format (const char *filename) |
static bool | is_empty_format (int fmt) |
static int | sanitize_format (int fmt, int fmt_implied=tiofNone, int fmt_default=tiofNone) |
static int | parse_format_request (const char *request, const char *filename=__null, int fmt_implied=tiofNone, int fmt_default=tiofNone) |
static std::string | format_canonical_string (int fmt) |
static class TokenReader * | new_reader (int fmt) |
static class TokenWriter * | new_writer (int fmt) |
static class TokenReader * | file_reader (const char *filename, const char *fmt_request=__null, int fmt_implied=tiofNone, int fmt_default=tiofNone) |
static class TokenWriter * | file_writer (const char *filename, const char *fmt_request=__null, int fmt_implied=tiofNone, int fmt_default=tiofNone) |
static size_t | pipe_tokens (class TokenReader *reader, class TokenWriter *writer) |
static size_t | pipe_sentences (class TokenReader *reader, class TokenWriter *writer) |
Public Attributes | |
local data | |
wasteTokenScanner | wt_scanner |
wasteLexerReader | wt_lexer |
TokenWriter * | wt_writer |
mootSentence | wt_segbuf |
mootToken * | wt_pseg |
std::string | wt_txtbuf |
bool | wt_at_eos |
Public Attributes inherited from moot::TokenWriter | |
int | tw_format |
std::string | tw_name |
mootio::mostream * | tw_ostream |
bool | tw_ostream_created |
bool | tw_is_comment_block |
void * | tw_data |
Input tokens should contain leading whitespace where appropriate; "\n", "\r", "\t", "\f", "\v", "\ ", and "\\" are C-style escapes. Input comments of the form "$c=TEXT" are also treated as raw text. Token text of the form "RAW $= COOKED" will be bashed to RAW.
moot::wasteTrainWriter::wasteTrainWriter | ( | int | fmt = tiofUnknown, const std::string & | myname = "wasteTrainer" | ) |
Default constructor
|
virtual moot::wasteTrainWriter::~wasteTrainWriter | ( | ) |
Destructor
|
static void moot::wasteTrainWriter::rtt_unescape | ( | std::string & | s | ) |
perform Lingua::TT::TextAlignment (*.rtt) style un-escaping in-place on s
|
virtual void moot::wasteTrainWriter::to_mstream | ( | mootio::mostream * | mostreamp | ) |
Select output to a mootio::mostream pointer; just wraps sink->to_mstream()
Reimplemented from moot::TokenWriter.
|
virtual void moot::wasteTrainWriter::close | ( | void | ) |
Finish output to currently selected sink & perform any required cleanup operations.
Reimplemented from moot::TokenWriter.
|
virtual void moot::wasteTrainWriter::put_token | ( | const mootToken & | token | ) |
Write a single token to the currently selected output sink. Descendants must override this method.
Reimplemented from moot::TokenWriter.
Referenced by put_sentence().
|
inline virtual void moot::wasteTrainWriter::put_sentence | ( | const mootSentence & | sentence | ) |
Write a single sentence to the currently selected output sink. Descendants may override this method. Default implementation just calls put_tokens().
Reimplemented from moot::TokenWriter.
References flush_buffer(), put_raw_buffer(), put_token(), moot::TokenWriter::put_tokens(), to_writer(), and moot::TokTypeEOS.
|
virtual void moot::wasteTrainWriter::put_raw_buffer | ( | const char * | buf, size_t | len | ) |
Write some data to the currently selected output sink. Descendants may override this method.
Reimplemented from moot::TokenWriter.
Referenced by put_sentence().
void moot::wasteTrainWriter::to_writer | ( | TokenWriter * | writer | ) |
Write "well-done" output to subordinate writer
Referenced by put_sentence().
void moot::wasteTrainWriter::flush_buffer | ( | bool | force = false | ) |
flush buffer to current output sink if defined
Referenced by put_sentence().
wasteTokenScanner moot::wasteTrainWriter::wt_scanner |
scanner for token-internal segmentation
wasteLexerReader moot::wasteTrainWriter::wt_lexer |
lexer for classification
TokenWriter* moot::wasteTrainWriter::wt_writer |
subordinate writer, sink for "well-done" segments
mootSentence moot::wasteTrainWriter::wt_segbuf |
local segment buffer
mootToken* moot::wasteTrainWriter::wt_pseg |
last vanilla segment (for 'S' attribute); points into wt_segbuf
std::string moot::wasteTrainWriter::wt_txtbuf |
token text buffer
bool moot::wasteTrainWriter::wt_at_eos |
whether we've seen an EOS and no vanilla token since