mootToken.h
Go to the documentation of this file.
1 /* -*- Mode: C++ -*- */
2 
3 /*
4  libmoot : moocow's part-of-speech tagging library
5  Copyright (C) 2003-2014 by Bryan Jurish <moocow@cpan.org>
6 
7  This library is free software; you can redistribute it and/or
8  modify it under the terms of the GNU Lesser General Public
9  License as published by the Free Software Foundation; either
10  version 3 of the License, or (at your option) any later version.
11 
12  This library is distributed in the hope that it will be useful,
13  but WITHOUT ANY WARRANTY; without even the implied warranty of
14  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15  Lesser General Public License for more details.
16 
17  You should have received a copy of the GNU Lesser General Public
18  License along with this library; if not, write to the Free Software
19  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
20 */
21 
22 /*--------------------------------------------------------------------------
23  * File: mootToken.h
24  * Author: Bryan Jurish <moocow@cpan.org>
25  * Description:
26  * + moocow's PoS tagger : token information
27  *--------------------------------------------------------------------------*/
28 
34 #ifndef _moot_TOKEN_H
35 #define _moot_TOKEN_H
36 
37 #include <ctype.h>
38 
39 #include <list>
40 #include <vector>
41 #include <mootTypes.h>
42 
47 #define MOOT_TNT_COMPAT 1
48 //#undef MOOT_TNT_COMPAT
49 
54 #define MOOT_TOKEN_DEBUG 1
55 #undef MOOT_TOKEN_DEBUG
56 
57 #ifdef MOOT_TOKEN_DEBUG
58 # define TOKDEBUG(x) x
59 #else
60 # define TOKDEBUG(x)
61 #endif
62 
63 namespace moot {
64  using namespace std;
65 
66 /*----------------------------------------------------------------------
67  * Basic Types
68  *----------------------------------------------------------------------*/
69 
/** Tag strings are ordinary std::string values. */
typedef std::string mootTagString;

/** Token-text strings are ordinary std::string values. */
typedef std::string mootTokString;

/** Ordered, duplicate-free collection of tag strings. */
typedef std::set<mootTagString> mootTagSet;
78 
79 
80 /*----------------------------------------------------------------------
81  * Basic Token Types
82  *----------------------------------------------------------------------*/
84  /* Output token-types */
90  TokTypeEOS,
91  TokTypeEOF,
92  TokTypeWB,
93  TokTypeSB,
94  TokTypeUser,
95  NTokTypes
96 };
98 
100 extern const char* mootTokenTypeNames[NTokTypes];
101 
102 /*--------------------------------------------------------------------------
103  * mootToken
104  *--------------------------------------------------------------------------*/
108 class mootToken {
109 public:
110  /*---------------------------------------------------------------------
111  * Embedded Types
112  */
113 
115  typedef float Cost;
116 
118  class Analysis {
119  public:
121  //mootTokenType type;
122 
124  mootTagString tag;
125 
127  mootTagString details;
128 
130  ProbT prob;
131 
136  void *data;
137 
138  /*--------------------------------------------------
139  * Constructor / Destructor
140  */
142  Analysis(void)
143  : prob(0),
144  data(NULL)
145  {};
146 
148  Analysis(const mootTagString &my_tag
149  //, mootTokenType typ=TokTypeVanilla
150  )
151  : tag(my_tag),
152  prob(0),
153  data(NULL)
154  {};
155 
157  Analysis(const mootTagString &my_tag,
158  const mootTagString &my_details)
159  : tag(my_tag),
160  details(my_details),
161  prob(0),
162  data(NULL)
163  {};
164 
166  Analysis(const mootTagString &my_tag,
167  const mootTagString &my_details,
168  const ProbT my_prob)
169  : tag(my_tag),
170  details(my_details),
171  prob(my_prob),
172  data(NULL)
173  {};
174 
176  Analysis(const Analysis &x)
177  : tag(x.tag),
178  details(x.details),
179  prob(x.prob),
180  data(x.data)
181  {};
182 
184  inline void clear(void) {
185  tag.clear();
186  details.clear();
187  prob = 0;
188  };
189 
191  inline bool empty(void) const {
192  return tag.empty() && details.empty();
193  }
194 
196  friend bool operator<(const Analysis &x, const Analysis &y)
197  {
198  if (x.prob != y.prob) return x.prob < y.prob;
199  int tcomp = x.tag.compare(y.tag);
200  return (tcomp < 0
201  ? true
202  : (tcomp > 0
203  ? false
204  : x.details < y.details));
205  };
206 
208  friend bool operator==(const Analysis &x, const Analysis &y)
209  {
210  return x.prob == y.prob && x.tag == y.tag && x.details == y.details;
211  }
212 
214  void dump(const char *label=NULL, FILE *f=NULL) const;
215  }; //-- /mootToken::Analysis
216 
217 
219  //typedef set<Analysis> AnalysisSet;
220  //typedef vector<Analysis> Analyses;
221  typedef list<Analysis> Analyses;
222 
224  class Location {
225  public:
228  public:
230  inline Location(const OffsetT my_offset=0, const OffsetT my_length=0)
231  : offset(my_offset),
232  length(my_length)
233  {};
234 
236  inline Location(const Location &x)
237  : offset(x.offset),
238  length(x.length)
239  {};
240 
242  inline void clear(void)
243  {
244  offset = 0;
245  length = 0;
246  };
247  }; //-- /mootToken::Location
248 
249 public:
250  /*---------------------------------------------------------------------*
251  * Data Members
252  */
253 
257  mootTokenType tok_type;
258 
262  mootTokString tok_text;
263 
267  mootTagString tok_besttag;
268 
272  Analyses tok_analyses;
273 
278 
283  void *tok_data;
284 
285 public:
286  /*------------------------------------------------------------
287  * Constructors / Destructors
288  */
290  mootToken(mootTokenType type=TokTypeVanilla)
291  : tok_type(type),
292  tok_data(NULL)
293  {};
294 
296  mootToken(const mootTokString &text, mootTokenType type=TokTypeVanilla)
297  : tok_type(type),
298  tok_text(text),
299  tok_data(NULL)
300  {};
301 
303  mootToken(const mootTokString &text,
304  const Analyses &analyses)
305  : tok_type(TokTypeVanilla),
306  tok_text(text),
307  tok_analyses(analyses),
308  tok_data(NULL)
309  {};
310 
312  mootToken(const mootTokString &text,
313  const Analyses &analyses,
314  const mootTagString &besttag)
315  : tok_type(TokTypeVanilla),
316  tok_text(text),
317  tok_besttag(besttag),
318  tok_analyses(analyses),
319  tok_data(NULL)
320  {};
321 
323  mootToken(const mootToken& x)
324  : tok_type(x.tok_type),
325  tok_text(x.tok_text),
326  tok_besttag(x.tok_besttag),
327  tok_analyses(x.tok_analyses),
328  tok_location(x.tok_location),
329  tok_data(x.tok_data)
330  {};
333  /*
334  mootToken(mootTokenType type=TokTypeVanilla, const char *text, size_t len)
335  : tok_type(type),
336  tok_text(text,len),
337  tok_data(NULL)
338  {};
339  */
340 
341  /* Destructor */
342  ~mootToken(void) {
343 #if 0
344  if (tok_location.offset>=680307 && tok_location.offset<=680400) {
345  dump(); //-- debug
346  }
347 #endif
348  };
349 
350  /*------------------------------------------------------------
351  * Operators
352  */
354  friend bool operator==(const mootToken &x, const mootToken &y)
355  {
356  return
357  x.tok_type == y.tok_type
358  && x.tok_text == y.tok_text
359  && x.tok_besttag == y.tok_besttag
360  && x.tok_analyses == y.tok_analyses;
361  };
364  friend bool operator <(const mootToken &x, const mootToken &y)
365  {
366  return
367  x.tok_text < y.tok_text
368  || x.tok_besttag < y.tok_besttag
369  || x.tok_analyses < y.tok_analyses;
370  };
371 
373  inline mootToken& operator =(const mootToken &x)
374  {
375  tok_type = x.tok_type;
376  tok_text = x.tok_text;
377  tok_besttag = x.tok_besttag;
378  tok_analyses = x.tok_analyses;
379  tok_location = x.tok_location;
380  tok_data = x.tok_data;
381  return *this;
382  };
383 
384  /*------------------------------------------------------------
385  * Manipulators: General
386  */
388  inline void clear(void) {
389  tok_type = TokTypeVanilla;
390  tok_text.clear();
391  tok_besttag.clear();
392  tok_analyses.clear();
393  tok_location.clear();
394  };
395 
396  /*------------------------------------------------------------
397  * Manipulators: specific
398  */
400  inline const mootTokString &text(void) const {
401  return tok_text;
402  };
404  inline mootTokString &text(const mootTokString &text) {
405  tok_text = text;
406  return tok_text;
407  };
409  inline mootTokString &text(const char *s, size_t len) {
410  tok_text.assign(s,len);
411  return tok_text;
412  };
414  inline mootTokString &textAppend(const mootTokString &text) {
415  tok_text.append(text);
416  return tok_text;
417  };
419  inline mootTokString &textAppend(const char *s, size_t len) {
420  tok_text.append(s, len);
421  return tok_text;
422  };
423 
425  inline const mootTagString &besttag(void) const {
426  return tok_besttag;
427  };
429  inline mootTagString &besttag(const mootTagString &besttag) {
430  tok_besttag = besttag;
431  return tok_besttag;
432  };
434  inline mootTagString &besttagAppend(const mootTagString &text) {
435  tok_besttag.append(text);
436  return tok_besttag;
437  };
439  inline mootTagString &besttagAppend(const char *s, size_t len) {
440  tok_besttag.append(s, len);
441  return tok_besttag;
442  };
443 
445  inline mootTokenType toktype(void) const { return tok_type; }
447  inline mootTokenType toktype(const mootTokenType type) {
448  tok_type = type;
449  return tok_type;
450  };
451 
453  inline const Analyses &analyses(void) const { return tok_analyses; };
455  inline const Analyses &analyses(const Analyses &analyses) {
456  tok_analyses = analyses;
457  return tok_analyses;
458  };
460  inline void insert(const Analysis &analysis)
461  {
462  //tok_analyses.insert(analysis);
463  tok_analyses.push_back(analysis);
464  };
466  inline void insert(const mootTagString &tag, const mootTagString &details, ProbT p=0)
467  {
468  //insert(Analysis(tag,details));
469  tok_analyses.push_back(Analysis());
470  tok_analyses.back().tag = tag;
471  tok_analyses.back().details = details;
472  tok_analyses.back().prob = p;
473  };
475  inline void insert(const char *tag, const char *details, ProbT p=0)
476  {
477  //insert(Analysis(tag,details));
478  tok_analyses.push_back(Analysis());
479  tok_analyses.back().tag = tag;
480  tok_analyses.back().details = details;
481  tok_analyses.back().prob = p;
482  };
484  inline bool has_analysis_for_tag(const mootTagString &tag) const
485  {
486  for (Analyses::const_iterator asi = tok_analyses.begin();
487  asi != tok_analyses.end();
488  asi++)
489  {
490  if (asi->tag == tag) return true;
491  }
492  return false;
493  };
495  inline void erase(const Analysis &analysis)
496  {
497  for (Analyses::iterator asi = tok_analyses.begin();
498  asi != tok_analyses.end();
499  )
500  {
501  if (*asi == analysis) tok_analyses.erase(asi);
502  else asi++;
503  }
504  };
506  inline void prune(void)
507  {
508  for (Analyses::iterator asi = tok_analyses.begin();
509  asi != tok_analyses.end();
510  )
511  {
512  if (asi->tag != tok_besttag) tok_analyses.erase(asi);
513  else asi++;
514  }
515  };
516 
518  inline const Location &location(void) const
519  { return tok_location; }
520 
522  inline const Location &location(const Location &loc)
523  { tok_location=loc; return location(); }
524 
526  inline const Location &location(const OffsetT offset, const OffsetT length=0)
527  { return location(Location(offset,length)); };
528 
530  inline OffsetT loc_offset(void) const { return tok_location.offset; };
531 
533  inline OffsetT loc_offset(const OffsetT off)
534  { tok_location.offset=off; return loc_offset(); };
535 
537  inline OffsetT loc_length(void) const { return tok_location.length; };
540  inline OffsetT loc_length(const OffsetT len)
541  { tok_location.length=len; return loc_length(); };
542 
543 
544  /*------------------------------------------------------------
545  * Compatibility
546  */
553  inline void tokImport(const mootTokString *src_toktext=NULL,
554  const mootTagSet *src_tagset=NULL)
555  {
556  if (src_toktext) tok_text = *src_toktext;
557  if (src_tagset) {
558  for (mootTagSet::const_iterator tsi = src_tagset->begin();
559  tsi != src_tagset->end();
560  tsi++)
561  {
562  insert(Analysis(*tsi));
563  }
564  }
565  };
566 
577  inline void tokExport(mootTokString *dst_toktext=NULL,
578  mootTagSet *dst_tagset=NULL,
579  bool want_besttag_in_tagset = true) const
580  {
581  if (dst_toktext) *dst_toktext = tok_text;
582  if (dst_tagset) {
583  for (Analyses::const_iterator asi = tok_analyses.begin();
584  asi != tok_analyses.end();
585  asi++
586  //asi = upper_bound(asi->tag)
587  )
588  {
589  dst_tagset->insert(asi->tag);
590  }
591  if (want_besttag_in_tagset && !tok_besttag.empty())
592  dst_tagset->insert(tok_besttag);
593  }
594  };
595 
596  /*------------------------------------------------------------
597  * Debugging
598  */
600  void dump(const char *label=NULL, FILE *f=NULL) const;
601 
602 }; //-- /mootToken
603 
604 
605 /*--------------------------------------------------------------------------
606  * mootSentence
607  *--------------------------------------------------------------------------*/
608 
610 typedef list<mootToken> mootSentence;
611 
613 //typedef vector<mootToken> mootSentence;
614 
616 mootToken &sentence_printf_append(mootSentence &s, mootTokenType typ, const char *fmt, ...);
617 
618 /*----------------------------------------------------------------------
619  * Pattern-based Typification
620  * - obsolete; see mootFlavor.h
621  *----------------------------------------------------------------------*/
622 
623 }; /* namespace moot */
624 
625 #endif /* _moot_TOKEN_H */
Definition: mootAssocVector.h:39
Type for a single morphological analysis.
Definition: mootToken.h:106
Definition: mootToken.h:81
Definition: mootToken.h:224
mootTokenType tok_type
Definition: mootToken.h:257
Analyses tok_analyses
Definition: mootToken.h:272
mootToken & sentence_printf_append(mootSentence &s, mootTokenType typ, const char *fmt,...)
OffsetT length
(byte) length in input stream (default=0)
Definition: mootToken.h:227
void * data
Definition: mootToken.h:124
const char * mootTokenTypeNames[NTokTypes]
Definition: mootToken.h:76
Definition: mootToken.h:82
mootTokenTypeE mootTokenType
Definition: mootToken.h:85
OffsetT offset
(byte) offset in input stream (default=0)
Definition: mootToken.h:226
Definition: mootToken.h:83
Definition: mootToken.h:73
Location(const OffsetT my_offset=0, const OffsetT my_length=0)
Definition: mootToken.h:230
set< mootTagString > mootTagSet
Definition: mootToken.h:65
High-level token information object.
Definition: mootToken.h:96
float ProbT
Definition: mootTypes.h:63
Definition: mootToken.h:79
Location(const Location &x)
Definition: mootToken.h:236
mootTagString tag
Definition: mootToken.h:112
list< mootToken > mootSentence
Definition: mootToken.h:630
void * tok_data
Definition: mootToken.h:283
mootTokenTypeE
Definition: mootToken.h:71
ProbT prob
Definition: mootToken.h:118
Definition: mootToken.h:78
Definition: mootToken.h:75
float Cost
Definition: mootToken.h:103
string mootTagString
Definition: mootToken.h:59
const Location & location(void) const
Definition: mootToken.h:526
Common typedefs and constants.
Definition: mootToken.h:80
Definition: mootToken.h:77
mootTagString tok_besttag
Definition: mootToken.h:267
long unsigned int OffsetT
Definition: mootTypes.h:70
list< Analysis > Analyses
Definition: mootToken.h:221
Definition: mootToken.h:74
mootTokString tok_text
Definition: mootToken.h:262
mootTagString details
Definition: mootToken.h:115
Location tok_location
Definition: mootToken.h:277
mootToken(mootTokenType type=TokTypeVanilla)
Definition: mootToken.h:290
string mootTokString
Definition: mootToken.h:62
void clear(void)
Definition: mootToken.h:242