mootTokenXmlDoc.h
Go to the documentation of this file.
1 /* -*- Mode: C++ -*- */
2 
3 /*
4  libmoot : moocow's part-of-speech tagging library
5  Copyright (C) 2003-2005 by Bryan Jurish <moocow@cpan.org>
6 
7  This library is free software; you can redistribute it and/or
8  modify it under the terms of the GNU Lesser General Public
9  License as published by the Free Software Foundation; either
10  version 3 of the License, or (at your option) any later version.
11 
12  This library is distributed in the hope that it will be useful,
13  but WITHOUT ANY WARRANTY; without even the implied warranty of
14  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15  Lesser General Public License for more details.
16 
17  You should have received a copy of the GNU Lesser General Public
18  License along with this library; if not, write to the Free Software
19  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
20 */
21 
22 /* File: mootTokenXmlDoc.h
23  * Author: Bryan Jurish <moocow@cpan.org>
24  * Description: libxml2 TokenIO layer
25  */
26 
32 #ifndef MOOT_TOKEN_XML_DOC_IO_H
33 
34 #include <mootXmlDoc.h>
35 #ifdef MOOT_LIBXML_ENABLED
36 
37 #warning "libxml2 support for mootToken is (and always has been) BROKEN: prefer expat"
38 
39 #include <mootTypes.h>
40 #include <mootToken.h>
41 #include <mootTokenIO.h>
42 
43 moot_BEGIN_NAMESPACE
44 
45 using namespace std;
46 
54 class TokenXmlDoc : public mootXmlDoc, public TokenReader, public TokenWriter
55 {
56 public:
57  /*============================================================
58  * TokenXmlDoc: Types
59  *============================================================*/
61  //typedef hash_map<void *,xmlNodePtr> Ptr2NodeMap;
62 
63 public:
64  /*============================================================
65  * TokenXmlDoc: Data: General
66  *============================================================*/
68  std::string srcname;
69 
70  /*============================================================
71  * TokenXmlDoc: Data: Input
72  *============================================================*/
73 
74  /*----------------------------------------------------*/
78  const static int defaultParserOptions
79  = (0
80  | XML_PARSE_RECOVER // recover on errors
81  //| XML_PARSE_NOENT // substitute entities
82  //| XML_PARSE_DTDLOAD // load external DTD
83  //| XML_PARSE_DTDVALID // validate with the DTD
84  //| XML_PARSE_NOBLANKS // trim "ignorable" whitespace
85  //| XML_PARSE_NONET // forbid network access
86  | XML_PARSE_NOCDATA // merge CDATA as text nodes
87  );
89 
90 
91  /*----------------------------------------------------*/
100  mootXPathQuery xpqSentence;
101 
108  mootXPathQuery xpqToken;
109 
118  mootXPathQuery xpqText;
119 
127  mootXPathQuery xpqBesttag;
128 
138  mootXPathQuery xpqAnalysis;
139 
147  mootXPathQuery xpqPostag;
149 
150  /*----------------------------------------------------*/
153  mootToken curtok;
154  mootSentence cursent;
155  mootToken::Analysis curanal;
156 
157  //Ptr2NodeMap ptr2node; ///< maps sentence elements to their nodes
159 
160  /*============================================================
161  * TokenXmlDoc: Data: Output
162  *============================================================*/
163 
164  /*----------------------------------------------------*/
172  std::string outputRootName;
174 
175  /*----------------------------------------------------*/
178  xmlNodePtr outputNode;
179 
180 
181 public:
182  /*============================================================
183  * TokenXmlDoc: Methods
184  *============================================================*/
185 
186  /*----------------------------------------------------*/
189 
191  TokenXmlDoc(int fmt=tiofWellDone)
192  :
193  //-- general
194  srcname(""),
195  //-- input search parameters
196  xpqSentence("//s",true),
197  xpqToken("./w",true),
198  xpqText("./text/text()",true),
199  xpqBesttag("./moot.tag/text()",true),
200  xpqAnalysis(".//analysis",true),
201  xpqPostag("./@pos",true),
202  //
203  //-- low-level input data
204  //(empty)
205  //
206  //-- default output parameters
207  outputRootName("doc"),
208  outputNode(NULL)
209  {
210  //-- reader/writer format
211  if (! (fmt&tiofXML) ) fmt |= tiofXML;
212  tw_format = tr_format = fmt;
213 
214  //-- TokenReader pointers
215  tr_token = &curtok;
216  tr_sentence = &cursent;
217 
218  //-- xml options
219  xml_options = defaultParserOptions;
220  if (fmt & tiofPretty) {
221  xml_options |= XML_PARSE_NOBLANKS;
222  xml_format = true;
223  }
224  };
225 
227  virtual ~TokenXmlDoc(void)
228  {
229  //-- clear TokenReader pointers
230  tr_token = NULL;
231  tr_sentence = NULL;
232  };
233 
234  /*----------------------------------------------------
235  * mootTokenXmlDocIO: Reset
236  */
238  virtual void reset(void)
239  {
240  mootXmlDoc::reset();
241  //ptr2node.clear();
242  };
244 
245 
246  /*----------------------------------------------------*/
250  /*
251  virtual void tr_clear(void)
252  {
253  cursent.clear();
254  curtok.clear();
255  curtok.toktype(TokTypeXML);
256  curanal.clear();
257  };
258  */
259 
263  virtual void sourceName(const std::string &myname) { srcname = myname; };
264 
269  virtual void fromFile(FILE *file)
270  {
271  TokenReader::fromFile(file); //-- set tr_format, tr_source
272  loadFile(file,NULL,NULL,srcname);
273  };
274 
279  virtual void fromString(const char *s)
280  {
281  TokenReader::fromString(s); //-- set tr_format, tr_source
282  loadBuffer(s, strlen(s), NULL, NULL, srcname);
283  };
284 
288  virtual mootTokenType get_token(void);
289 
295  virtual mootTokenType get_sentence(void);
296 
298  virtual void carp(const char *fmt, ...);
300 
301 
302  /*----------------------------------------------------*/
305 
313  virtual void toString(std::string &s)
314  {
315  TokenWriter::toString(s);
316  };
317 
325  virtual void toFile(FILE *file)
326  {
327  tw_format |= tiofFile;
328  tw_format &= ~tiofString;
329  tw_sink = file;
330  };
331 
346  virtual void put_token(const mootToken &token);
347 
352  virtual void put_sentence(const mootSentence &sentence);
353 
355  void put_token_local(const mootToken &token);
356 
358  void put_token_nonlocal(const mootToken &token);
360 
361 
362  /*----------------------------------------------------*/
375  virtual bool _post_load_hook(void);
376 
378  bool evalQuery(mootXPathQuery &query, xmlNodePtr xml_ctx=NULL);
379 
386  inline bool is_local_token(const mootToken &token) const
387  {
388  return (token.toktype() == TokTypeXML
389  &&
390  ((const xmlNodePtr)(token.user_data))->doc == xml_doc);
391  };
392 
399  inline bool is_local_sentence(const mootSentence &sentence) const
400  {
401  for (mootSentence::const_iterator si = sentence.begin();
402  si != sentence.end();
403  si++)
404  {
405  if (!is_local_token(*si)) return false;
406  }
407  return true;
408  };
410 
411 };
412 
413 moot_END_NAMESPACE
414 
415 #endif /* MOOT_LIBXML_ENABLED */
416 
417 #endif /* MOOT_TOKEN_XML_DOC_IO_H */
static const int tiofWellDone
Definition: mootTokenIO.h:79
mootTokenTypeE mootTokenType
Definition: mootToken.h:85
classes and utilities for tokens and associated analyses
Pretty-print (XML only)
Definition: mootTokenIO.h:56
XML format.
Definition: mootTokenIO.h:54
C++ wrapper class for libxml2 tree-mode XML documents (optional)
list< mootToken > mootSentence
Definition: mootToken.h:630
Abstract and native classes for I/O of moot::mootToken objects.
Common typedefs and constants.