Main Page   Namespace List   Class Hierarchy   Alphabetical List   Compound List   File List   Namespace Members   Compound Members   File Members  

mootTokenXmlDoc.h

Go to the documentation of this file.
00001 /* -*- Mode: C++ -*- */
00002 
00003 /*
00004    libmoot : moocow's part-of-speech tagging library
00005    Copyright (C) 2003-2004 by Bryan Jurish <moocow@ling.uni-potsdam.de>
00006 
00007    This program is free software; you can redistribute it and/or modify
00008    it under the terms of the GNU General Public License as published by
00009    the Free Software Foundation; either version 2 of the License, or
00010    (at your option) any later version.
00011 
00012    This program is distributed in the hope that it will be useful,
00013    but WITHOUT ANY WARRANTY; without even the implied warranty of
00014    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
00015    GNU General Public License for more details.
00016 
00017    You should have received a copy of the GNU General Public License
00018    along with this program; if not, write to the Free Software
00019    Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307 USA
00020 */
00021 
00022 /* File: mootTokenXmlDoc.h
00023  * Author: Bryan Jurish <moocow@ling.uni-potsdam.de>
00024  * Description: libxml2 TokenIO layer
00025  */
00026 
00027 #ifndef MOOT_TOKEN_XML_DOC_IO_H
00028 
00029 #include <mootXmlDoc.h>
00030 #ifdef MOOT_LIBXML_ENABLED
00031 
00032 #include <mootTypes.h>
00033 #include <mootToken.h>
00034 #include <mootTokenIO.h>
00035 
00036 moot_BEGIN_NAMESPACE
00037 
00038 using namespace std;
00039 
00047 class TokenXmlDoc : public mootXmlDoc, public TokenReader, public TokenWriter
00048 {
00049 public:
00050   /*============================================================
00051    * TokenXmlDoc: Types
00052    *============================================================*/
00054   //typedef hash_map<void *,xmlNodePtr> Ptr2NodeMap;
00055 
00056 public:
00057   /*============================================================
00058    * TokenXmlDoc: Data: General
00059    *============================================================*/
00061   std::string srcname;
00062 
00063   /*============================================================
00064    * TokenXmlDoc: Data: Input
00065    *============================================================*/
00066 
00067   /*----------------------------------------------------*/
00071   const static int defaultParserOptions
00072       = (0
00073          | XML_PARSE_RECOVER  // recover on errors
00074          //| XML_PARSE_NOENT    // substitute entities
00075          //| XML_PARSE_DTDLOAD  // load external DTD
00076          //| XML_PARSE_DTDVALID // validate with the DTD
00077          //| XML_PARSE_NOBLANKS // trim "ignorable" whitespace
00078          //| XML_PARSE_NONET    // forbid network access
00079          | XML_PARSE_NOCDATA  // merge CDATA as text nodes
00080          );
00082 
00083 
00084   /*----------------------------------------------------*/
00093   mootXPathQuery  xpqSentence;
00094 
00101   mootXPathQuery  xpqToken;
00102 
00111   mootXPathQuery  xpqText;
00112 
00120   mootXPathQuery  xpqBesttag;
00121 
00131   mootXPathQuery  xpqAnalysis;
00132 
00140   mootXPathQuery  xpqPostag;
00142 
00143   /*----------------------------------------------------*/
00146   mootToken            curtok;   
00147   mootSentence         cursent;  
00148   mootToken::Analysis  curanal;  
00149 
00150   //Ptr2NodeMap          ptr2node; ///< maps sentence elements to their nodes
00152 
00153   /*============================================================
00154    * TokenXmlDoc: Data: Output
00155    *============================================================*/
00156 
00157   /*----------------------------------------------------*/
00165   std::string    outputRootName;
00167 
00168   /*----------------------------------------------------*/
00171   xmlNodePtr      outputNode;  
00172 
00173 
00174 public:
00175   /*============================================================
00176    * TokenXmlDoc: Methods
00177    *============================================================*/
00178 
00179   /*----------------------------------------------------*/
00182 
00184   TokenXmlDoc(int fmt=tiofWellDone)
00185     :
00186     //-- general
00187     srcname(""),
00188     //-- input search parameters
00189     xpqSentence("//s",true),
00190     xpqToken("./w",true),
00191     xpqText("./text/text()",true),
00192     xpqBesttag("./moot.tag/text()",true),
00193     xpqAnalysis(".//analysis",true),
00194     xpqPostag("./@pos",true),
00195     //
00196     //-- low-level input data
00197     //(empty)
00198     //
00199     //-- default output parameters
00200     outputRootName("doc"),
00201     outputNode(NULL)
00202   {
00203     //-- reader/writer format
00204     if (! (fmt&tiofXML) ) fmt |= tiofXML;
00205     tw_format = tr_format = fmt;
00206 
00207     //-- TokenReader pointers
00208     tr_token = &curtok;
00209     tr_sentence = &cursent;
00210 
00211     //-- xml options
00212     xml_options = defaultParserOptions;
00213     if (fmt & tiofPretty) {
00214       xml_options |= XML_PARSE_NOBLANKS;
00215       xml_format = true;
00216     }
00217   };
00218 
00220   virtual ~TokenXmlDoc(void)
00221   {
00222     //-- clear TokenReader pointers
00223     tr_token = NULL;
00224     tr_sentence = NULL;
00225   };
00226 
00227   /*----------------------------------------------------
00228    * mootTokenXmlDocIO: Reset
00229    */
00231   virtual void reset(void)
00232   {
00233     mootXmlDoc::reset();
00234     //ptr2node.clear();
00235   };
00237 
00238 
00239   /*----------------------------------------------------*/
00243     /*
00244   virtual void tr_clear(void)
00245   {
00246     cursent.clear();
00247     curtok.clear(); 
00248     curtok.toktype(TokTypeXML);
00249     curanal.clear();
00250   };
00251     */
00252 
00256   virtual void sourceName(const std::string &myname) { srcname = myname; };
00257 
00262   virtual void fromFile(FILE *file)
00263   {
00264     TokenReader::fromFile(file); //-- set tr_format, tr_source
00265     loadFile(file,NULL,NULL,srcname);
00266   };
00267 
00272   virtual void fromString(const char *s)
00273   {
00274     TokenReader::fromString(s); //-- set tr_format, tr_source
00275     loadBuffer(s, strlen(s), NULL, NULL, srcname);
00276   };
00277 
00281   virtual mootTokenType get_token(void);
00282 
00288   virtual mootTokenType get_sentence(void);
00289 
00291   virtual void carp(const char *fmt, ...);
00293 
00294 
00295   /*----------------------------------------------------*/
00298 
00306   virtual void toString(std::string &s)
00307   {
00308     TokenWriter::toString(s);
00309   };
00310 
00318   virtual void toFile(FILE *file)
00319   {
00320     tw_format |= tiofFile;
00321     tw_format &= ~tiofString;
00322     tw_sink = file;
00323   };
00324 
00339   virtual void put_token(const mootToken &token);
00340 
00345   virtual void put_sentence(const mootSentence &sentence);
00346 
00348   void put_token_local(const mootToken &token);
00349 
00351   void put_token_nonlocal(const mootToken &token);
00353 
00354 
00355   /*----------------------------------------------------*/
00368   virtual bool _post_load_hook(void);
00369 
00371   bool evalQuery(mootXPathQuery &query, xmlNodePtr xml_ctx=NULL);
00372 
00379   inline bool is_local_token(const mootToken &token) const
00380   {
00381     return (token.toktype() == TokTypeXML
00382             &&
00383             ((const xmlNodePtr)(token.user_data))->doc == xml_doc);
00384   };
00385 
00392   inline bool is_local_sentence(const mootSentence &sentence) const
00393   {
00394     for (mootSentence::const_iterator si = sentence.begin();
00395          si != sentence.end();
00396          si++)
00397       {
00398         if (!is_local_token(*si)) return false;
00399       }
00400     return true;
00401   };
00403 
00404 };
00405 
00406 moot_END_NAMESPACE
00407 
00408 #endif /* MOOT_LIBXML_ENABLED */
00409 
00410 #endif /* MOOT_TOKEN_XML_DOC_IO_H */

Generated on Wed Jul 28 15:48:03 2004 for libmoot by doxygen 1.2.15