Main Page | Directories | Namespace List | Class Hierarchy | Alphabetical List | Class List | File List | Namespace Members | Class Members | File Members

mootTokenXmlDoc.h

Go to the documentation of this file.
00001 /* -*- Mode: C++ -*- */
00002 
00003 /*
00004    libmoot : moocow's part-of-speech tagging library
00005    Copyright (C) 2003-2004 by Bryan Jurish <moocow@ling.uni-potsdam.de>
00006 
00007    This library is free software; you can redistribute it and/or
00008    modify it under the terms of the GNU Lesser General Public
00009    License as published by the Free Software Foundation; either
00010    version 2.1 of the License, or (at your option) any later version.
00011    
00012    This library is distributed in the hope that it will be useful,
00013    but WITHOUT ANY WARRANTY; without even the implied warranty of
00014    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
00015    Lesser General Public License for more details.
00016    
00017    You should have received a copy of the GNU Lesser General Public
00018    License along with this library; if not, write to the Free Software
00019    Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
00020 */
00021 
00022 /* File: mootTokenXmlDoc.h
00023  * Author: Bryan Jurish <moocow@ling.uni-potsdam.de>
00024  * Description: libxml2 TokenIO layer
00025  */
00026 
00027 #ifndef MOOT_TOKEN_XML_DOC_IO_H
00028 #define MOOT_TOKEN_XML_DOC_IO_H /* include guard: makes repeated inclusion of this header a no-op */
00029 #include <mootXmlDoc.h>
00030 #ifdef MOOT_LIBXML_ENABLED
00031 
00032 #include <mootTypes.h>
00033 #include <mootToken.h>
00034 #include <mootTokenIO.h>
00035 
00036 moot_BEGIN_NAMESPACE
00037 
00038 using namespace std; //-- NOTE(review): using-directive in a header; it makes std names visible to all code that includes this file (within whatever scope moot_BEGIN_NAMESPACE opens). Kept as-is because includers may rely on it.
00039 
00047 class TokenXmlDoc : public mootXmlDoc, public TokenReader, public TokenWriter
00048 {
00049 public:
00050   /*============================================================
00051    * TokenXmlDoc: Types
00052    *============================================================*/
00054   //typedef hash_map<void *,xmlNodePtr> Ptr2NodeMap;
00055 
00056 public:
00057   /*============================================================
00058    * TokenXmlDoc: Data: General
00059    *============================================================*/
00061   std::string srcname;
00062 
00063   /*============================================================
00064    * TokenXmlDoc: Data: Input
00065    *============================================================*/
00066 
00067   /*----------------------------------------------------*/
00071   const static int defaultParserOptions
00072       = (0
00073          | XML_PARSE_RECOVER  // recover on errors
00074          //| XML_PARSE_NOENT    // substitute entities
00075          //| XML_PARSE_DTDLOAD  // load external DTD
00076          //| XML_PARSE_DTDVALID // validate with the DTD
00077          //| XML_PARSE_NOBLANKS // trim "ignorable" whitespace
00078          //| XML_PARSE_NONET    // forbid network access
00079          | XML_PARSE_NOCDATA  // merge CDATA as text nodes
00080          );
00082 
00083 
00084   /*----------------------------------------------------*/
00093   mootXPathQuery  xpqSentence;
00094 
00101   mootXPathQuery  xpqToken;
00102 
00111   mootXPathQuery  xpqText;
00112 
00120   mootXPathQuery  xpqBesttag;
00121 
00131   mootXPathQuery  xpqAnalysis;
00132 
00140   mootXPathQuery  xpqPostag;
00142 
00143   /*----------------------------------------------------*/
00146   mootToken            curtok;   
00147   mootSentence         cursent;  
00148   mootToken::Analysis  curanal;  
00149 
00150   //Ptr2NodeMap          ptr2node; ///< maps sentence elements to their nodes
00152 
00153   /*============================================================
00154    * TokenXmlDoc: Data: Output
00155    *============================================================*/
00156 
00157   /*----------------------------------------------------*/
00165   std::string    outputRootName;
00167 
00168   /*----------------------------------------------------*/
00171   xmlNodePtr      outputNode;  
00172 
00173 
00174 public:
00175   /*============================================================
00176    * TokenXmlDoc: Methods
00177    *============================================================*/
00178 
00179   /*----------------------------------------------------*/
00182 
00184   TokenXmlDoc(int fmt=tiofWellDone)
00185     :
00186     //-- general
00187     srcname(""),
00188     //-- input search parameters
00189     xpqSentence("//s",true),
00190     xpqToken("./w",true),
00191     xpqText("./text/text()",true),
00192     xpqBesttag("./moot.tag/text()",true),
00193     xpqAnalysis(".//analysis",true),
00194     xpqPostag("./@pos",true),
00195     //
00196     //-- low-level input data
00197     //(empty)
00198     //
00199     //-- default output parameters
00200     outputRootName("doc"),
00201     outputNode(NULL)
00202   {
00203     //-- reader/writer format
00204     if (! (fmt&tiofXML) ) fmt |= tiofXML;
00205     tw_format = tr_format = fmt;
00206 
00207     //-- TokenReader pointers
00208     tr_token = &curtok;
00209     tr_sentence = &cursent;
00210 
00211     //-- xml options
00212     xml_options = defaultParserOptions;
00213     if (fmt & tiofPretty) {
00214       xml_options |= XML_PARSE_NOBLANKS;
00215       xml_format = true;
00216     }
00217   };
00218 
00220   virtual ~TokenXmlDoc(void)
00221   {
00222     //-- clear TokenReader pointers
00223     tr_token = NULL;
00224     tr_sentence = NULL;
00225   };
00226 
00227   /*----------------------------------------------------
00228    * mootTokenXmlDocIO: Reset
00229    */
00231   virtual void reset(void)
00232   {
00233     mootXmlDoc::reset();
00234     //ptr2node.clear();
00235   };
00237 
00238 
00239   /*----------------------------------------------------*/
00243     /*
00244   virtual void tr_clear(void)
00245   {
00246     cursent.clear();
00247     curtok.clear(); 
00248     curtok.toktype(TokTypeXML);
00249     curanal.clear();
00250   };
00251     */
00252 
00256   virtual void sourceName(const std::string &myname) { srcname = myname; };
00257 
00262   virtual void fromFile(FILE *file)
00263   {
00264     TokenReader::fromFile(file); //-- set tr_format, tr_source
00265     loadFile(file,NULL,NULL,srcname);
00266   };
00267 
00272   virtual void fromString(const char *s)
00273   {
00274     TokenReader::fromString(s); //-- set tr_format, tr_source
00275     loadBuffer(s, strlen(s), NULL, NULL, srcname);
00276   };
00277 
00281   virtual mootTokenType get_token(void);
00282 
00288   virtual mootTokenType get_sentence(void);
00289 
00291   virtual void carp(const char *fmt, ...);
00293 
00294 
00295   /*----------------------------------------------------*/
00298 
00306   virtual void toString(std::string &s)
00307   {
00308     TokenWriter::toString(s);
00309   };
00310 
00318   virtual void toFile(FILE *file)
00319   {
00320     tw_format |= tiofFile;
00321     tw_format &= ~tiofString;
00322     tw_sink = file;
00323   };
00324 
00339   virtual void put_token(const mootToken &token);
00340 
00345   virtual void put_sentence(const mootSentence &sentence);
00346 
00348   void put_token_local(const mootToken &token);
00349 
00351   void put_token_nonlocal(const mootToken &token);
00353 
00354 
00355   /*----------------------------------------------------*/
00368   virtual bool _post_load_hook(void);
00369 
00371   bool evalQuery(mootXPathQuery &query, xmlNodePtr xml_ctx=NULL);
00372 
00379   inline bool is_local_token(const mootToken &token) const
00380   {
00381     return (token.toktype() == TokTypeXML
00382             &&
00383             ((const xmlNodePtr)(token.user_data))->doc == xml_doc);
00384   };
00385 
00392   inline bool is_local_sentence(const mootSentence &sentence) const
00393   {
00394     for (mootSentence::const_iterator si = sentence.begin();
00395          si != sentence.end();
00396          si++)
00397       {
00398         if (!is_local_token(*si)) return false;
00399       }
00400     return true;
00401   };
00403 
00404 };
00405 
00406 moot_END_NAMESPACE
00407 
00408 #endif /* MOOT_LIBXML_ENABLED */
00409 
00410 #endif /* MOOT_TOKEN_XML_DOC_IO_H */

Generated on Mon Jun 27 13:05:25 2005 for libmoot by  doxygen 1.3.8-20040913