mootRecode.h
Go to the documentation of this file.
1 /* -*- Mode: C++ -*- */
2 
3 /*
4  libmoot : moocow's part-of-speech tagging library
5  Copyright (C) 2003-2009 by Bryan Jurish <moocow@cpan.org>
6 
7  This library is free software; you can redistribute it and/or
8  modify it under the terms of the GNU Lesser General Public
9  License as published by the Free Software Foundation; either
10  version 3 of the License, or (at your option) any later version.
11 
12  This library is distributed in the hope that it will be useful,
13 
14  but WITHOUT ANY WARRANTY; without even the implied warranty of
15  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16  Lesser General Public License for more details.
17 
18  You should have received a copy of the GNU Lesser General Public
19  License along with this library; if not, write to the Free Software
20  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
21 */
22 
23 /*--------------------------------------------------------------------------
24  * File: mootRecode.h
25  * Author: Bryan Jurish <moocow@cpan.org>
26  * Description:
27  * + moocow's PoS tagger : librecode interface
28  *--------------------------------------------------------------------------*/
29 
35 #ifndef _MOOT_RECODE_H
36 #define _MOOT_RECODE_H
37 
38 #include <mootIO.h>
39 
40 #ifdef MOOT_RECODE_ENABLED
41 #include <stdbool.h>
42 #include <recodext.h>
43 #endif // MOOT_RECODE_ENABLED
44 
45 
46 namespace moot {
47 
48 using namespace std;
49 
51 class mootRecoder {
52 public:
53  /*--------------------------------------------------------------
54  * DATA
55  */
56 #ifdef MOOT_RECODE_ENABLED
57  RECODE_OUTER rc_outer;
58  RECODE_REQUEST rc_request;
59 #else
60  void * rc_outer;
61  void * rc_request;
62 #endif // MOOT_RECODE_ENABLED
63  std::string rc_reqstr;
64 
65 
66 
67 public:
68  /*--------------------------------------------------------------
69  * METHODS
70  */
71  /*----------------------------------------------------*/
75  mootRecoder(const std::string &requestString="", bool diacritics_only=false)
76  : rc_outer(NULL), rc_request(NULL)
77  {
78  if (!requestString.empty()) scan_request(requestString,diacritics_only);
79  };
80 
82  mootRecoder(const std::string &src, const std::string &dst, bool diacritics_only=false)
83  : rc_outer(NULL), rc_request(NULL)
84  {
85  scan_request(src, dst, diacritics_only);
86  };
87 
89  ~mootRecoder(void)
90  {
91 #ifdef MOOT_RECODE_ENABLED
92  if (rc_request) recode_delete_request(rc_request);
93  if (rc_outer) recode_delete_outer(rc_outer);
94 #endif // MOOT_RECODE_ENABLED
95  };
96 
98  void ensure_outer(void) {
99 #ifdef MOOT_RECODE_ENABLED
100  if (!rc_outer) rc_outer = recode_new_outer(true); // (AUTO_ABORT)
101 #endif // MOOT_RECODE_ENABLED
102  };
104 
105  /*----------------------------------------------------*/
109  inline void scan_request(const std::string &reqstr, bool diacritics_only=false) {
110  ensure_outer();
111  rc_reqstr = reqstr;
112 #ifdef MOOT_RECODE_ENABLED
113  if (!rc_request) rc_request = recode_new_request(rc_outer);
114  assert(rc_request != NULL);
115  if (!reqstr.empty()) {
116  rc_request->diacritics_only = diacritics_only; // this MUST happen before scan_request() !
117  if (!recode_scan_request(rc_request, reqstr.c_str())) {
118  fprintf(stderr, "mootRecode::scan_request(): failed to scan request `%s'\n",
119  reqstr.c_str());
120  if (rc_request) {
121  recode_delete_request(rc_request);
122  rc_request = NULL;
123  }
124  }
125  return;
126  }
127  else if (rc_request) {
128  recode_delete_request(rc_request);
129  rc_request = NULL;
130  }
131 #else
132  fprintf(stderr, "mootRecode::scan_request(): librecode disabled! (request=\"%s\"\n",
133  reqstr.c_str());
134 #endif // MOOT_RECODE_ENABLED
135  };
136 
138  inline void scan_request(const std::string &src, const std::string &dst, bool diacritics_only=false)
139  {
140  if (src.empty() && dst.empty()) scan_request(src,diacritics_only);
141  else {
142  std::string reqstr = src;
143  reqstr.append("..");
144  reqstr.append(dst);
145  scan_request(reqstr,diacritics_only);
146  }
147  };
149 
150  /*----------------------------------------------------*/
154  inline bool string2file(const char *in, FILE *out)
155  {
156 #ifdef MOOT_RECODE_ENABLED
157  if (rc_request)
158  return recode_string_to_file(rc_request,in,out);
159 #endif
160  fputs(in,out);
161  return !ferror(out);
162  };
163 
165  inline bool buffer2file(const char *buf, size_t buflen, FILE *out)
166  {
167 #ifdef MOOT_RECODE_ENABLED
168  if (rc_request)
169  return recode_buffer_to_file(rc_request,buf,buflen,out);
170 #endif
171  fwrite(buf,buflen,1,out);
172  return !ferror(out);
173  };
174 
176  inline bool string2file(const std::string &in, FILE *out)
177  {
178 #ifdef MOOT_RECODE_ENABLED
179  if (rc_request)
180  return recode_buffer_to_file(rc_request,in.data(),in.size(),out);
181 #endif
182  fwrite(in.data(), in.size(), 1, out);
183  return !ferror(out);
184  };
186 
187  /*----------------------------------------------------*/
191  inline bool buffer2buffer(const char *in, size_t in_size, char **out, size_t *out_used, size_t *out_alloc)
192  {
193 #ifdef MOOT_RECODE_ENABLED
194  if (rc_request)
195  return recode_buffer_to_buffer(rc_request, in, in_size, out, out_used, out_alloc);
196 #endif
197  if (!*out) *out = reinterpret_cast<char *>(malloc(in_size));
198  else if (*out_alloc < in_size) {
199  free(*out);
200  *out = reinterpret_cast<char *>(malloc(in_size));
201  }
202  assert(*out != NULL);
203  *out_alloc = in_size;
204  memcpy(*out, in, in_size);
205  *out_used = in_size;
206  return true;
207  };
208 
210  inline bool string2buffer(const char *s, char **out, size_t *out_used, size_t *out_alloc)
211  {
212  return buffer2buffer(s,strlen(s), out,out_used,out_alloc);
213  };
214 
216  inline bool string2buffer(const std::string &in, char **out, size_t *out_used, size_t *out_alloc)
217  {
218  return buffer2buffer(in.data(),in.size(), out,out_used,out_alloc);
219  };
221 
222  /*----------------------------------------------------*/
226  inline bool buffer2string(const char *in, size_t in_size, std::string &out)
227  {
228  char *tmp = NULL;
229  size_t out_used = 0, out_alloc = 0;
230  bool rv = buffer2buffer(in,in_size, &tmp, &out_used, &out_alloc);
231  if (tmp) {
232  out.append(tmp, out_used);
233  free(tmp);
234  }
235  return rv;
236  }
237 
239  inline bool string2string(const char *s, std::string &out)
240  {
241  return buffer2string(s,strlen(s), out);
242  };
243 
245  inline bool string2string(const std::string &in, std::string &out)
246  {
247  return buffer2string(in.data(),in.size(), out);
248  };
249  /* ... etc. */
252  /*----------------------------------------------------*/
256  inline bool string2mstream(const char *in, mootio::mostream *out)
257  {
258  std::string s;
259  bool rv = string2string(in,s);
260  return out && out->puts(s) && rv;
261  };
262 
264  inline bool buffer2mstream(const char *buf, size_t buflen, mootio::mostream *out)
265  {
266  std::string s;
267  bool rv = buffer2string(buf,buflen,s);
268  return out && out->puts(s) && rv;
269  };
270 
272  inline bool string2mstream(const std::string &in, mootio::mostream *out)
273  {
274  std::string s;
275  bool rv = string2string(in,s);
276  return out && out->puts(s) && rv;
277  };
279 
280 }; //-- /class mootRecoder
281 
282 
287 class mootXMLRecoder {
288 public:
289  /*--------------------------------------------------------------
290  * DATA
291  */
292  bool standalone;
293 
294  mootRecoder rc1;
295  mootRecoder rc2;
296 
297  char *buf1;
298  size_t buf1_used;
299  size_t buf1_alloc;
300 
301  char *buf2;
302  size_t buf2_used;
303  size_t buf2_alloc;
304 
305 public:
306  /*--------------------------------------------------------------
307  * METHODS
308  */
309 
310  /*----------------------------------------------------*/
314  mootXMLRecoder(const std::string &src="", const std::string &dst="")
315  : buf1(NULL), buf1_used(0), buf1_alloc(0),
316  buf2(NULL), buf2_used(0), buf2_alloc(0)
317  {
318  //-- share outer context
319  rc1.ensure_outer();
320  rc2.rc_outer = rc1.rc_outer;
322  scan_request(src,dst);
323  };
327  {
328  rc2.rc_outer = NULL;
329  if (buf1) free(buf1);
330  if (buf2) free(buf2);
331  };
333 
334  /*----------------------------------------------------*/
338  inline void scan_request(const std::string &reqstr)
339  {
340  size_t dst_begin = reqstr.rfind("..");
341  string src(reqstr, 0, dst_begin);
342  string dst(reqstr, dst_begin);
343  standalone = (dst == "XML-standalone" || dst == "h0");
344  scan_request(src,dst);
345  };
346 
348  inline void scan_request(const std::string &src, const std::string &dst) {
349  standalone = (dst.empty() || dst == "XML-standalone" || dst == "h0");
350  if (standalone) {
351  if (src.empty()) {
352  rc1.scan_request("",false);
353  return;
354  }
355  rc1.scan_request(src,dst,false);
356  } else {
357  rc1.scan_request(src,"HTML_4.0",false);
358  rc2.scan_request("HTML_4.0",dst,true);
359  }
360  };
362 
363  /*----------------------------------------------------*/
367  inline bool buffer2file(const char *in, size_t in_size, FILE *out)
368  {
369  if (standalone) return rc1.buffer2file(in,in_size, out);
370  //-- temp bools are ugly, but librecode seems to be returning weird...
371  bool rv1 = rc1.buffer2buffer(in,in_size, &buf1,&buf1_used,&buf1_alloc);
372  bool rv2 = rc2.buffer2file(buf1,buf1_used, out);
373  return rv1 && rv2;
374  };
375 
377  inline bool string2file(const char *in, FILE *out)
378  {
379  return buffer2file(in,strlen(in),out);
380  };
383  inline bool string2file(const std::string &in, FILE *out)
384  {
385  return buffer2file(in.data(),in.size(), out);
386  };
387 
388  /*----------------------------------------------------*/
392  inline bool buffer2buffer(const char *in, size_t in_size, char **out, size_t *out_used, size_t *out_alloc)
393  {
394  if (standalone) return rc1.buffer2buffer(in,in_size, out,out_used,out_alloc);
395  //-- temp bools are ugly, but librecode is returning weird...
396  bool rv1 = rc1.buffer2buffer(in,in_size, &buf1,&buf1_used,&buf1_alloc);
397  bool rv2 = rc2.buffer2buffer(buf1,buf1_used, out,out_used,out_alloc);
398  return rv1 && rv2;
399  };
402  inline bool string2buffer(const char *s, char **out, size_t *out_used, size_t *out_alloc)
403  {
404  return buffer2buffer(s,strlen(s), out,out_used,out_alloc);
405  };
406 
408  inline bool string2buffer(const std::string &in, char **out, size_t *out_used, size_t *out_alloc)
409  {
410  return buffer2buffer(in.data(),in.size(), out,out_used,out_alloc);
411  };
413 
414  /*----------------------------------------------------*/
418  inline bool buffer2string(const char *in, size_t in_size, std::string &out)
419  {
420  if (standalone) {
421  //-- re-use internal buffer
422  bool rv1 = rc1.buffer2buffer(in,in_size, &buf1,&buf1_used,&buf1_alloc);
423  if (buf1) out.append(buf1,buf1_used);
424  return rv1;
425  }
426  //-- temp bools are ugly, but librecode seems to be returning weird...
427  bool rv1 = rc1.buffer2buffer(in,in_size, &buf1,&buf1_used,&buf1_alloc);
428  bool rv2 = rc2.buffer2string(buf1,buf1_used, out);
429  return rv1 && rv2;
430  };
431 
433  inline bool string2string(const char *s, std::string &out)
434  {
435  return buffer2string(s,strlen(s), out);
436  };
437 
439  inline bool string2string(const std::string &in, std::string &out)
440  {
441  return buffer2string(in.data(),in.size(), out);
442  };
443  /* ... etc. */
445 
446  /*----------------------------------------------------*/
450  inline bool buffer2mstream(const char *in, size_t in_size, mootio::mostream *out)
451  {
452  if (standalone) {
453  //-- re-use internal buffer1
454  bool rv1 = rc1.buffer2buffer(in,in_size, &buf1,&buf1_used,&buf1_alloc);
455  if (buf1) return
456  out && (out->write(buf1,buf1_used) == buf1_used) && rv1;
457  }
458  //-- temp bools are ugly, but librecode seems to be returning weird...
459  bool rv1 = rc1.buffer2buffer(in,in_size, &buf1,&buf1_used,&buf1_alloc);
460  bool rv2 = rc2.buffer2buffer(buf1,buf1_used, &buf2,&buf2_used,&buf2_alloc);
461  return
462  out && (out->write(buf2,buf2_used) == buf2_used) && rv1 && rv2;
463  };
464 
466  inline bool string2mstream(const char *s, mootio::mostream *out)
467  {
468  return buffer2mstream(s,strlen(s), out);
469  };
470 
472  inline bool string2mstream(const std::string &in, mootio::mostream *out)
473  {
474  return buffer2mstream(in.data(),in.size(), out);
475  };
476  /* ... etc. */
478 
479 }; //-- /class mootXMLRecoder
480 
481 
482 
483 }; //-- /namespace moot
484 
485 #endif //_MOOT_RECODE_H
486 
Definition: mootAssocVector.h:39
virtual bool write(const char *buf, size_t n)
Definition: mootIO.h:218
Abstract base class for output stream wrappers.
Definition: mootIO.h:194
virtual bool puts(const char *s)
Definition: mootIO.h:227
Special 2-phase recoder object for XML text.
Definition: mootRecode.h:314
generic I/O abstraction layer
Interface to librecode character-conversion routines.
Definition: mootRecode.h:50