ddc
pcre_rml.h
Go to the documentation of this file.
1 //-*- Mode: C++ -*-
2 // DDC originally by Alexey Sokirko
3 // Changes and modifications 2011-2014 by Bryan Jurish
4 //
5 // This file is part of DDC.
6 //
7 // DDC is free software: you can redistribute it and/or modify
8 // it under the terms of the GNU Lesser General Public License as published by
9 // the Free Software Foundation, either version 3 of the License, or
10 // (at your option) any later version.
11 //
12 // DDC is distributed in the hope that it will be useful,
13 // but WITHOUT ANY WARRANTY; without even the implied warranty of
14 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 // GNU Lesser General Public License for more details.
16 //
17 // You should have received a copy of the GNU Lesser General Public License
18 // along with DDC. If not, see <http://www.gnu.org/licenses/>.
19 //
20 #ifndef _PCRE_RML_H
21 #define _PCRE_RML_H
22 
23 #include <pcrecpp.h>
24 #include "../CommonLib/utilit.h"
25 #include <iostream>
26 
// Declared here; the definition lives in PCRE/pcre_rml.cpp.
// NOTE(review): presumably fills `character_table` with the PCRE character
// tables for the language `Langua`, suitable for use as an RML_RE table
// pointer -- confirm against the definition.
27 extern void RmlPcreMakeTables(vector<BYTE>& character_table, MorphLanguageEnum Langua);
28 
29 // We convert user-passed pointers into special Arg objects
// `Arg` is a file-level alias for pcrecpp::Arg so the matching methods of
// RML_RE can name the type unqualified.
30 typedef pcrecpp::Arg Arg;
// Default value for every optional capture argument (ptr1..ptr16) of the
// RML_RE matching methods. Only declared here; defined elsewhere.
31 extern Arg no_arg;
32 
34  pcrecpp::RE_Options opts; // pcrecpp options
35  const BYTE* table; // table pointer to use for compilation (external)
36 };
37 
38 // Interface for regular expression matching. Also corresponds to a
39 // pre-compiled regular expression. An "RE" object is safe for
40 // concurrent use by multiple threads.
41 class RML_RE {
42 public:
43  typedef pcrecpp::StringPiece StringPiece;
44  typedef pcrecpp::RE_Options RE_Options;
45 
46  class Options {
47  public:
48  pcrecpp::RE_Options options_; // pcrecpp options
49  const BYTE* tableptr_; // table pointer to use for compilation (external)
50  Options(void) : tableptr_(NULL) {};
51  Options(const RE_Options &opts, const BYTE *tableptr=NULL)
52  : options_(opts), tableptr_(tableptr)
53  {
54  //-- moo: implicitly set PCRE_UCP whenever PCRE_UTF8 is requested: allows e.g. [[:upper:]] to match non-ASCII characters
55  if (opts.utf8())
56  options_.set_all_options( options_.all_options() | PCRE_UCP );
57  };
58  };
59 
60 protected:
61  string pattern_;
62  Options opts_; // regex options, table pointer
63  pcre* re_full_; // For full matches
64  pcre* re_partial_; // For partial matches
65  string error_; // Error indicator (empty for no error)
66  int match_limit_; // limit on execution resources
67 
68 public:
69  // OLD: actually used by DDC
70 
71  RML_RE(void)
72  : re_full_(NULL), re_partial_(NULL), match_limit_(0)
73  {};
74 
75  RML_RE(const string& pat)
76  : re_full_(NULL), re_partial_(NULL), match_limit_(0)
77  { Init(pat); };
78 
79  RML_RE(const string& pat, const Options &opts)
80  : re_full_(NULL), re_partial_(NULL), match_limit_(0), opts_(opts)
81  { Init(pat); };
82 
83  //-- used by legacy ddc code (e.g. by MorphoWizard::find_lemm in MorphWizardLib/wizard.cpp)
84  RML_RE(const string& pat, const vector<BYTE>& RegExpTables)
85  : re_full_(NULL), re_partial_(NULL), match_limit_(0)
86  {
87  opts_.tableptr_ = RegExpTables.data();
88  Init(pat);
89  };
90 
91  //-- moo: potentially dangerous low-level access
92  //inline Options& getOptions() { return opts_; };
93  //inline pcre* re_full() { return re_full_; };
94  //inline pcre* re_partial() { return re_partial_; };
95 
96  // copy constructor
97  RML_RE(const RML_RE& re)
98  : opts_(re.opts_), re_full_(NULL), re_partial_(NULL), match_limit_(0)
99  { Init(re.pattern_); };
100 
101  // assignment operator
102  void operator=(const RML_RE& re)
103  {
104  clear();
105  opts_ = re.opts_;
106  Init(re.pattern_);
107  };
108 
109  // clears pattern and compiled regexes; but leaves options and table pointer in place
110  void clear();
111 
112  ~RML_RE() { clear(); }
113 
114  // The string specification for this RE. E.g.
115  // RE re("ab*c?d+");
116  // re.pattern(); // "ab*c?d+"
117  const string& pattern() const { return pattern_; }
118 
119  // Re-compile the regex with a new pattern but same options and table pointer
120  void pattern(const string &pattern) { clear(); Init(pattern); }
121 
122  // If RE could not be created properly, returns an error string.
123  // Else returns the empty string.
124  const string& error() const { return error_; }
125 
126  /***** The useful part: the matching interface *****/
127 
128  // This is provided so one can do pattern.ReplaceAll() just as
129  // easily as ReplaceAll(pattern-text, ....)
130 
131 
132  bool FullMatch(const StringPiece& text,
133  const Arg& ptr1 = no_arg,
134  const Arg& ptr2 = no_arg,
135  const Arg& ptr3 = no_arg,
136  const Arg& ptr4 = no_arg,
137  const Arg& ptr5 = no_arg,
138  const Arg& ptr6 = no_arg,
139  const Arg& ptr7 = no_arg,
140  const Arg& ptr8 = no_arg,
141  const Arg& ptr9 = no_arg,
142  const Arg& ptr10 = no_arg,
143  const Arg& ptr11 = no_arg,
144  const Arg& ptr12 = no_arg,
145  const Arg& ptr13 = no_arg,
146  const Arg& ptr14 = no_arg,
147  const Arg& ptr15 = no_arg,
148  const Arg& ptr16 = no_arg) const;
149 
150  bool PartialMatch(const StringPiece& text,
151  const Arg& ptr1 = no_arg,
152  const Arg& ptr2 = no_arg,
153  const Arg& ptr3 = no_arg,
154  const Arg& ptr4 = no_arg,
155  const Arg& ptr5 = no_arg,
156  const Arg& ptr6 = no_arg,
157  const Arg& ptr7 = no_arg,
158  const Arg& ptr8 = no_arg,
159  const Arg& ptr9 = no_arg,
160  const Arg& ptr10 = no_arg,
161  const Arg& ptr11 = no_arg,
162  const Arg& ptr12 = no_arg,
163  const Arg& ptr13 = no_arg,
164  const Arg& ptr14 = no_arg,
165  const Arg& ptr15 = no_arg,
166  const Arg& ptr16 = no_arg) const;
167 
168  bool Consume(StringPiece* input,
169  const Arg& ptr1 = no_arg,
170  const Arg& ptr2 = no_arg,
171  const Arg& ptr3 = no_arg,
172  const Arg& ptr4 = no_arg,
173  const Arg& ptr5 = no_arg,
174  const Arg& ptr6 = no_arg,
175  const Arg& ptr7 = no_arg,
176  const Arg& ptr8 = no_arg,
177  const Arg& ptr9 = no_arg,
178  const Arg& ptr10 = no_arg,
179  const Arg& ptr11 = no_arg,
180  const Arg& ptr12 = no_arg,
181  const Arg& ptr13 = no_arg,
182  const Arg& ptr14 = no_arg,
183  const Arg& ptr15 = no_arg,
184  const Arg& ptr16 = no_arg) const;
185 
186  bool FindAndConsume(StringPiece* input,
187  const Arg& ptr1 = no_arg,
188  const Arg& ptr2 = no_arg,
189  const Arg& ptr3 = no_arg,
190  const Arg& ptr4 = no_arg,
191  const Arg& ptr5 = no_arg,
192  const Arg& ptr6 = no_arg,
193  const Arg& ptr7 = no_arg,
194  const Arg& ptr8 = no_arg,
195  const Arg& ptr9 = no_arg,
196  const Arg& ptr10 = no_arg,
197  const Arg& ptr11 = no_arg,
198  const Arg& ptr12 = no_arg,
199  const Arg& ptr13 = no_arg,
200  const Arg& ptr14 = no_arg,
201  const Arg& ptr15 = no_arg,
202  const Arg& ptr16 = no_arg) const;
203 
204  bool Replace(const StringPiece& rewrite, string *str) const;
205 
206  int GlobalReplace(const StringPiece& rewrite, string *str) const;
207 
208  bool Extract(const StringPiece &rewrite, const StringPiece &text, string *out) const;
209 
210 
211 
212  /***** Generic matching interface *****/
213 
214  // Type of match (TODO: Should be restructured as part of RE_Options)
215  enum Anchor {
216  UNANCHORED, // No anchoring
217  ANCHOR_START, // Anchor at start only
218  ANCHOR_BOTH // Anchor at start and end
219  };
220 
221  // General matching routine. Stores the length of the match in
222 
223  // "*consumed" if successful.
224  bool DoMatch(const StringPiece& text,
225  Anchor anchor,
226  int* consumed,
227  const Arg* const* args, int n) const;
228 
229  // Return the number of capturing subpatterns, or -1 if the
230  // regexp wasn't valid on construction.
231  int NumberOfCapturingGroups();
232 
233  // Returns true iff \c pat is a "complex" pattern
234  // \li formerly hard-coded into Init() method
235  // \li currently just checks for presence of '|'
236  static bool isComplexPattern(const std::string &pat);
237 
238 protected:
239  void Init(const string& pattern);
240  //void Init(const string& pattern, const RE_Options& options, const unsigned char *tableptr);
241 
242  // Match against "text", filling in "vec" (up to "vecsize" * 2/3) with
243  // pairs of integers for the beginning and end positions of matched
244  // text. The first pair corresponds to the entire matched text;
245  // subsequent pairs correspond, in order, to parentheses-captured
246  // matches. Returns the number of pairs (one more than the number of
247  // the last subpattern with a match) if matching was successful
248  // and zero if the match failed.
249  // I.e. for RE("(foo)|(bar)|(baz)") it will return 2, 3, and 4 when matching
250  // against "foo", "bar", and "baz" respectively.
251  // When matching RE("(foo)|hello") against "hello", it will return 1.
252  // But the values for all subpattern are filled in into "vec".
253  int TryMatch(const StringPiece& text,
254  int startpos,
255  Anchor anchor,
256  int *vec,
257  int vecsize) const;
258 
259  // Append the "rewrite" string, with backslash subsitutions from "text"
260  // and "vec", to string "out".
261  bool Rewrite(string *out,
262  const StringPiece& rewrite,
263  const StringPiece& text,
264  int *vec,
265  int veclen) const;
266 
267  // internal implementation for DoMatch
268  bool DoMatchImpl(const StringPiece& text,
269  Anchor anchor,
270  int* consumed,
271  const Arg* const args[],
272  int n,
273  int* vec,
274  int vecsize) const;
275 
276  // Compile the regexp for the specified anchoring mode
277  //pcre* Compile(Anchor anchor, const unsigned char *tableptr);
278  pcre* Compile(Anchor anchor);
279 
280  // Don't allow the default copy or assignment constructors --
281  // they're expensive and too easy to do by accident.
282  // : moo: ...too effin bad, dude...
283  /*
284  RML_RE(const RML_RE&);
285  void operator=(const RML_RE&);
286  */
287 };
288 
289 
290 #endif
291 
292 /*--- emacs style variables ---
293  * Local Variables:
294  * mode: C++
295  * c-file-style: "ellemtel"
296  * c-basic-offset: 4
297  * tab-width: 8
298  * indent-tabs-mode: nil
299  * End:
300  */
void operator=(const RML_RE &re)
Definition: pcre_rml.h:102
Arg no_arg
pcre * re_full_
Definition: pcre_rml.h:63
Options opts_
Definition: pcre_rml.h:62
pcre * re_partial_
Definition: pcre_rml.h:64
Definition: pcre_rml.h:46
~RML_RE()
Definition: pcre_rml.h:112
void RmlPcreMakeTables(vector< BYTE > &character_table, MorphLanguageEnum Langua)
Definition: PCRE/pcre_rml.cpp:66
string error_
Definition: pcre_rml.h:65
Options(const RE_Options &opts, const BYTE *tableptr=NULL)
Definition: pcre_rml.h:51
RML_RE(void)
Definition: pcre_rml.h:71
string pattern_
Definition: pcre_rml.h:61
RML_RE(const string &pat, const Options &opts)
Definition: pcre_rml.h:79
Definition: pcre_rml.h:33
int match_limit_
Definition: pcre_rml.h:66
pcrecpp::StringPiece StringPiece
Definition: pcre_rml.h:43
void pattern(const string &pattern)
Definition: pcre_rml.h:120
pcrecpp::RE_Options opts
Definition: pcre_rml.h:34
const BYTE * table
Definition: pcre_rml.h:35
Definition: pcre_rml.h:217
RML_RE(const string &pat, const vector< BYTE > &RegExpTables)
Definition: pcre_rml.h:84
unsigned char BYTE
Definition: utilit.h:94
RML_RE(const string &pat)
Definition: pcre_rml.h:75
Options(void)
Definition: pcre_rml.h:50
Definition: pcre_rml.h:41
RML_RE(const RML_RE &re)
Definition: pcre_rml.h:97
pcrecpp::Arg Arg
Definition: pcre_rml.h:30
MorphLanguageEnum
Definition: utilit.h:162
const string & error() const
Definition: pcre_rml.h:124
const string & pattern() const
Definition: pcre_rml.h:117
pcrecpp::RE_Options RE_Options
Definition: pcre_rml.h:44
Definition: pcre_rml.h:216
const BYTE * tableptr_
Definition: pcre_rml.h:49
Anchor
Definition: pcre_rml.h:215
pcrecpp::RE_Options options_
Definition: pcre_rml.h:48