textwolf  0.2
xmlpathselect.hpp
Go to the documentation of this file.
1 /*
2  * Copyright (c) 2014 Patrick P. Frey
3  *
4  * This Source Code Form is subject to the terms of the Mozilla Public
5  * License, v. 2.0. If a copy of the MPL was not distributed with this
6  * file, You can obtain one at http://mozilla.org/MPL/2.0/.
7  */
10 
11 #ifndef __TEXTWOLF_XML_PATH_SELECT_HPP__
12 #define __TEXTWOLF_XML_PATH_SELECT_HPP__
13 #include "textwolf/char.hpp"
15 #include "textwolf/exception.hpp"
16 #include "textwolf/xmlscanner.hpp"
19 #include <limits>
20 #include <string>
21 #include <vector>
22 #include <map>
23 #include <cstddef>
24 
25 namespace textwolf {
26 
/// \class DefaultStackType
/// \brief Default stack type of the XML path selector: a std::vector used as back insertion sequence
/// \tparam Element type of the elements stored on the stack
template <typename Element>
class DefaultStackType
	:public std::vector<Element>
{
public:
	/// \brief Default constructor (creates an empty stack)
	DefaultStackType(){}

	/// \brief Copy constructor
	/// \param [in] o stack to copy the elements from
	DefaultStackType( const DefaultStackType& o)
		:std::vector<Element>(o){}
};
36 
40 template <class CharSet_, template <typename> class StackType_=DefaultStackType>
42 {
43 public:
46 
47 private:
48  const ThisXMLPathSelectAutomaton* atm; //< XML select automaton
49  typedef typename ThisXMLPathSelectAutomaton::Mask Mask;
51  typedef typename ThisXMLPathSelectAutomaton::Hash Hash;
54 
57  struct Context
58  {
59  XMLScannerBase::ElementType type; //< element type processed
60  const char* key; //< string value of element processed
61  unsigned int keysize; //< size of string value in bytes of element processed
62  Scope scope; //< active scope
63  unsigned int scope_iter; //< position of currently visited token in the active scope
64 
66  Context() :type(XMLScannerBase::Content),key(0),keysize(0) {}
67 
72  void init( XMLScannerBase::ElementType p_type, const char* p_key, int p_keysize)
73  {
74  type = p_type;
75  key = p_key;
76  keysize = p_keysize;
77  scope_iter = scope.range.tokenidx_from;
78  }
79  };
80 
81  StackType_<Scope> scopestk; //< stack of scopes opened
82  StackType_<unsigned int> follows; //< indices of tokens active in all descendant scopes
83  StackType_<int> triggers; //< triggered elements
84  StackType_<Token> tokens; //< list of waiting tokens
85  Context context; //< state variables without stacks of the automaton
86 
89  void expand( int stateidx)
90  {
91  while (stateidx!=-1)
92  {
93  const State& st = atm->states[ stateidx];
94  context.scope.mask.join( st.core.mask);
95  if (st.core.mask.empty() && st.core.typeidx != 0)
96  {
97  triggers.push_back( st.core.typeidx);
98  }
99  else
100  {
101  if (st.core.follow)
102  {
103  context.scope.followMask.join( st.core.mask);
104  follows.push_back( tokens.size());
105  }
106  tokens.push_back( Token( st, stateidx));
107  }
108  stateidx = st.link;
109  }
110  }
111 
116  void initProcessElement( XMLScannerBase::ElementType type, const char* key, int keysize)
117  {
118  if (context.type == XMLScannerBase::OpenTag)
119  {
120  //last step of open scope has to be done after all tokens were visited,
121  //e.g. with the next element initialization
122  context.scope.range.tokenidx_from = context.scope.range.tokenidx_to;
123  }
124  context.scope.range.tokenidx_to = tokens.size();
125  context.scope.range.followidx = follows.size();
126  context.init( type, key, keysize);
127  if (context.type == XMLScannerBase::OpenTag)
128  {
129  // first step of open scope saves the context context on stack
130  scopestk.push_back( context.scope);
131  context.scope.mask = context.scope.followMask;
132  context.scope.mask.match( XMLScannerBase::OpenTag);
133  //... we reset the mask but ensure that this 'OpenTag' is processed for sure
134  }
135  }
136 
137  void closeProcessElement()
138  {
139  if (context.type == XMLScannerBase::CloseTag || context.type == XMLScannerBase::CloseTagIm)
140  {
141  if (!scopestk.empty())
142  {
143  context.scope = scopestk.back();
144  scopestk.pop_back();
145  follows.resize( context.scope.range.followidx);
146  tokens.resize( context.scope.range.tokenidx_to);
147  }
148  }
149  }
150 
154  void produce( unsigned int tokenidx, const State& st)
155  {
156  const Token& tk = tokens[ tokenidx];
157  if (tk.core.cnt_end == -1)
158  {
159  expand( st.next);
160  }
161  else
162  {
163  if (tk.core.cnt_end > 0)
164  {
165  if (--tokens[ tokenidx].core.cnt_end == 0)
166  {
167  tokens[ tokenidx].core.mask.reset();
168  }
169  if (tk.core.cnt_start <= 0)
170  {
171  expand( st.next);
172  }
173  else
174  {
175  --tokens[ tokenidx].core.cnt_start;
176  }
177  }
178  }
179  }
180 
184  int match( unsigned int tokenidx)
185  {
186  int rt = 0;
187  if (context.key != 0)
188  {
189  if (tokenidx >= context.scope.range.tokenidx_to) return 0;
190 
191  Token* tk = &tokens[ tokenidx];
192  if (tk->core.mask.matches( context.type))
193  {
194  const State& st = atm->states[ tk->stateidx];
195  if (st.key)
196  {
197  if (st.keysize == context.keysize)
198  {
199  unsigned int ii;
200  for (ii=0; ii<context.keysize && st.key[ii] == context.key[ii]; ii++);
201  if (ii==context.keysize)
202  {
203  produce( tokenidx, st);
204  tk = &tokens[ tokenidx];
205  }
206  }
207  }
208  else
209  {
210  produce( tokenidx, st);
211  tk = &tokens[ tokenidx];
212  }
213  if (tk->core.typeidx != 0)
214  {
215  if (tk->core.cnt_end == -1)
216  {
217  rt = tk->core.typeidx;
218  }
219  else if (tk->core.cnt_end > 0)
220  {
221  if (--tk->core.cnt_end == 0)
222  {
223  tk->core.mask.reset();
224  }
225  if (tk->core.cnt_start <= 0)
226  {
227  rt = tk->core.typeidx;
228  }
229  else
230  {
231  --tk->core.cnt_start;
232  }
233  }
234  }
235  }
236  if (tk->core.mask.rejects( context.type))
237  {
238  //The token must not match anymore after encountering a reject item
239  tk->core.mask.reset();
240  }
241  }
242  return rt;
243  }
244 
247  int fetch()
248  {
249  int type = 0;
250 
251  if (context.scope.mask.matches( context.type))
252  {
253  while (!type)
254  {
255  if (context.scope_iter < context.scope.range.tokenidx_to)
256  {
257  type = match( context.scope_iter);
258  ++context.scope_iter;
259  }
260  else
261  {
262  unsigned int ii = context.scope_iter - context.scope.range.tokenidx_to;
263  //we match all follows that are not yet been checked in the current scope
264  if (ii < context.scope.range.followidx && context.scope.range.tokenidx_from > follows[ ii])
265  {
266  type = match( follows[ ii]);
267  ++context.scope_iter;
268  }
269  else if (!triggers.empty())
270  {
271  type = triggers.back();
272  triggers.pop_back();
273  }
274  else
275  {
276  context.key = 0;
277  context.keysize = 0;
278  return 0; //end of all candidates
279  }
280  }
281  }
282  }
283  else
284  {
285  context.key = 0;
286  context.keysize = 0;
287  }
288  return type;
289  }
290 
291 public:
298  template <class Buffer>
299  void getTokenTypeMatchingStates( XMLScannerBase::ElementType type, bool withFollows, Buffer& buf) const
300  {
301  unsigned int ti = context.scope.range.tokenidx_to, te = tokens.size();
302  for (; ti<te; ++ti)
303  {
304  const Token& tk = tokens[ (std::size_t)ti];
305  if (tk.core.mask.matches( type))
306  {
307  buf.push_back( tokens[ti].stateidx);
308  }
309  }
310  if (withFollows)
311  {
312  ti=0; te = context.scope.range.followidx;
313  for (; ti<te; ++ti)
314  {
315  if (tokens[ follows[ ti]].core.mask.matches( type))
316  {
317  buf.push_back( tokens[ follows[ ti]].stateidx);
318  }
319  }
320  }
321  }
322 
323 public:
327  :atm(p_atm),scopestk(),follows(),triggers(),tokens()
328  {
329  if (atm->states.size() > 0) expand(0);
330  }
331 
335  :atm(o.atm),scopestk(o.scopestk),follows(o.follows),triggers(o.triggers),tokens(o.tokens){}
336 
339  class iterator
340  {
341  public:
342  typedef int value_type;
343  typedef std::size_t difference_type;
344  typedef int* pointer;
345  typedef int& reference;
346  typedef std::input_iterator_tag iterator_category;
347 
348  private:
349  int element; //< currently visited element (type)
350  ThisXMLPathSelect* input; //< producing XML path selection stream
351 
354  iterator& skip() throw(exception,std::bad_alloc)
355  {
356  if (input != 0)
357  {
358  element = input->fetch();
359  }
360  else
361  {
362  element = 0;
363  }
364  return *this;
365  }
366 
370  bool compare( const iterator& iter) const
371  {
372  return (element == iter.element);
373  }
374 
375  public:
378  void assign( const iterator& orig)
379  {
380  input = orig.input;
381  element = orig.element;
382  }
383 
386  iterator( const iterator& orig)
387  {
388  assign( orig);
389  }
390 
396  iterator( ThisXMLPathSelect& p_input, XMLScannerBase::ElementType p_type, const char* p_key, int p_keysize)
397  :input( &p_input)
398  {
399  input->initProcessElement( p_type, p_key, p_keysize);
400  skip();
401  }
402 
404  {
405  if (input) input->closeProcessElement();
406  }
407 
410  :element(0),input(0) {}
411 
416  {
417  assign( orig);
418  return *this;
419  }
420 
423  int operator*() const
424  {
425  return element;
426  }
427 
430  const int* operator->() const
431  {
432  return &element;
433  }
434 
437  iterator& operator++() {return skip();}
438 
441  iterator operator++(int) {iterator tmp(*this); skip(); return tmp;}
442 
445  bool operator==( const iterator& iter) const {return compare( iter);}
446 
449  bool operator!=( const iterator& iter) const {return !compare( iter);}
450  };
451 
454  iterator push( XMLScannerBase::ElementType type, const char* key, int keysize)
455  {
456  return iterator( *this, type, key, keysize);
457  }
458 
461  iterator push( XMLScannerBase::ElementType type, const std::string& key)
462  {
463  return iterator( *this, type, key.c_str(), key.size());
464  }
465 
469  {
470  return iterator();
471  }
472 };
473 
474 }//namespace
475 #endif
Definition: xmlpathselect.hpp:28
std::vector< State > states
Definition: xmlpathautomaton.hpp:349
DefaultStackType()
Definition: xmlpathselect.hpp:32
ElementType
Enumeration of XML element types returned by an XML scanner.
Definition: xmlscanner.hpp:173
int & reference
Definition: xmlpathselect.hpp:345
Base class for structures that can throw exceptions for non recoverable errors.
Definition: exception.hpp:20
Fixed size buffer fulfilling the requirement of a back insertion sequence needed for textwolf output...
iterator(ThisXMLPathSelect &p_input, XMLScannerBase::ElementType p_type, const char *p_key, int p_keysize)
Constructor by values.
Definition: xmlpathselect.hpp:396
Tag scope definition.
Definition: xmlpathautomaton.hpp:402
const int * operator->() const
Element access.
Definition: xmlpathselect.hpp:430
Mask mask
Definition: xmlpathautomaton.hpp:203
int * pointer
Definition: xmlpathselect.hpp:344
int operator*() const
Element access.
Definition: xmlpathselect.hpp:423
[10] open tag (e.g. "bla" for "&lt;bla...")
Definition: xmlscanner.hpp:185
[13] content element string (separated by spaces or end of line)
Definition: xmlscanner.hpp:188
iterator operator++(int)
Postincrement.
Definition: xmlpathselect.hpp:441
~iterator()
Definition: xmlpathselect.hpp:403
bool operator==(const iterator &iter) const
Compare elements for equality.
Definition: xmlpathselect.hpp:445
iterator(const iterator &orig)
Copy constructor.
Definition: xmlpathselect.hpp:386
iterator & operator++()
Preincrement.
Definition: xmlpathselect.hpp:437
Definition of unicode characters.
textwolf exception class
Definition: exception.hpp:48
int Hash
Definition: xmlpathautomaton.hpp:40
iterator()
Default constructor.
Definition: xmlpathselect.hpp:409
iterator end()
Get the end of results returned by 'push(XMLScannerBase::ElementType,const char*, int)'...
Definition: xmlpathselect.hpp:468
Mask to query for element types, if they match or not.
Definition: xmlpathautomaton.hpp:69
void getTokenTypeMatchingStates(XMLScannerBase::ElementType type, bool withFollows, Buffer &buf) const
Get the next states that match an element of a given type.
Definition: xmlpathselect.hpp:299
bool operator!=(const iterator &iter) const
Compare elements for inequality.
Definition: xmlpathselect.hpp:449
Interface that describes what a character set encoding implementation has to define to be used as cha...
void assign(const iterator &orig)
Assign iterator.
Definition: xmlpathselect.hpp:378
[11] close tag (e.g. "bla" for "&lt;/bla&gt;")
Definition: xmlscanner.hpp:186
XML path select template.
Definition: xmlpathselect.hpp:41
XMLPathSelect(const ThisXMLPathSelectAutomaton *p_atm)
Constructor.
Definition: xmlpathselect.hpp:326
iterator & operator=(const iterator &orig)
Assignment.
Definition: xmlpathselect.hpp:415
Core core
Definition: xmlpathautomaton.hpp:387
Definition of exceptions with containing error codes thrown by textwolf.
int value_type
Definition: xmlpathselect.hpp:342
XMLPathSelect< CharSet_, StackType_ > ThisXMLPathSelect
Definition: xmlpathselect.hpp:45
std::input_iterator_tag iterator_category
Definition: xmlpathselect.hpp:346
Active or passive but still valid token of the XML processing (this is a trigger waiting to match) ...
Definition: xmlpathautomaton.hpp:385
XML parser iterator interface for processing the XML elements one by one.
XMLPathSelectAutomaton< CharSet_ > ThisXMLPathSelectAutomaton
Definition: xmlpathselect.hpp:44
std::size_t difference_type
Definition: xmlpathselect.hpp:343
Automaton to select path expressions from an XML iterator.
iterator push(XMLScannerBase::ElementType type, const char *key, int keysize)
Feed the path selector with the next token and get the start iterator for the results.
Definition: xmlpathselect.hpp:454
input iterator for the output of this XMLScanner
Definition: xmlpathselect.hpp:339
XMLPathSelect(const XMLPathSelect &o)
Copy constructor.
Definition: xmlpathselect.hpp:334
Automaton to define XML path expressions and assign types (int values) to them.
Definition: xmlpathautomaton.hpp:32
iterator push(XMLScannerBase::ElementType type, const std::string &key)
Feed the path selector with the next token and get the start iterator for the results.
Definition: xmlpathselect.hpp:461
DefaultStackType(const DefaultStackType &o)
Definition: xmlpathselect.hpp:33
State of an automaton in its definition.
Definition: xmlpathautomaton.hpp:218
bool matches(XMLScannerBase::ElementType e) const
Check if an element type matches the mask.
Definition: xmlpathautomaton.hpp:192
[12] immediate close tag (e.g. "bla" for "&lt;bla /&gt;")
Definition: xmlscanner.hpp:187