11 #ifndef __TEXTWOLF_XML_PATH_AUTOMATON_HPP__
12 #define __TEXTWOLF_XML_PATH_AUTOMATON_HPP__
31 template <
class CharSet_=
charset::UTF8>
63 static const char* name[ 7] = {
"Content",
"OpenTag",
"CloseTag",
"Attribute",
"ThisAttributeValue",
"AttributeValue",
"ContentStart"};
64 return name[ (
unsigned int)op];
81 Mask(
unsigned short p_pos=0,
unsigned short p_neg=0):
pos(p_pos),
neg(p_neg) {}
164 return "AttributeValue";
172 return "ThisAttributeValue";
178 return "ContentStart";
254 void defineKey(
unsigned int p_keysize,
const char* p_key,
const char* p_srckey)
270 for (ii=0; ii<
keysize; ii++)
key[ii]=p_key[ii];
274 for (ii=0; p_srckey[ii]!=0; ii++);
276 for (ii=0; p_srckey[ii]!=0; ii++)
srckey[ii]=p_srckey[ii];
288 void defineNext(
Operation op,
unsigned int p_keysize,
const char* p_key,
const char* p_srckey,
int p_next,
bool p_follow=
false)
320 std::ostringstream rt;
332 rt <<
" '" <<
srckey <<
"'";
354 std::ostringstream rt;
355 typename std::vector<State>::const_iterator ii=
states.begin(), ee=
states.end();
356 for (; ii != ee; ++ii)
358 rt << (int)(ii-
states.begin()) <<
": " << ii->tostring() << std::endl;
369 template <
class Buffer>
372 int si =
states[ stateidx].next;
375 if (
states[ si].core.typeidx &&
states[ si].core.mask.matches( e))
377 buf.push_back(
states[ si].core.typeidx);
442 int defineNext(
int stateidx,
Operation op,
unsigned int keysize,
const char* key,
const char* srckey,
bool follow=
false) throw(
exception,std::bad_alloc)
455 for (
int ee=stateidx; ee != -1; stateidx=ee,ee=
states[ee].link)
457 if ((
states[ee].key != 0) && (keysize ==
states[ee].keysize) && (
states[ee].core.follow == follow) && (mask ==
states[ee].core.mask))
460 for (ii=0; ii<keysize &&
states[ee].key[ii]==key[ii]; ii++);
461 if (ii == keysize)
return states[ee].next;
464 if (!
states[ stateidx].isempty())
466 while (
states[ stateidx].link >= 0)
468 stateidx =
states[ stateidx].link;
475 unsigned int lastidx =
states.size()-1;
476 states[ stateidx].defineNext( op, keysize, key, srckey, lastidx, follow);
477 return stateidx=lastidx;
479 catch (std::bad_alloc)
497 int defineOutput(
int stateidx,
const Mask& printOpMask,
int typeidx,
bool follow,
int start,
int end)
throw(exception,std::bad_alloc)
509 if (!
states[stateidx].isempty())
511 while (
states[ stateidx].link >= 0)
513 stateidx =
states[stateidx].link;
520 states[ stateidx].defineOutput( printOpMask, typeidx, follow, start, end);
523 catch (std::bad_alloc)
552 Range(
const Range& o) :start(o.start),end(o.end){}
556 Range(
int p_start,
int p_end) :start(p_start),end(p_end){}
559 Range(
int count) :start(0),end(count){}
561 Range() :start(0),end(-1){}
592 char* itr =
const_cast<char*
>(value);
594 if (!StaticXMLScanner::parseStaticToken( isTagCharMap, itr, pb))
598 stateidx = xs->defineNext( stateidx, op, pb.
size(), pb.
ptr(), value, follow);
602 stateidx = xs->defineNext( stateidx, op, 0, 0, 0, follow);
624 range = Range( p_start, p_end);
626 else if (p_end < range.end)
630 else if (p_start > range.start)
632 range.start = p_start;
642 return doRange( 0, p_count);
650 return doRange( p_start, std::numeric_limits<int>::max());
656 PathElement& push(
int typeidx)
throw(exception,std::bad_alloc)
658 if (xs != 0) stateidx = xs->defineOutput( stateidx, printOpMask, typeidx, follow, range.start, range.end);
664 PathElement() :xs(0),stateidx(0),follow(false),pushOpMask(0),printOpMask(0){}
671 PathElement(
const PathElement& orig) :xs(orig.xs),stateidx(orig.stateidx),range(orig.range),follow(orig.follow),pushOpMask(orig.pushOpMask),printOpMask(orig.printOpMask) {}
715 PathElement&
TO(
int idx)
throw(exception,std::bad_alloc) {
return doCount((idx>=0)?(idx+1):-1);}
719 PathElement&
FROM(
int idx)
throw(exception,std::bad_alloc) {
return doStart(idx);
return *
this;}
724 PathElement&
RANGE(
int idx1,
int idx2)
throw(exception,std::bad_alloc) {
return doRange(idx1,(idx2>=0)?(idx2+1):-1);
return *
this;}
728 PathElement&
INDEX(
int idx)
throw(exception,std::bad_alloc) {
return doRange(idx,idx+1);
return *
this;}
753 return PathElement(
this);
std::vector< State > states
Definition: xmlpathautomaton.hpp:349
PathElement & FROM(int idx)
Define minimum element index to push.
Definition: xmlpathautomaton.hpp:719
unsigned short neg
Definition: xmlpathautomaton.hpp:72
PathElement & selectTag(const char *name)
Find tag by name.
Definition: xmlpathautomaton.hpp:684
bool hasReject(XMLScannerBase::ElementType e) const
Definition: xmlpathautomaton.hpp:92
Scope(const Scope &orig)
Copy constructor.
Definition: xmlpathautomaton.hpp:425
~State()
Destructor.
Definition: xmlpathautomaton.hpp:240
bool operator==(const Mask &o)
Definition: xmlpathautomaton.hpp:98
Mask followMask
Definition: xmlpathautomaton.hpp:405
std::size_t size() const
Return the number of characters in the buffer.
Definition: staticbuffer.hpp:99
State()
Constructor.
Definition: xmlpathautomaton.hpp:228
Definition: xmlpathautomaton.hpp:50
unknown error
Definition: exception.hpp:26
PathElement & selectContent()
Define grab content.
Definition: xmlpathautomaton.hpp:746
Definition: xmlpathautomaton.hpp:53
[4] tag attribute value in the XML header
Definition: xmlscanner.hpp:179
PathElement(const PathElement &orig)
Copy constructor.
Definition: xmlpathautomaton.hpp:671
invalid string for a tag or attribute in the automaton definition. Usage error
Definition: exception.hpp:34
PathElement & operator[](const char *name)
Find tag by name.
Definition: xmlpathautomaton.hpp:680
ElementType
Enumeration of XML element types returned by an XML scanner.
Definition: xmlscanner.hpp:173
Base class for structures that can throw exceptions for non recoverable errors.
Definition: exception.hpp:20
Fixed size buffer fulfilling the requirement of a back insertion sequence needed for textwolf output...
Token(const Token &orig)
Copy constructor.
Definition: xmlpathautomaton.hpp:393
PathElement & operator--(int)
Corresponds to "//" in abbreviated syntax of XPath.
Definition: xmlpathautomaton.hpp:675
Tag scope definition.
Definition: xmlpathautomaton.hpp:402
std::string tostring() const
Definition: xmlpathautomaton.hpp:318
Mask mask
Definition: xmlpathautomaton.hpp:203
Core()
Constructor.
Definition: xmlpathautomaton.hpp:210
PathElement operator*()
Get automaton root element to start an XML path definition.
Definition: xmlpathautomaton.hpp:751
bool hasMatch(XMLScannerBase::ElementType e) const
Definition: xmlpathautomaton.hpp:96
PathElement & TO(int idx)
Define maximum element index to push.
Definition: xmlpathautomaton.hpp:715
void join(const Mask &mask)
Join two mask definitions.
Definition: xmlpathautomaton.hpp:188
int link
Definition: xmlpathautomaton.hpp:225
Token(const State &state, int p_stateidx)
Constructor by value.
Definition: xmlpathautomaton.hpp:397
void defineNext(Operation op, unsigned int p_keysize, const char *p_key, const char *p_srckey, int p_next, bool p_follow=false)
Define a state transition by key and operation.
Definition: xmlpathautomaton.hpp:288
void reject(XMLScannerBase::ElementType e)
Deactivate operation for a certain element type.
Definition: xmlpathautomaton.hpp:91
const char * ptr() const
Return the buffer content as 0-terminated string.
Definition: staticbuffer.hpp:103
PathElement & assignType(int type)
Define element type to push.
Definition: xmlpathautomaton.hpp:738
Simple back insertion sequence for storing the outputs of textwolf in a constant size buffer...
Definition: staticbuffer.hpp:24
unsigned int tokenidx_from
Definition: xmlpathautomaton.hpp:411
static const char * operationName(Operation op)
Get the name of the operation as string.
Definition: xmlpathautomaton.hpp:61
[3] tag attribute name in the XML header
Definition: xmlscanner.hpp:178
[10] open tag (e.g. "bla" for "<bla...")
Definition: xmlscanner.hpp:185
[13] content element string (separated by spaces or end of line)
Definition: xmlscanner.hpp:188
void defineKey(unsigned int p_keysize, const char *p_key, const char *p_srckey)
Define the matching key of this state.
Definition: xmlpathautomaton.hpp:254
int cnt_end
Definition: xmlpathautomaton.hpp:207
void match(XMLScannerBase::ElementType e)
Declare an operation to match on an element type.
Definition: xmlpathautomaton.hpp:95
void getEmmitedTokens(unsigned int stateidx, XMLScannerBase::ElementType e, Buffer &buf) const
Get the emitted results for a successor state that match an element of a given type.
Definition: xmlpathautomaton.hpp:370
XMLPathSelectAutomaton()
Constructor.
Definition: xmlpathautomaton.hpp:36
parameter check in automaton definition failed. Internal textwolf error
Definition: exception.hpp:33
int next
Definition: xmlpathautomaton.hpp:224
Definition of unicode characters.
out of memory in the automaton definition. System error (std::bad_alloc)
Definition: exception.hpp:35
textwolf exception class
Definition: exception.hpp:48
int Hash
Definition: xmlpathautomaton.hpp:40
void seekop(Operation op)
Declare an operation as seek operation.
Definition: xmlpathautomaton.hpp:104
[9] tag attribute value (e.g. "5" in <person id='5'>)
Definition: xmlscanner.hpp:184
Range(const Scope &orig)
Copy constructor.
Definition: xmlpathautomaton.hpp:419
Mask to query for element types, if they match or not.
Definition: xmlpathautomaton.hpp:69
char * srckey
Definition: xmlpathautomaton.hpp:223
unsigned int tokenidx_to
Definition: xmlpathautomaton.hpp:412
Defines the set of tag characters.
Definition: xmlscanner.hpp:367
Mask(unsigned short p_pos=0, unsigned short p_neg=0)
Constructor by values.
Definition: xmlpathautomaton.hpp:81
void defineOutput(const Mask &mask, int p_typeidx, bool p_follow, int p_start, int p_end)
Define an element output operation.
Definition: xmlpathautomaton.hpp:302
Operation
Definition: xmlpathautomaton.hpp:48
[11] close tag (e.g. "bla" for "</bla>")
Definition: xmlscanner.hpp:186
void defLink(int p_link)
Link another state to check to the current state.
Definition: xmlpathautomaton.hpp:313
Core(const Core &o)
Copy constructor.
Definition: xmlpathautomaton.hpp:213
Definition: xmlpathautomaton.hpp:54
PathElement & RANGE(int idx1, int idx2)
Define minimum and maximum element index to push.
Definition: xmlpathautomaton.hpp:724
[5] end of XML header event (after parsing '?>')
Definition: xmlscanner.hpp:180
Range on the token stack with all tokens that belong to this scope.
Definition: xmlpathautomaton.hpp:409
XMLPathSelectAutomaton< CharSet > ThisXMLPathSelectAutomaton
Definition: xmlpathautomaton.hpp:41
Defines one node in the XML Path element tree in the construction phase.
Definition: xmlpathautomaton.hpp:537
int typeidx
Definition: xmlpathautomaton.hpp:205
Scope()
Constructor.
Definition: xmlpathautomaton.hpp:430
XML scanner template that adds the functionality to the statemachine base definition.
Definition: xmlscanner.hpp:431
Core core
Definition: xmlpathautomaton.hpp:387
Mask(const Mask &orig)
Copy constructor.
Definition: xmlpathautomaton.hpp:85
PathElement & selectAttribute(const char *name)
Find tag with one attribute.
Definition: xmlpathautomaton.hpp:697
Definition of exceptions with containing error codes thrown by textwolf.
virtual ~XMLPathSelectAutomaton()
Definition: xmlpathautomaton.hpp:43
Mask mask
Definition: xmlpathautomaton.hpp:404
PathElement()
Constructor.
Definition: xmlpathautomaton.hpp:664
PathElement & operator()()
Define grab content.
Definition: xmlpathautomaton.hpp:743
PathElement(XMLPathSelectAutomaton *p_xs, int p_si=0)
Constructor by values.
Definition: xmlpathautomaton.hpp:668
unsigned int keysize
Definition: xmlpathautomaton.hpp:221
unsigned short pos
Definition: xmlpathautomaton.hpp:71
Active or passive but still valid token of the XML processing (this is a trigger waiting to match) ...
Definition: xmlpathautomaton.hpp:385
State(const State &orig)
Copy constructor.
Definition: xmlpathautomaton.hpp:233
char * key
Definition: xmlpathautomaton.hpp:222
bool empty() const
Tells if mask does not select anything anymore.
Definition: xmlpathautomaton.hpp:76
XML parser iterator interface for processing the XML elements one by one.
bool isempty()
Check if the state definition is empty.
Definition: xmlpathautomaton.hpp:248
bool rejects(XMLScannerBase::ElementType e) const
Check if an element type should reset a mask.
Definition: xmlpathautomaton.hpp:196
int cnt_start
Definition: xmlpathautomaton.hpp:206
const char * seekopName() const
Get the name of a seek operation.
Definition: xmlpathautomaton.hpp:145
Definition: xmlpathautomaton.hpp:55
CharSet_ CharSet
Definition: xmlpathautomaton.hpp:39
Scope & operator=(const Scope &orig)
Assignment operator.
Definition: xmlpathautomaton.hpp:428
Range range
Definition: xmlpathautomaton.hpp:421
[2] open XML header tag
Definition: xmlscanner.hpp:177
Range()
Constructor.
Definition: xmlpathautomaton.hpp:416
Definition: xmlpathautomaton.hpp:51
Core of an automaton state definition that is used during XML processing.
Definition: xmlpathautomaton.hpp:201
Automaton to define XML path expressions and assign types (int values) to them.
Definition: xmlpathautomaton.hpp:32
PathElement & ifAttribute(const char *name, const char *value)
Find tag with one attribute,value condition.
Definition: xmlpathautomaton.hpp:710
PathElement & INDEX(int idx)
Define the index of the element to push.
Definition: xmlpathautomaton.hpp:728
PathElement & selectCloseTag()
Find close tag of current tag selected.
Definition: xmlpathautomaton.hpp:687
State of an automaton in its definition.
Definition: xmlpathautomaton.hpp:218
int stateidx
Definition: xmlpathautomaton.hpp:388
unsigned int followidx
Definition: xmlpathautomaton.hpp:413
PathElement & operator=(int type)
Define element type to push.
Definition: xmlpathautomaton.hpp:734
bool matches(XMLScannerBase::ElementType e) const
Check if an element type matches the mask.
Definition: xmlpathautomaton.hpp:192
Token()
Constructor.
Definition: xmlpathautomaton.hpp:391
Core core
Definition: xmlpathautomaton.hpp:220
std::string tostring() const
Returns the content of the automaton as pretty printed string for debug output.
Definition: xmlpathautomaton.hpp:352
Definition: xmlpathautomaton.hpp:56
[8] tag attribute name (e.g. "id" in <person id='5'>)
Definition: xmlscanner.hpp:183
void reset()
Reset operation (deactivate)
Definition: xmlpathautomaton.hpp:88
bool follow
Definition: xmlpathautomaton.hpp:204
Definition: xmlpathautomaton.hpp:52
[12] immediate close tag (e.g. "bla" for "<bla />")
Definition: xmlscanner.hpp:187
Character set encodings already implemented in textwolf.