textwolf  0.2
xmlscanner.hpp
Go to the documentation of this file.
1 /*
2  * Copyright (c) 2014 Patrick P. Frey
3  *
4  * This Source Code Form is subject to the terms of the Mozilla Public
5  * License, v. 2.0. If a copy of the MPL was not distributed with this
6  * file, You can obtain one at http://mozilla.org/MPL/2.0/.
7  */
10 
11 #ifndef __TEXTWOLF_XML_SCANNER_HPP__
12 #define __TEXTWOLF_XML_SCANNER_HPP__
#include "textwolf/char.hpp"
#include "textwolf/exception.hpp"
#include "textwolf/textscanner.hpp"
#include "textwolf/traits.hpp"
#include <cstddef>
#include <cstring>
#include <map>
20 
21 namespace textwolf {
22 
26 {
27 public:
28  enum
29  {
31  };
// One state of the scanner state machine.
// NOTE(review): this listing has lost some lines of the struct (the 'fallbackState' and
// 'action' member declarations and the constructor header are referenced elsewhere in the
// file but not visible here) - restore them from the original header before compiling.
34  struct Element
35  {
// Error code reported when an input event matches neither a transition nor a fallback
// (-1 is tested as "not set" by addMiss and nextItem).
37  int missError;
38 
// Action attached to a state: operation code plus one argument.
41  struct Action
42  {
43  int op;
44  int arg;
45  };
// Number of transitions defined for this state; incremented by addTransition and set to
// NofControlCharacter by addOtherTransition.
47  unsigned char nofnext;
// Follow state for every control character; -1 marks "no transition defined".
48  signed char next[ NofControlCharacter];
49 
// Constructor body: no action (op == -1) and no transition for any control character.
52  {
53  action.op = -1;
54  action.arg = 0;
55  for (unsigned int ii=0; ii<NofControlCharacter; ii++) next[ii] = -1;
56  }
57  };
61  Element* get( int stateIdx) throw(exception)
62  {
63  if ((unsigned int)stateIdx>size) throw exception(InvalidState);
64  return tab + stateIdx;
65  }
66 
67 private:
68  Element tab[ MaxNofStates];
69  unsigned int size;
70 
73  void newState( int stateIdx) throw(exception)
74  {
75  if (size != (unsigned int)stateIdx) throw exception( StateNumbersNotAscending);
76  if (size >= MaxNofStates) throw exception( DimOutOfRange);
77  size++;
78  }
79 
82  void addOtherTransition( int nextState) throw(exception)
83  {
84  if (size == 0) throw exception( InvalidState);
85  if (nextState < 0 || nextState > MaxNofStates) throw exception( InvalidParamState);
86  for (unsigned int inputchr=0; inputchr<NofControlCharacter; inputchr++)
87  {
88  if (tab[ size-1].next[ inputchr] == -1) tab[ size-1].next[ inputchr] = (unsigned char)nextState;
89  }
90  tab[ size-1].nofnext = NofControlCharacter;
91  }
92 
96  void addTransition( ControlCharacter inputchr, int nextState) throw(exception)
97  {
98  if (size == 0) throw exception( InvalidState);
99  if ((int)inputchr >= (int)NofControlCharacter) throw exception( InvalidParamChar);
100  if (nextState < 0 || nextState > MaxNofStates) throw exception( InvalidParamState);
101  if (tab[ size-1].next[ inputchr] != -1) throw exception( DuplicateStateTransition);
102  tab[ size-1].next[ inputchr] = (unsigned char)nextState;
103  tab[ size-1].nofnext += 1;
104  }
105 
108  void addTransition( ControlCharacter inputchr) throw(exception)
109  {
110  addTransition( inputchr, size-1);
111  }
112 
116  void addAction( int action_op, int action_arg=0) throw(exception)
117  {
118  if (size == 0) throw exception( InvalidState);
119  if (tab[ size-1].action.op != -1) throw exception( InvalidState);
120  tab[ size-1].action.op = action_op;
121  tab[ size-1].action.arg = action_arg;
122  }
123 
126  void addMiss( int error) throw(exception)
127  {
128  if (size == 0) throw exception( InvalidState);
129  if (tab[ size-1].missError != -1) throw exception( InvalidState);
130  tab[ size-1].missError = error;
131  }
132 
135  void addFallback( int stateIdx) throw(exception)
136  {
137  if (size == 0) throw exception( InvalidState);
138  if (tab[ size-1].fallbackState != -1) throw exception( InvalidState);
139  if (stateIdx < 0 || stateIdx > MaxNofStates) throw exception( InvalidParamState);
140  tab[ size-1].fallbackState = stateIdx;
141  }
142 public:
// Constructor: starts with an empty state table (no state defined yet).
144  ScannerStatemachine() :size(0){}
145 
// The members below form a chained definition syntax: each one forwards to the
// corresponding private add... method and returns *this so calls can be concatenated.
// Open the definition of state 'stateIdx' (see newState).
147  ScannerStatemachine& operator[]( int stateIdx) {newState(stateIdx); return *this;}
// Transition on 'inputchr' to state 'ns' (see addTransition(ControlCharacter,int)).
149  ScannerStatemachine& operator()( ControlCharacter inputchr, int ns) {addTransition(inputchr,ns); return *this;}
// Transition on either of two control characters to state 'ns'.
151  ScannerStatemachine& operator()( ControlCharacter i1, ControlCharacter i2, int ns) {addTransition(i1,ns); addTransition(i2,ns); return *this;}
// Transition on any of three control characters to state 'ns'.
153  ScannerStatemachine& operator()( ControlCharacter i1, ControlCharacter i2, ControlCharacter i3, int ns) {addTransition(i1,ns); addTransition(i2,ns); addTransition(i3,ns); return *this;}
// Transition on 'inputchr' that stays in the current state (see addTransition(ControlCharacter)).
155  ScannerStatemachine& operator()( ControlCharacter inputchr) {addTransition(inputchr); return *this;}
// Attach action 'aa' with argument 'arg' to the current state (see addAction).
157  ScannerStatemachine& action( int aa, int arg=0) {addAction(aa,arg); return *this;}
// Define the error code 'ee' reported on a non matching event (see addMiss).
159  ScannerStatemachine& miss( int ee) {addMiss(ee); return *this;}
// Define the fallback state of the current state (see addFallback).
161  ScannerStatemachine& fallback( int stateIdx) {addFallback(stateIdx); return *this;}
// Define the follow state for all control characters without transition (see addOtherTransition).
163  ScannerStatemachine& other( int stateIdx) {addOtherTransition(stateIdx); return *this;}
164 };
165 
169 {
170 public:
174  {
190  };
191  enum
192  {
194  };
195 
199  static const char* getElementTypeName( ElementType ee)
200  {
201  static const char* names[ NofElementTypes] = {"None","ErrorOccurred","HeaderStart","HeaderAttribName","HeaderAttribValue","HeaderEnd", "DocAttribValue", "DocAttribEnd", "TagAttribName","TagAttribValue","OpenTag","CloseTag","CloseTagIm","Content","Exit"};
202  return names[ (unsigned int)ee];
203  }
204 
207  enum Error
208  {
209  Ok,
225  };
226 
230  static const char* getErrorString( Error ee)
231  {
232  enum {NofErrors=16};
233  static const char* sError[NofErrors]
234  = {0,"illegal document attribute definition",
235  "expected open tag",
236  "expected XML tag",
237  "unexpected end of text",
238  "syntax token",
239  "string not terminated",
240  "undefined character entity",
241  "expected tag end",
242  "expected equal",
243  "expected tag attribute",
244  "expected CDATA tag",
245  "internal (illegal state)",
246  "unexpected end of input",
247  "expected end of line",
248  "expected 2nd '-' to complete marker for start of comment '<!--'"
249  };
250  return sError[(unsigned int)ee];
251  }
252 
255  enum STMState
256  {
261  };
262 
266  static const char* getStateString( STMState s)
267  {
268  enum Constant {NofStates=48};
269  static const char* sState[NofStates]
270  = {
271  "START", "STARTTAG", "XTAG", "PITAG", "PITAGEND",
272  "XTAGEND", "XTAGDONE", "XTAGAISK", "XTAGANAM",
273  "XTAGAESK", "XTAGAVSK", "XTAGAVID", "XTAGAVSQ", "XTAGAVDQ",
274  "XTAGAVQE", "DOCSTART", "CONTENT", "TOKEN", "SEEKTOK", "XMLTAG",
275  "OPENTAG", "CLOSETAG", "TAGCLSK", "TAGAISK", "TAGANAM",
276  "TAGAESK", "TAGAVSK", "TAGAVID", "TAGAVSQ", "TAGAVDQ",
277  "TAGAVQE", "TAGCLIM", "ENTITYSL", "ENTITY", "ENTITYE",
278  "ENTITYID", "ENTITYSQ", "ENTITYDQ", "ENTITYLC",
279  "COMDASH2", "COMSEEKE", "COMENDD2", "COMENDCL",
280  "CDATA", "CDATA1", "CDATA2", "CDATA3", "EXIT"
281  };
282  return sState[(unsigned int)s];
283  }
284 
288  {
291  };
292 
296  static const char* getActionString( STMAction a)
297  {
298  static const char* name[ NofSTMActions] = {"Return", "ReturnWord", "ReturnContent", "ReturnIdentifier", "ReturnSQString", "ReturnDQString", "ExpectIdentifierXML", "ExpectIdentifierCDATA", "ReturnEOF"};
299  return name[ (unsigned int)a];
300  };
301 
305  {
308  {
309  (*this)
357  [ EXIT ].action(Return,Exit);
358  }
359  };
360 
364 
368  {
370  {
371  (*this)(Undef,true)(Any,true)(Dash,true);
372  }
373  };
374 
379  {
381  {
382  (*this)(Undef,true)(Equal,true)(Gt,true)(Slash,true)(Dash,true)(Exclam,true)(Questm,true)(Sq,true)(Dq,true)(Osb,true)(Csb,true)(Any,true);
383  }
384  };
385 
389  {
391  {
392  (*this)(Cntrl,true)(Space,true)(EndOfLine,true)(Undef,true)(Equal,true)(Gt,true)(Slash,true)(Dash,true)(Exclam,true)(Questm,true)(Sq,true)(Dq,true)(Osb,true)(Csb,true)(Any,true);
393  }
394  };
395 
399  {
401  {
402  (*this)(Sq,false)(Space,true);
403  }
404  };
405 
409  {
411  {
412  (*this)(Dq,false)(Space,true);
413  }
414  };
415 };
416 
417 
424 template
425 <
426  class InputIterator,
427  class InputCharSet_,
428  class OutputCharSet_,
429  class OutputBuffer_
430 >
432 {
433 private:
436  struct TokState
437  {
441  enum Id
442  {
443  Start,
444  ParsingDone,
445  ParsingKey,
446  ParsingEntity,
447  ParsingNumericEntity,
448  ParsingNumericBaseEntity,
449  ParsingNamedEntity,
450  ParsingToken
451  };
452  Id id;
453 
454  enum EolnState
455  {
456  SRC,CR
457  };
458  EolnState eolnState;
459 
460  unsigned int pos;
461  unsigned int base;
462  EChar value;
463  char buf[ 16];
464  UChar curchr_saved;
465 
467  TokState() :id(Start),eolnState(SRC),pos(0),base(0),value(0),curchr_saved(0) {}
468 
472  void init(Id id_=Start, EolnState eolnState_=SRC)
473  {
474  id=id_;eolnState=eolnState_;pos=0;base=0;value=0;curchr_saved=0;
475  }
476  };
477  TokState tokstate;
478 
479 public:
480  typedef InputCharSet_ InputCharSet;
481  typedef OutputCharSet_ OutputCharSet;
482  class iterator;
483 
484 public:
487  typedef std::map<const char*,UChar> EntityMap;
488  typedef OutputBuffer_ OutputBuffer;
489 
490 private:
// Print the character 'ch' with the output character set encoding to the output buffer.
493  void push( UChar ch)
494  {
495  m_output.print( ch, m_outputBuf);
496  }
497 
// Variant selected when input and output character set are the same type:
// the source copies its current character to the output itself.
498  void copychar_impl( const traits::TypeCheck::YES&)
499  {
500  m_src.copychar( m_output, m_outputBuf);
501  }
502 
// Variant selected when the character sets differ:
// the current source character is fetched with chr() and printed with push().
503  void copychar_impl( const traits::TypeCheck::NO&)
504  {
505  push( m_src.chr());
506  }
507 
// Copy the current source character to the output buffer; the overload of
// copychar_impl is chosen by comparing InputCharSet and OutputCharSet.
508  void copychar()
509  {
510  copychar_impl( traits::TypeCheck::is_same<InputCharSet,OutputCharSet>::type());
511  }
512 
515  static unsigned char HEX( unsigned char ch)
516  {
517  struct HexCharMap :public CharMap<unsigned char, 0xFF>
518  {
519  HexCharMap()
520  {
521  (*this)
522  ('0',0) ('1', 1)('2', 2)('3', 3)('4', 4)('5', 5)('6', 6)('7', 7)('8', 8)('9', 9)
523  ('A',10)('B',11)('C',12)('D',13)('E',14)('F',15)('a',10)('b',11)('c',12)('d',13)('e',14)('f',15);
524  }
525  };
526  static HexCharMap hexCharMap;
527  return hexCharMap[ch];
528  }
529 
533  static UChar parseStaticNumericEntityValue( InputReader& ir)
534  {
535  EChar value = 0;
536  unsigned char ch = ir.ascii();
537  unsigned int base;
538  if (ch != '#') return 0;
539  ir.skip();
540  ch = ir.ascii();
541  if (ch == 'x')
542  {
543  ir.skip();
544  ch = ir.ascii();
545  base = 16;
546  }
547  else
548  {
549  base = 10;
550  }
551  while (ch != ';')
552  {
553  unsigned char chval = HEX(ch);
554  if (value >= base) return 0;
555  value = value * base + chval;
556  if (value >= 0xFFFFFFFF) return 0;
557  ir.skip();
558  ch = ir.ascii();
559  }
560  return (UChar)value;
561  }
562 
565  void fallbackEntity()
566  {
567  switch (tokstate.id)
568  {
569  case TokState::Start:
570  case TokState::ParsingDone:
571  case TokState::ParsingKey:
572  case TokState::ParsingToken:
573  break;
574  case TokState::ParsingEntity:
575  push('&');
576  break;
577  case TokState::ParsingNumericEntity:
578  push('&');
579  push('#');
580  break;
581  case TokState::ParsingNumericBaseEntity:
582  push('&');
583  push('#');
584  for (unsigned int ii=0; ii<tokstate.pos; ii++) push( tokstate.buf[ii]);
585  break;
586  case TokState::ParsingNamedEntity:
587  push('&');
588  for (unsigned int ii=0; ii<tokstate.pos; ii++) push( tokstate.buf[ii]);
589  break;
590  }
591  }
592 
595  bool parseEntity()
596  {
597  unsigned char ch;
598  tokstate.id = TokState::ParsingEntity;
599  ch = m_src.ascii();
600  if (ch == '#')
601  {
602  m_src.skip();
603  return parseNumericEntity();
604  }
605  else
606  {
607  return parseNamedEntity();
608  }
609  }
610 
613  bool parseNumericEntity()
614  {
615  unsigned char ch;
616  tokstate.id = TokState::ParsingNumericEntity;
617  ch = m_src.ascii();
618  if (ch == 'x')
619  {
620  tokstate.base = 16;
621  m_src.skip();
622  return parseNumericBaseEntity();
623  }
624  else
625  {
626  tokstate.base = 10;
627  return parseNumericBaseEntity();
628  }
629  }
630 
633  bool parseNumericBaseEntity()
634  {
635  unsigned char ch;
636  tokstate.id = TokState::ParsingNumericBaseEntity;
637 
638  while (tokstate.pos < sizeof(tokstate.buf))
639  {
640  ch = m_src.ascii();
641  if (ch == ';')
642  {
643  if (tokstate.value > 0xFFFFFFFF)
644  {
645  tokstate.buf[ tokstate.pos++] = ch;
646  fallbackEntity();
647  return true;
648  }
649  push( (UChar)tokstate.value);
650  tokstate.init( TokState::ParsingToken);
651  m_src.skip();
652  return true;
653  }
654  else
655  {
656  unsigned char chval = HEX(ch);
657  if (chval >= tokstate.base)
658  {
659  fallbackEntity();
660  return true;
661  }
662  tokstate.buf[ tokstate.pos++] = ch;
663  tokstate.value = tokstate.value * tokstate.base + chval;
664  m_src.skip();
665  }
666  }
667  fallbackEntity();
668  return true;
669  }
670 
673  bool parseNamedEntity()
674  {
675  unsigned char ch;
676  tokstate.id = TokState::ParsingNamedEntity;
677  ch = m_src.ascii();
678  while (tokstate.pos < sizeof(tokstate.buf)-1 && ch != ';' && m_src.control() == Any)
679  {
680  tokstate.buf[ tokstate.pos] = ch;
681  m_src.skip();
682  tokstate.pos++;
683  ch = m_src.ascii();
684  }
685  if (ch == ';')
686  {
687  tokstate.buf[ tokstate.pos] = '\0';
688  if (!pushEntity( tokstate.buf)) return false;
689  tokstate.init( TokState::ParsingToken);
690  m_src.skip();
691  return true;
692  }
693  else
694  {
695  fallbackEntity();
696  return true;
697  }
698  }
699 
702  bool parseTokenRecover()
703  {
704  bool rt = false;
705  if (tokstate.curchr_saved)
706  {
707  push( tokstate.curchr_saved);
708  tokstate.curchr_saved = 0;
709  }
710  switch (tokstate.id)
711  {
712  case TokState::Start:
713  case TokState::ParsingDone:
714  case TokState::ParsingKey:
715  case TokState::ParsingToken:
716  error = ErrInternal;
717  return false;
718  case TokState::ParsingEntity: rt = parseEntity(); break;
719  case TokState::ParsingNumericEntity: rt = parseNumericEntity(); break;
720  case TokState::ParsingNumericBaseEntity: rt = parseNumericBaseEntity(); break;
721  case TokState::ParsingNamedEntity: rt = parseNamedEntity(); break;
722  }
723  tokstate.init( TokState::ParsingToken);
724  return rt;
725  }
726 
// Parse a token into the output buffer.
// isTok: set of control characters that belong to the token.
// Returns true when the token is complete (tokstate.id is ParsingDone then),
// false when an entity could not be parsed (tokstate is reset to Start then).
730  bool parseToken( const IsTokenCharMap& isTok)
731  {
732  if (tokstate.id == TokState::Start)
733  {
// Fresh token: remember where it starts and begin with an empty output buffer.
734  m_tokenpos = m_src.getPosition();
735  tokstate.id = TokState::ParsingToken;
736  m_outputBuf.clear();
737  }
738  else if (tokstate.id != TokState::ParsingToken)
739  {
// The stored state says that an entity was being parsed: continue with it first.
740  if (!parseTokenRecover())
741  {
742  tokstate.init();
743  return false;
744  }
745  }
746  for (;;)
747  {
751  ControlCharacter ch;
// Copy all characters belonging to the token; 'ch' keeps the first one that does not.
752  while (isTok[ (unsigned char)(ch=m_src.control())])
753  {
754  unsigned char aa = m_src.ascii();
// Cheap pre-check: '\r' (0xD) and '\n' (0xA) are both <= 0xD.
755  if (aa <= 0xD)
756  {
757  //handling W3C requirements for end of line translation in XML:
758  if (aa == '\r')
759  {
// A carriage return is printed as '\n'; the CR state suppresses a directly following '\n'.
760  push( (unsigned char)'\n');
761  tokstate.eolnState = TokState::CR;
762  }
763  else if (aa == '\n')
764  {
// A '\n' directly after '\r' has already been printed.
765  if (tokstate.eolnState != TokState::CR)
766  {
767  push( (unsigned char)'\n');
768  }
769  tokstate.eolnState = TokState::SRC;
770  }
771  else
772  {
773  copychar();
774  tokstate.eolnState = TokState::SRC;
775  }
776  }
777  else
778  {
779  copychar();
780  tokstate.eolnState = TokState::SRC;
781  }
782  m_src.skip();
783  }
784  if (ch == Amp)
785  {
// Entity: step over the '&', let parseEntity() print its replacement, then go on with the token.
786  m_src.skip();
787  if (!parseEntity()) break;
788  tokstate.init( TokState::ParsingToken);
789  continue;
790  }
791  else
792  {
// Any other character ends the token; it is left in the source.
793  tokstate.init( TokState::ParsingDone);
794  return true;
795  }
796  }
// Only reached when parseEntity() failed.
797  tokstate.init();
798  return false;
799  }
800 
801 public:
808  template <class OutputBufferType>
809  static bool parseStaticToken( const IsTokenCharMap& isTok, InputReader ir, OutputBufferType& buf)
810  {
811  static OutputCharSet output;
812  buf.clear();
813  for (;;)
814  {
815  ControlCharacter ch;
816  for (;;)
817  {
818  UChar pc;
819  if (isTok[ (unsigned char)(ch=ir.control())])
820  {
821  pc = ir.chr();
822  }
823  else if (ch == Amp)
824  {
825  pc = parseStaticNumericEntityValue( ir);
826  }
827  else
828  {
829  return true;
830  }
831  output.print( pc, buf);
832  ir.skip();
833  }
834  }
835  }
836 
837 private:
841  bool skipToken( const IsTokenCharMap& isTok)
842  {
843  do
844  {
845  ControlCharacter ch;
846  while (isTok[ (unsigned char)(ch=m_src.control())] || ch == Amp)
847  {
848  m_src.skip();
849  }
850  }
851  while (m_src.control() == Any);
852  return true;
853  }
854 
858  bool expectStr( const char* str)
859  {
860  bool rt = true;
861  tokstate.id = TokState::ParsingKey;
862  for (; str[tokstate.pos] != '\0'; m_src.skip(),tokstate.pos++)
863  {
864  if (m_src.ascii() == str[ tokstate.pos]) continue;
865  ControlCharacter ch = m_src.control();
866  if (ch == EndOfText)
867  {
868  error = ErrUnexpectedEndOfText;
869  }
870  else
871  {
872  error = ErrSyntaxToken;
873  }
874  rt = false;
875  break;
876  }
877  tokstate.init( TokState::ParsingDone);
878  return rt;
879  }
880 
884  bool pushPredefinedEntity( const char* str)
885  {
886  switch (str[0])
887  {
888  case 'q':
889  if (str[1] == 'u' && str[2] == 'o' && str[3] == 't' && str[4] == '\0')
890  {
891  push( '\"');
892  return true;
893  }
894  break;
895 
896  case 'a':
897  if (str[1] == 'm')
898  {
899  if (str[2] == 'p' && str[3] == '\0')
900  {
901  push( '&');
902  return true;
903  }
904  }
905  else if (str[1] == 'p')
906  {
907  if (str[2] == 'o' && str[3] == 's' && str[4] == '\0')
908  {
909  push( '\'');
910  return true;
911  }
912  }
913  break;
914 
915  case 'l':
916  if (str[1] == 't' && str[2] == '\0')
917  {
918  push( '<');
919  return true;
920  }
921  break;
922 
923  case 'g':
924  if (str[1] == 't' && str[2] == '\0')
925  {
926  push( '>');
927  return true;
928  }
929  break;
930 
931  case 'n':
932  if (str[1] == 'b' && str[2] == 's' && str[3] == 'p' && str[4] == '\0')
933  {
934  push( ' ');
935  return true;
936  }
937  break;
938  }
939  return false;
940  }
941 
945  bool pushEntity( const char* str)
946  {
947  if (pushPredefinedEntity( str))
948  {
949  return true;
950  }
951  else if (m_entityMap)
952  {
953  EntityMap::const_iterator itr = m_entityMap->find( str);
954  if (itr == m_entityMap->end())
955  {
957  return false;
958  }
959  else
960  {
961  UChar ch = itr->second;
962  push( ch);
963  return true;
964  }
965  }
966  else
967  {
969  return false;
970  }
971  }
972 
973 private:
974  STMState state;
975  Error error;
976  InputReader m_src;
977  const EntityMap* m_entityMap;
978  OutputBuffer m_outputBuf;
979  OutputCharSet m_output;
980  std::size_t m_tokenpos;
981 
982 public:
// Constructor.
// p_src: source iterator; p_entityMap: map of named entities (only its address is stored,
// so it has to outlive the scanner). The input character set is default constructed.
986  XMLScanner( const InputIterator& p_src, const EntityMap& p_entityMap)
987  :state(START),error(Ok),m_src(InputCharSet(),p_src),m_entityMap(&p_entityMap),m_output(OutputCharSet()),m_tokenpos(0)
988  {}
// Constructor without entity map (m_entityMap is 0, only predefined entities are known to pushEntity).
991  explicit XMLScanner( const InputIterator& p_src)
992  :state(START),error(Ok),m_src(InputCharSet(),p_src),m_entityMap(0),m_output(OutputCharSet()),m_tokenpos(0)
993  {}
// Constructor with an explicit input character set instance, a source iterator and an entity map.
998  XMLScanner( const InputCharSet& p_charset, const InputIterator& p_src, const EntityMap& p_entityMap)
999  :state(START),error(Ok),m_src(p_charset,p_src),m_entityMap(&p_entityMap),m_output(OutputCharSet()),m_tokenpos(0)
1000  {}
// Constructor with an explicit input character set instance and a source iterator, without entity map.
1004  XMLScanner( const InputCharSet& p_charset, const InputIterator& p_src)
1005  :state(START),error(Ok),m_src(p_charset,p_src),m_entityMap(0),m_output(OutputCharSet()),m_tokenpos(0)
1006  {}
// Constructor with an input character set instance only; no source iterator is passed here
// (see setSource), m_output is default constructed.
1009  explicit XMLScanner( const InputCharSet& p_charset)
1010  :state(START),error(Ok),m_src(p_charset),m_entityMap(0),m_tokenpos(0)
1011  {}
1014  :state(START),error(Ok),m_src(InputCharSet()),m_entityMap(0),m_tokenpos(0)
1015  {}
1016 
1020  :state(o.state)
1021  ,error(o.error)
1022  ,m_src(o.m_src)
1023  ,m_entityMap(o.m_entityMap)
1024  ,m_outputBuf(o.m_outputBuf)
1025  ,m_tokenpos(o.m_tokenpos)
1026  {}
1027 
// Assign something to the source iterator while keeping the state (forwarded to the input reader).
1030  template <class IteratorAssignment>
1031  void setSource( const IteratorAssignment& a)
1032  {
1033  m_src.setSource( a);
1034  }
1035 
// Get the current position of the input reader.
1038  std::size_t getPosition() const
1039  {
1040  return m_src.getPosition();
1041  }
// Get the current token position (the source position recorded when the item was started).
1043  std::size_t getTokenPosition() const
1044  {
1045  return m_tokenpos;
1046  }
1047 
// Get the pointer to the current item in the output buffer.
// For an empty buffer a pointer to a constant string of zero bytes is returned instead of a null pointer.
1050  const char* getItemPtr() const {return m_outputBuf.size()?&m_outputBuf.at(0):"\0\0\0\0";}
1051 
// Get the size of the current item as reported by the output buffer.
1054  std::size_t getItemSize() const {return m_outputBuf.size();}
1055 
// Get the output buffer holding the current item.
1058  const OutputBuffer& getItem() const
1059  {
1060  return m_outputBuf;
1061  }
1062 
1066  {
1067  static Statemachine stm;
1068  return stm.get( state);
1069  }
1070 
1074  Error getError( const char** str=0)
1075  {
1076  Error rt = error;
1077  error = Ok;
1078  if (str) *str=getErrorString(rt);
1079  return rt;
1080  }
1081 
// Get the iterator pointing to the current source position (read only access).
1083  const InputIterator& getIterator() const
1084  {
1085  return m_src.getIterator();
1086  }
1087 
// Get the iterator pointing to the current source position (modifiable).
1089  InputIterator& getIterator()
1090  {
1091  return m_src.getIterator();
1092  }
1093 
// Scan the next XML element.
// mask: bit set of element types; for a token whose type bit (1<<type) is not set
//       the content is skipped instead of being parsed into the output buffer.
// Returns the type of the element found, or ErrorOccurred (see getError).
1097  ElementType nextItem( unsigned short mask=0xFFFF)
1098  {
// Character classes of the token kinds, indexed by the action code of the state machine.
1099  static const IsWordCharMap wordC;
1100  static const IsContentCharMap contentC;
1101  static const IsTagCharMap tagC;
1102  static const IsSQStringCharMap sqC;
1103  static const IsDQStringCharMap dqC;
// Per action code: the token character set to use (0 = the action does not parse a token).
1104  static const IsTokenCharMap* tokenDefs[ NofSTMActions] = {0,&wordC,&contentC,&tagC,&sqC,&dqC,0,0,0};
// Per action code: the literal string the action expects (0 = none).
1105  static const char* stringDefs[ NofSTMActions] = {0,0,0,0,0,0,"xml","CDATA",0};
1106 
1107  ElementType rt = None;
1108  ControlCharacter ch;
1109  do
1110  {
// NOTE(review): 'sd' (the current state element) is used below but its declaration is not
// visible in this listing - presumably fetched with getState() at this point; confirm.
1112  if (sd->action.op != -1)
1113  {
1114  if (tokenDefs[sd->action.op])
1115  {
// Token action: parse or skip the token, unless it was already completed (ParsingDone).
1116  if (tokstate.id != TokState::ParsingDone)
1117  {
1118  if ((mask&(1<<sd->action.arg)) != 0)
1119  {
1120  if (!parseToken( *tokenDefs[ sd->action.op])) return ErrorOccurred;
1121  }
1122  else
1123  {
1124  if (!skipToken( *tokenDefs[ sd->action.op])) return ErrorOccurred;
1125  }
1126  }
1127  rt = (ElementType)sd->action.arg;
1128  }
1129  else if (stringDefs[sd->action.op])
1130  {
// String action: the source has to match the literal defined for the action.
1131  if (tokstate.id != TokState::ParsingDone)
1132  {
1133  if (!expectStr( stringDefs[sd->action.op])) return ErrorOccurred;
1134  if (sd->action.op == ExpectIdentifierXML)
1135  {
1136  //... special treatment for xml header for not
1137  // enforcing the model too much just for this case
1138  push( '?'); push( 'x'); push( 'm'); push( 'l');
1139  rt = HeaderStart;
1140  }
1141  }
1142  else if (sd->action.op == ExpectIdentifierXML)
1143  {
1144  //... special treatment for xml header for not
1145  // enforcing the model too much just for this case
1146  rt = HeaderStart;
1147  }
1148  }
1149  else
1150  {
// Action without token and without string: an element with empty content is returned.
1151  m_tokenpos = m_src.getPosition();
1152  m_outputBuf.clear();
1153  rt = (ElementType)sd->action.arg;
1154  }
// A state without transitions returns immediately, moving to its fallback state if one is defined.
1155  if (sd->nofnext == 0)
1156  {
1157  if (sd->fallbackState != -1)
1158  {
1159  state = (STMState)sd->fallbackState;
1160  }
1161  return rt;
1162  }
1163  }
1164  ch = m_src.control();
1165  tokstate.id = TokState::Start;
1166 
// State transition: a matching transition consumes the character, a fallback does not;
// without either the miss error of the state (or a generic error) is reported.
1167  if (sd->next[ ch] != -1)
1168  {
1169  state = (STMState)sd->next[ ch];
1170  m_src.skip();
1171  }
1172  else if (sd->fallbackState != -1)
1173  {
1174  state = (STMState)sd->fallbackState;
1175  }
1176  else if (sd->missError != -1)
1177  {
1178  error = (Error)sd->missError;
1179  return ErrorOccurred;
1180  }
1181  else if (ch == EndOfText)
1182  {
1183  error = ErrUnexpectedEndOfText;
1184  return ErrorOccurred;
1185  }
1186  else
1187  {
1188  error = ErrInternal;
1189  return ErrorOccurred;
1190  }
1191  }
// Go on until a state produced an element.
1192  while (rt == None);
1193  return rt;
1194  }
1195 
// End of input tag: empty type used to construct the end iterator (see iterator(const End&)).
1198  struct End {};
1199 
1202  class iterator
1203  {
1204  public:
// Iterator element visited: type, content pointer and content size of one scanned item.
// The content pointer refers to memory owned by the scanner (it is taken from getItemPtr()),
// the element does not copy the content.
1207  class Element
1208  {
1209  private:
1210  friend class iterator;
1211  ElementType m_type;
1212  const char* m_content;
1213  std::size_t m_size;
1214  public:
// True for every element except the end marker (Exit) and an error (ErrorOccurred).
1217  bool valid() const {return m_type != Exit && m_type != ErrorOccurred;}
// The content pointer if the element is an error, 0 else.
1220  const char* error() const {return m_type == ErrorOccurred ? m_content : 0;}
// Name of the element type (see getElementTypeName).
1223  const char* name() const {return getElementTypeName( m_type);}
// Type of the element.
1226  ElementType type() const {return m_type;}
// Pointer to the content of the element.
1229  const char* content() const {return m_content;}
// Size of the content as reported by the scanner (getItemSize()).
1232  std::size_t size() const {return m_size;}
// Constructor: element of type None without content.
1234  Element() :m_type(None),m_content(0),m_size(0) {}
// Constructor of the end marker: element of type Exit without content.
1236  Element( const End&) :m_type(Exit),m_content(0),m_size(0) {}
// Copy constructor (copies type, content pointer and size).
1239  Element( const Element& orig) :m_type(orig.m_type),m_content(orig.m_content),m_size(orig.m_size) {}
1240  };
1241  // input iterator traits
1243  typedef std::size_t difference_type;
1244  typedef std::size_t size_type;
1245  typedef Element* pointer;
1246  typedef Element& reference;
1247  typedef std::input_iterator_tag iterator_category;
1248 
1249  private:
1250  Element element;
1251  ThisXMLScanner* input;
1252 
1256  iterator& skip( unsigned short mask=0xFFFF)
1257  {
1258  if (input != 0)
1259  {
1260  element.m_type = input->nextItem(mask);
1261  element.m_content = input->getItemPtr();
1262  element.m_size = input->getItemSize();
1263  }
1264  return *this;
1265  }
1266 
1270  bool compare( const iterator& iter) const
1271  {
1272  if (element.type() == iter.element.type())
1273  {
1274  if (element.type() == Exit || element.type() == None) return true; //equal only at beginning and end
1275  }
1276  return false;
1277  }
1278  public:
// Assign an iterator to another: both then refer to the same scanner and hold the same element.
1281  void assign( const iterator& orig)
1282  {
1283  input = orig.input;
1284  element = orig.element;
1285  }
// Copy constructor (implemented with assign).
1288  iterator( const iterator& orig)
1289  {
1290  assign( orig);
1291  }
1295  iterator( ThisXMLScanner& p_input, bool doSkipToFirst=true)
1296  :input( &p_input)
1297  {
1298  if (doSkipToFirst)
1299  {
1300  element.m_type = input->nextItem();
1301  element.m_content = input->getItemPtr();
1302  element.m_size = input->getItemSize();
1303  }
1304  }
// Constructor of the end iterator: element of type Exit, no scanner attached.
1306  iterator( const End& et) :element(et),input(0) {}
// Default constructor: element of type None, no scanner attached.
1308  iterator() :input(0) {}
1312  {
1313  assign( orig);
1314  return *this;
1315  }
// Element dereference: the element currently visited.
1317  const Element& operator*() const
1318  {
1319  return element;
1320  }
// Element member access: pointer to the element currently visited.
1322  const Element* operator->() const
1323  {
1324  return &element;
1325  }
// Preincrement: fetch the next element (see skip).
1328  iterator& operator++() {return skip();}
// Postincrement: fetch the next element and return a copy of the iterator as it was before.
1331  iterator operator++(int) {iterator tmp(*this); skip(); return tmp;}
1332 
// Compare to check for equality (see compare).
1335  bool operator==( const iterator& iter) const {return compare( iter);}
// Compare to check for inequality (see compare).
1338  bool operator!=( const iterator& iter) const {return !compare( iter);}
1339  };
1340 
1344  iterator begin( bool doSkipToFirst=true)
1345  {
1346  return iterator( *this, doSkipToFirst);
1347  }
1351  {
1352  return iterator( End());
1353  }
1354 };
1355 
1356 }//namespace
1357 #endif
1358 
1359 
uint64_t EChar
Definition: char.hpp:38
ScannerStatemachine & operator()(ControlCharacter i1, ControlCharacter i2, int ns)
See ScannerStatemachine::addTransition(ControlCharacter,int)
Definition: xmlscanner.hpp:151
end of input tag
Definition: xmlscanner.hpp:1198
iterator & operator++()
Preincrement.
Definition: xmlscanner.hpp:1328
ScannerStatemachine & fallback(int stateIdx)
See ScannerStatemachine::addFallback(int)
Definition: xmlscanner.hpp:161
Definition: xmlscanner.hpp:257
XMLScanner(const InputIterator &p_src, const EntityMap &p_entityMap)
Constructor.
Definition: xmlscanner.hpp:986
const char * getItemPtr() const
Get the current parsed XML element pointer, if it was not masked out, see nextItem(unsigned short) ...
Definition: xmlscanner.hpp:1050
expected equal in tag attribute definition
Definition: xmlscanner.hpp:218
STMState
Enumeration of states of the XML scanner state machine.
Definition: xmlscanner.hpp:255
Element * get(int stateIdx)
Get state addressed by its index.
Definition: xmlscanner.hpp:61
Definition: xmlscanner.hpp:258
Definition: xmlscanner.hpp:258
duplicate transition definition in automaton. Internal textwolf error
Definition: exception.hpp:31
static bool parseStaticToken(const IsTokenCharMap &isTok, InputReader ir, OutputBufferType &buf)
Static version of parse a token for parsing table definition elements.
Definition: xmlscanner.hpp:809
XMLScanner(const InputCharSet &p_charset)
Constructor.
Definition: xmlscanner.hpp:1009
[0] empty (NULL)
Definition: xmlscanner.hpp:175
expected end of tag
Definition: xmlscanner.hpp:217
std::size_t difference_type
Definition: xmlscanner.hpp:1243
expected an open tag in this state
Definition: xmlscanner.hpp:211
Error getError(const char **str=0)
Get the last error.
Definition: xmlscanner.hpp:1074
Element()
Constructor.
Definition: xmlscanner.hpp:1234
Definition: char.hpp:94
std::size_t size_type
Definition: xmlscanner.hpp:1244
Definition: xmlscanner.hpp:259
TextScanner & skip()
Skip to the next character of the source.
Definition: textscanner.hpp:235
Definition: xmlscanner.hpp:289
Definition: char.hpp:80
void setSource(const IteratorAssignment &a)
Assign something to the iterator while keeping the state.
Definition: textscanner.hpp:146
Definition: xmlscanner.hpp:260
Definition: char.hpp:91
Definition: xmlscanner.hpp:260
InputIterator & getIterator()
Get the iterator pointing to the current source position.
Definition: xmlscanner.hpp:1089
void setSource(const IteratorAssignment &a)
Assign something to the source iterator while keeping the state.
Definition: xmlscanner.hpp:1031
[4] tag attribute value in the XML header
Definition: xmlscanner.hpp:179
Definition: xmlscanner.hpp:258
named entity is not defined in the entity map
Definition: xmlscanner.hpp:216
Definition: char.hpp:86
ElementType
Enumeration of XML element types returned by an XML scanner.
Definition: xmlscanner.hpp:173
Definition: xmlscanner.hpp:260
iterator & operator=(const iterator &orig)
Assignment operator.
Definition: xmlscanner.hpp:1311
Base class for structures that can throw exceptions for non recoverable errors.
Definition: exception.hpp:20
Definition: xmlscanner.hpp:258
One state in the state machine.
Definition: xmlscanner.hpp:34
bool operator!=(const iterator &iter) const
Compare to check for inequality.
Definition: xmlscanner.hpp:1338
Definition: xmlscanner.hpp:260
ControlCharacter control()
Get the control character representation of the current character.
Definition: textscanner.hpp:218
ScannerStatemachine & other(int stateIdx)
See ScannerStatemachine::addOtherTransition(int)
Definition: xmlscanner.hpp:163
std::size_t getTokenPosition() const
Get the current token position.
Definition: xmlscanner.hpp:1043
bool operator==(const iterator &iter) const
Compare to check for equality.
Definition: xmlscanner.hpp:1335
invalid state definition in automaton. Internal textwolf error
Definition: exception.hpp:32
Type traits.
IsDQStringCharMap()
Definition: xmlscanner.hpp:410
ScannerStatemachine & action(int aa, int arg=0)
See ScannerStatemachine::addAction(int,int)
Definition: xmlscanner.hpp:157
Definition: char.hpp:95
XMLScanner(const InputCharSet &p_charset, const InputIterator &p_src)
Constructor.
Definition: xmlscanner.hpp:1004
Definition: xmlscanner.hpp:258
Definition: xmlscanner.hpp:259
Definition: char.hpp:92
expected mandatory end of line (after XML header)
Definition: xmlscanner.hpp:223
const Iterator & getIterator() const
Get the iterator pointing to the current source position.
Definition: textscanner.hpp:177
Definition: char.hpp:89
ScannerStatemachine & operator()(ControlCharacter inputchr)
See ScannerStatemachine::addTransition(ControlCharacter)
Definition: xmlscanner.hpp:155
IsContentCharMap()
Definition: xmlscanner.hpp:390
Definition: xmlscanner.hpp:258
IsWordCharMap()
Definition: xmlscanner.hpp:380
Definition: xmlscanner.hpp:259
Definition: xmlscanner.hpp:258
Definition: xmlscanner.hpp:258
std::input_iterator_tag iterator_category
Definition: xmlscanner.hpp:1247
static const char * getElementTypeName(ElementType ee)
Get the XML element type as string.
Definition: xmlscanner.hpp:199
Definition: xmlscanner.hpp:259
Definition: xmlscanner.hpp:259
[3] tag attribute name in the XML header
Definition: xmlscanner.hpp:178
int arg
action argument
Definition: xmlscanner.hpp:44
ScannerStatemachine::Element * getState()
Get the current XML scanner state machine state.
Definition: xmlscanner.hpp:1065
Direct copy of a character from input to output without encoding/decoding it.
no error, everything is OK
Definition: xmlscanner.hpp:209
[10] open tag (e.g. "bla" for "<bla...")
Definition: xmlscanner.hpp:185
[13] content element string (separated by spaces or end of line)
Definition: xmlscanner.hpp:188
Definition: xmlscanner.hpp:257
void assign(const iterator &orig)
Assign an iterator to another.
Definition: xmlscanner.hpp:1281
Definition: xmlscanner.hpp:289
iterator(ThisXMLScanner &p_input, bool doSkipToFirst=true)
Constructor.
Definition: xmlscanner.hpp:1295
uint32_t UChar
Unicode character type.
Definition: char.hpp:37
unexpected end of input stream
Definition: xmlscanner.hpp:222
XMLScanner()
Default constructor.
Definition: xmlscanner.hpp:1013
Definition: char.hpp:82
Element value_type
Definition: xmlscanner.hpp:1242
STMAction
Enumeration of actions in the XML scanner state machine.
Definition: xmlscanner.hpp:287
Element(const Element &orig)
Copy constructor.
Definition: xmlscanner.hpp:1239
Error
Enumeration of XML scanner error codes.
Definition: xmlscanner.hpp:207
int missError
error code in case of an event that does not match and there is no fallback
Definition: xmlscanner.hpp:37
XMLScanner(const InputCharSet &p_charset, const InputIterator &p_src, const EntityMap &p_entityMap)
Constructor.
Definition: xmlscanner.hpp:998
Defines the set of characters belonging to a double quoted string.
Definition: xmlscanner.hpp:408
InputCharSet_ InputCharSet
Definition: xmlscanner.hpp:480
Definition: xmlscanner.hpp:289
Definition: xmlscanner.hpp:257
Definition: xmlscanner.hpp:257
Definition: xmlscanner.hpp:257
Definition: xmlscanner.hpp:257
ScannerStatemachine & operator()(ControlCharacter i1, ControlCharacter i2, ControlCharacter i3, int ns)
See ScannerStatemachine::addTransition(ControlCharacter,int)
Definition: xmlscanner.hpp:153
Element(const End &)
Constructor.
Definition: xmlscanner.hpp:1236
Element()
Constructor.
Definition: xmlscanner.hpp:51
Definition: xmlscanner.hpp:289
Iterator element visited.
Definition: xmlscanner.hpp:1207
Definition of unicode characters.
int fallbackState
state transition if the event does not match (it belongs to the next state = fallbackState) ...
Definition: xmlscanner.hpp:36
textwolf exception class
Definition: exception.hpp:48
IsTagCharMap()
Definition: xmlscanner.hpp:369
iterator()
Constructor.
Definition: xmlscanner.hpp:1308
error in document attribute or entity definition
Definition: xmlscanner.hpp:210
Defines the set of characters belonging to a single quoted string.
Definition: xmlscanner.hpp:398
XML scanner base class for things common for all XML scanners.
Definition: xmlscanner.hpp:168
[9] tag attribute value (e.g. "5" in <person id='5'>)
Definition: xmlscanner.hpp:184
Definition: char.hpp:93
const InputIterator & getIterator() const
Get the iterator pointing to the current source position.
Definition: xmlscanner.hpp:1083
Action action
action executed after entering this state
Definition: xmlscanner.hpp:46
iterator(const End &et)
Constructor.
Definition: xmlscanner.hpp:1306
static const char * getErrorString(Error ee)
Get the error code as string.
Definition: xmlscanner.hpp:230
Definition: xmlscanner.hpp:260
Definition: char.hpp:84
Definition: char.hpp:85
Definition: xmlscanner.hpp:258
Element & reference
Definition: xmlscanner.hpp:1246
Defines the set of tag characters.
Definition: xmlscanner.hpp:367
Definition: xmlscanner.hpp:257
unsigned char ascii()
Get the ASCII character representation of the current character.
Definition: textscanner.hpp:227
static const char * getActionString(STMAction a)
Get the scanner state machine action as string.
Definition: xmlscanner.hpp:296
ElementType type() const
Type of the current element.
Definition: xmlscanner.hpp:1226
Interface that describes what a character set encoding implementation has to define to be used as cha...
expected CDATA tag definition
Definition: xmlscanner.hpp:220
ElementType nextItem(unsigned short mask=0xFFFF)
Scan the next XML element.
Definition: xmlscanner.hpp:1097
[6] document attribute value in a DOCTYPE or ENTITY definition
Definition: xmlscanner.hpp:181
ScannerStatemachine & operator()(ControlCharacter inputchr, int ns)
See ScannerStatemachine::addTransition(ControlCharacter,int)
Definition: xmlscanner.hpp:149
[11] close tag (e.g. "bla" for "</bla>")
Definition: xmlscanner.hpp:186
[1] XML scanning error reported
Definition: xmlscanner.hpp:176
Definition: char.hpp:96
Definition: xmlscanner.hpp:259
attribute string in XML not terminated on the same line
Definition: xmlscanner.hpp:215
Definition: xmlscanner.hpp:260
expected second '-' after '<!-' to start an XML comment as '<!-- ... -->'
Definition: xmlscanner.hpp:224
[5] end of XML header event (after parsing '?>')
Definition: xmlscanner.hpp:180
XML scanner automaton definition check failed. Labels of states must be equal to their indices...
Definition: exception.hpp:28
Definition: xmlscanner.hpp:258
ScannerStatemachine & miss(int ee)
See ScannerStatemachine::addMiss(int)
Definition: xmlscanner.hpp:159
std::size_t getPosition() const
Get the current source iterator position.
Definition: textscanner.hpp:154
Definition: char.hpp:97
Definition: xmlscanner.hpp:289
Definition: char.hpp:88
iterator begin(bool doSkipToFirst=true)
Get begin iterator.
Definition: xmlscanner.hpp:1344
const char * error() const
Return the current error.
Definition: xmlscanner.hpp:1220
Definition: xmlscanner.hpp:289
int op
action operand
Definition: xmlscanner.hpp:43
bool valid() const
Check that the element neither marks the end of the document nor reports an error.
Definition: xmlscanner.hpp:1217
std::map< const char *, UChar > EntityMap
Definition: xmlscanner.hpp:487
const Element & operator*() const
Element dereference operator.
Definition: xmlscanner.hpp:1317
XML scanner template that adds the functionality to the statemachine base definition.
Definition: xmlscanner.hpp:431
Definition: xmlscanner.hpp:258
CharMap< bool, false, NofControlCharacter > IsTokenCharMap
Forms a set of characters by assigning (true/false) to the whole domain.
Definition: xmlscanner.hpp:363
std::size_t getPosition() const
Get the current source iterator position.
Definition: xmlscanner.hpp:1038
Definition: char.hpp:81
iterator end()
Get the pointer to the end of content.
Definition: xmlscanner.hpp:1350
Defines the set of content word characters (for tokenization)
Definition: xmlscanner.hpp:378
iterator operator++(int)
Postincrement.
Definition: xmlscanner.hpp:1331
std::size_t getItemSize() const
Get the size of the current parsed XML element in bytes.
Definition: xmlscanner.hpp:1054
maximum number of states (fixed allocated array for state machine)
Definition: xmlscanner.hpp:30
Definition: xmlscanner.hpp:257
XMLScanner(const XMLScanner &o)
Copy constructor.
Definition: xmlscanner.hpp:1019
memory reserved for statically allocated table or memory block is too small. Increase the size of mem...
Definition: exception.hpp:27
Definition of exceptions with containing error codes thrown by textwolf.
const char * content() const
Value of the current element.
Definition: xmlscanner.hpp:1229
[14] end of document
Definition: xmlscanner.hpp:189
std::size_t size() const
Size of the value of the current element in bytes.
Definition: xmlscanner.hpp:1232
Definition: xmlscanner.hpp:257
ControlCharacter
Enumeration of control characters needed as events for XML scanner statemachine.
Definition: char.hpp:78
UChar chr()
Get the unicode representation of the current character.
Definition: textscanner.hpp:161
Definition: xmlscanner.hpp:289
Definition: xmlscanner.hpp:257
Definition: xmlscanner.hpp:258
internal error (textwolf implementation error)
Definition: xmlscanner.hpp:221
Definition: xmlscanner.hpp:260
expected tag attribute
Definition: xmlscanner.hpp:219
Definition: xmlscanner.hpp:258
const OutputBuffer & getItem() const
Get the current parsed XML element, if it was not masked out, see nextItem(unsigned short) ...
Definition: xmlscanner.hpp:1058
XMLScanner(const InputIterator &p_src)
Constructor.
Definition: xmlscanner.hpp:991
Definition: xmlscanner.hpp:260
Definition: xmlscanner.hpp:257
Definition: char.hpp:83
Definition: xmlscanner.hpp:259
const char * name() const
Type of the current element as string.
Definition: xmlscanner.hpp:1223
OutputBuffer_ OutputBuffer
Definition: xmlscanner.hpp:488
Statemachine()
Constructor (defines the state machine completely)
Definition: xmlscanner.hpp:307
Definition: xmlscanner.hpp:258
Definition: xmlscanner.hpp:289
Definition: xmlscanner.hpp:290
Definition: char.hpp:90
number of XML element types defined
Definition: xmlscanner.hpp:193
Character map for fast typing of a character byte.
Definition: char.hpp:50
Element * pointer
Definition: xmlscanner.hpp:1245
[2] open XML header tag
Definition: xmlscanner.hpp:177
parameter check (for control character) in automaton definition failed. Internal textwolf error ...
Definition: exception.hpp:30
Class to build up the XML element scanner state machine in a descriptive way.
Definition: xmlscanner.hpp:25
parameter check (for state) in automaton definition failed. Internal textwolf error ...
Definition: exception.hpp:29
Definition: xmlscanner.hpp:258
ScannerStatemachine()
Constructor.
Definition: xmlscanner.hpp:144
const Element * operator->() const
Element dereference operator.
Definition: xmlscanner.hpp:1322
Definition: xmlscanner.hpp:257
Definition of action fired by the state machine.
Definition: xmlscanner.hpp:41
Definition: char.hpp:87
static const char * getStateString(STMState s)
Get the scanner state machine state as string.
Definition: xmlscanner.hpp:266
unexpected end of text in the middle of the XML definition
Definition: xmlscanner.hpp:213
Definition: traits.hpp:21
XML scanner state machine implementation.
Definition: xmlscanner.hpp:304
XMLScanner< InputIterator, InputCharSet_, OutputCharSet_, OutputBuffer_ > ThisXMLScanner
Definition: xmlscanner.hpp:486
OutputCharSet_ OutputCharSet
Definition: xmlscanner.hpp:481
iterator(const iterator &orig)
Copy constructor.
Definition: xmlscanner.hpp:1288
signed char next[NofControlCharacter]
follow state fired by an event (control character type parsed)
Definition: xmlscanner.hpp:48
TextScanner< InputIterator, InputCharSet_ > InputReader
Definition: xmlscanner.hpp:482
unsigned char nofnext
number of follow states defined
Definition: xmlscanner.hpp:47
Definition: xmlscanner.hpp:257
Defines the set of content token characters.
Definition: xmlscanner.hpp:388
Definition: xmlscanner.hpp:257
Implementation of iterator for character-wise parsing of input.
Definition: xmlscanner.hpp:260
input iterator for iterating on the output of an XML scanner
Definition: xmlscanner.hpp:1202
Definition: xmlscanner.hpp:257
a specific string expected as token in XML but does not match
Definition: xmlscanner.hpp:214
expected an <?xml tag in this state
Definition: xmlscanner.hpp:212
ScannerStatemachine & operator[](int stateIdx)
See ScannerStatemachine::newState(int)
Definition: xmlscanner.hpp:147
Definition: xmlscanner.hpp:258
void copychar(CharSet &output_, Buffer &buf_)
Definition: textscanner.hpp:192
IsSQStringCharMap()
Definition: xmlscanner.hpp:400
Definition: xmlscanner.hpp:259
[8] tag attribute name (e.g. "id" in <person id='5'>)
Definition: xmlscanner.hpp:183
[7] end of a document attribute definition <! .. !>
Definition: xmlscanner.hpp:182
[12] immediate close tag (e.g. "bla" for "<bla />")
Definition: xmlscanner.hpp:187
Definition: char.hpp:99