textwolf  0.2
textscanner.hpp
Go to the documentation of this file.
1 /*
2  * Copyright (c) 2014 Patrick P. Frey
3  *
4  * This Source Code Form is subject to the terms of the Mozilla Public
5  * License, v. 2.0. If a copy of the MPL was not distributed with this
6  * file, You can obtain one at http://mozilla.org/MPL/2.0/.
7  */
8 #ifndef __TEXTWOLF_TEXT_SCANNER_HPP__
9 #define __TEXTWOLF_TEXT_SCANNER_HPP__
10 
13 #include "textwolf/char.hpp"
15 #include "textwolf/exception.hpp"
19 #include <cstddef>
20 
21 namespace textwolf {
22 
/// \brief Iterator type dependent helpers used by the text scanner
/// \tparam Iterator source iterator type
/// \remark The primary template declares no members; getPosition() is provided by the specializations
template <typename Iterator>
struct Traits
{
};

/// \brief Traits specialization for plain character pointers
template <>
struct Traits<char*>
{
    /// \brief Get the offset of a position relative to the start of the source
    /// \param[in] start pointer to the start of the source
    /// \param[in] itr pointer to the current position
    /// \return the pointer difference (itr - start) converted to std::size_t
    static inline std::size_t getPosition( const char* start, char const* itr)
    {
        return static_cast<std::size_t>( itr - start);
    }
};
34 
35 template <>
37 {
38  static inline std::size_t getPosition( const SrcIterator&, const SrcIterator& itr)
39  {
40  return itr.position();
41  }
42 };
43 
44 template <>
46 {
47  static inline std::size_t getPosition( const IStreamIterator&, const IStreamIterator& itr)
48  {
49  return itr.position();
50  }
51 };
52 
53 template <>
55 {
56  static inline std::size_t getPosition( const CStringIterator&, const CStringIterator& itr)
57  {
58  return itr.pos();
59  }
60 };
61 
62 
67 template <typename Iterator, class CharSet>
69 {
70 private:
71  Iterator start;
72  Iterator input;
73  char buf[8];
74  UChar val;
75  signed char cur;
76  unsigned int state;
77  CharSet charset;
78 
79 public:
82  struct ControlCharMap :public CharMap<ControlCharacter,Undef>
83  {
85  {
86  (*this)
87  (0,EndOfText)
88  (1,31,Cntrl)
89  (5,Undef)
90  (33,127,Any)
91  (128,255,Undef)
92  ('\t',Space)
93  ('\r',Space)
94  ('\n',EndOfLine)
95  (' ',Space)
96  ('&',Amp)
97  ('<',Lt)
98  ('=',Equal)
99  ('>',Gt)
100  ('/',Slash)
101  ('-',Dash)
102  ('!',Exclam)
103  ('?',Questm)
104  ('\'',Sq)
105  ('\"',Dq)
106  ('[',Osb)
107  (']',Csb);
108  }
109  };
110 
112  TextScanner( const CharSet& charset_)
113  :val(0),cur(0),state(0),charset(charset_)
114  {
115  for (unsigned int ii=0; ii<sizeof(buf); ii++) buf[ii] = 0;
116  }
117 
118  TextScanner( const CharSet& charset_, const Iterator& p_iterator)
119  :start(p_iterator),input(p_iterator),val(0),cur(0),state(0),charset(charset_)
120  {
121  for (unsigned int ii=0; ii<sizeof(buf); ii++) buf[ii] = 0;
122  }
123 
124  TextScanner( const Iterator& p_iterator)
125  :start(p_iterator),input(p_iterator),val(0),cur(0),state(0),charset(CharSet())
126  {
127  for (unsigned int ii=0; ii<sizeof(buf); ii++) buf[ii] = 0;
128  }
129 
132  TextScanner( const TextScanner& orig)
133  :start(orig.start)
134  ,input(orig.input)
135  ,val(orig.val)
136  ,cur(orig.cur)
137  ,state(orig.state)
138  ,charset(orig.charset)
139  {
140  for (unsigned int ii=0; ii<sizeof(buf); ii++) buf[ii]=orig.buf[ii];
141  }
142 
145  template <class IteratorAssignment>
146  void setSource( const IteratorAssignment& a)
147  {
148  input = a;
149  start = a;
150  }
151 
154  std::size_t getPosition() const
155  {
156  return Traits<Iterator>::getPosition( start, input) - state;
157  }
158 
161  inline UChar chr()
162  {
163  if (val == 0)
164  {
165  val = charset.value( buf, state, input);
166  }
167  return val;
168  }
169 
171  inline void getcur()
172  {
173  cur = CharSet::asciichar( buf, state, input);
174  }
175 
177  inline const Iterator& getIterator() const
178  {
179  return input;
180  }
181 
183  inline Iterator& getIterator()
184  {
185  return input;
186  }
187 
191  template <class Buffer>
192  inline void copychar( CharSet& output_, Buffer& buf_)
193  {
195  if (CharSet::is_equal( charset, output_))
196  {
197  // ... if the character sets are equal and of the same subclass (code pages)
198  // then we do not decode/encode the character but copy it directly to the output
199  charset.fetchbytes( buf, state, input);
200 #ifdef __GNUC__
201 #if (__GNUC__ >= 5 && __GNUC_MINOR__ >= 0)
202  for (unsigned int ii=0; ii<8 && ii<state; ++ii) buf_.push_back(buf[ii]);
203 #else
204  for (unsigned int ii=0; ii<state; ++ii) buf_.push_back(buf[ii]);
205 #endif
206 #else
207  for (unsigned int ii=0; ii<state; ++ii) buf_.push_back(buf[ii]);
208 #endif
209  }
210  else
211  {
212  output_.print( chr(), buf_);
213  }
214  }
215 
219  {
220  static ControlCharMap controlCharMap;
221  getcur();
222  return controlCharMap[ (unsigned char)cur];
223  }
224 
227  inline unsigned char ascii()
228  {
229  getcur();
230  return cur>=0?(unsigned char)cur:0;
231  }
232 
235  inline TextScanner& skip()
236  {
237  CharSet::skip( buf, state, input);
238  state = 0;
239  cur = 0;
240  val = 0;
241  return *this;
242  }
243 
245  inline UChar operator*()
246  {
247  return chr();
248  }
249 
252  inline TextScanner& operator ++() {return skip();}
253 
256  inline TextScanner operator ++(int) {TextScanner tmp(*this); skip(); return tmp;}
257 };
258 
259 }//namespace
260 #endif
static std::size_t getPosition(const char *start, char const *itr)
Definition: textscanner.hpp:29
Definition: char.hpp:94
TextScanner(const CharSet &charset_)
Constructor.
Definition: textscanner.hpp:112
TextScanner & skip()
Skip to the next character of the source.
Definition: textscanner.hpp:235
Definition: char.hpp:80
void setSource(const IteratorAssignment &a)
Assign something to the iterator while keeping the state.
Definition: textscanner.hpp:146
Definition: char.hpp:91
PositionIndex position() const
Definition: istreamiterator.hpp:151
Input iterator as source for the XML scanner with the possibility of being fed chunk by chunk...
Definition: sourceiterator.hpp:25
ControlCharMap()
Definition: textscanner.hpp:84
Definition: char.hpp:86
PositionIndex position() const
Definition: sourceiterator.hpp:112
ControlCharacter control()
Get the control character representation of the current character.
Definition: textscanner.hpp:218
Definition of iterators for textwolf on an input stream class.
Definition: char.hpp:95
Definition: char.hpp:92
Map of ASCII characters to control character identifiers used in the XML scanner automaton.
Definition: textscanner.hpp:82
const Iterator & getIterator() const
Get the iterator pointing to the current source position.
Definition: textscanner.hpp:177
Definition: char.hpp:89
uint32_t UChar
Unicode character type.
Definition: char.hpp:37
Definition: char.hpp:82
TextScanner(const Iterator &p_iterator)
Definition: textscanner.hpp:124
UChar operator*()
see TextScanner::chr()
Definition: textscanner.hpp:245
Definition of unicode characters.
void getcur()
Fill the internal buffer with as many current character bytes as needed for reading the ASCII representation.
Definition: textscanner.hpp:171
textwolf byte source iterator template
static std::size_t getPosition(const IStreamIterator &, const IStreamIterator &itr)
Definition: textscanner.hpp:47
Definition: char.hpp:93
Definition: char.hpp:84
Definition: char.hpp:85
unsigned char ascii()
Get the ASCII character representation of the current character.
Definition: textscanner.hpp:227
Interface that describes what a character set encoding implementation has to define to be used as cha...
Definition: textscanner.hpp:24
Definition: char.hpp:96
std::size_t getPosition() const
Get the current source iterator position.
Definition: textscanner.hpp:154
Definition: char.hpp:97
Definition: char.hpp:88
Definition: char.hpp:81
Definition of exceptions with containing error codes thrown by textwolf.
ControlCharacter
Enumeration of control characters needed as events for XML scanner statemachine.
Definition: char.hpp:78
UChar chr()
Get the unicode representation of the current character.
Definition: textscanner.hpp:161
Reader for scanning the input character by character.
Definition: textscanner.hpp:68
Input iterator on a constant string returning null characters after EOF as required by textwolf scanners.
Definition: cstringiterator.hpp:23
Input iterator on an STL input stream.
Definition: istreamiterator.hpp:95
TextScanner(const CharSet &charset_, const Iterator &p_iterator)
Definition: textscanner.hpp:118
textwolf iterator on strings
Definition: char.hpp:83
TextScanner & operator++()
Preincrement: Skip to the next character of the source.
Definition: textscanner.hpp:252
TextScanner(const TextScanner &orig)
Copy constructor.
Definition: textscanner.hpp:132
unsigned int pos() const
Return current char position.
Definition: cstringiterator.hpp:76
Definition: char.hpp:90
Character map for fast typing of a character byte.
Definition: char.hpp:50
Definition: char.hpp:87
static std::size_t getPosition(const SrcIterator &, const SrcIterator &itr)
Definition: textscanner.hpp:38
Iterator & getIterator()
Get the iterator pointing to the current source position.
Definition: textscanner.hpp:183
static std::size_t getPosition(const CStringIterator &, const CStringIterator &itr)
Definition: textscanner.hpp:56
void copychar(CharSet &output_, Buffer &buf_)
Definition: textscanner.hpp:192