textwolf  0.2
xmlhdrparser.hpp
Go to the documentation of this file.
1 /*
2  * Copyright (c) 2014 Patrick P. Frey
3  *
4  * This Source Code Form is subject to the terms of the Mozilla Public
5  * License, v. 2.0. If a copy of the MPL was not distributed with this
6  * file, You can obtain one at http://mozilla.org/MPL/2.0/.
7  */
10 
11 #ifndef __TEXTWOLF_XML_HEADER_PARSER_HPP__
12 #define __TEXTWOLF_XML_HEADER_PARSER_HPP__
13 #include "textwolf/exception.hpp"
15 #include <cstdlib>
16 
19 namespace textwolf {
20 
25  :public throws_exception
26 {
27 public:
30  :m_state(Init)
31  ,m_attributetype(Encoding)
32  ,m_idx(0)
33  ,m_charsConsumed(0)
34  ,m_zeroCount(0){}
35 
39  :m_state(o.m_state)
40  ,m_attributetype(o.m_attributetype)
41  ,m_idx(o.m_idx)
42  ,m_charsConsumed(o.m_charsConsumed)
43  ,m_zeroCount(o.m_zeroCount)
44  ,m_item(o.m_item)
45  ,m_src(o.m_src){}
46 
47 
51  void putInput( const char* src_, std::size_t srcsize_)
52  {
53  m_src.append( src_, srcsize_);
54  }
55 
58  const std::string& consumedData() const
59  {
60  return m_src;
61  }
62 
65  bool parse()
66  {
67  unsigned char ch = nextChar();
68  for (;ch != 0; ch = nextChar())
69  {
70  switch (m_state)
71  {
72  case Init:
73  if (ch == '<')
74  {
75  m_state = ParseXmlOpen;
76  }
77  else if (ch <= 32)
78  {
79  continue;
80  }
81  else
82  {
83  setError( "expected open tag angle bracket '>'");
84  return false;
85  }
86  break;
87 
88  case ParseXmlOpen:
89  if (ch == '?')
90  {
91  m_state = ParseXmlHdr;
92  }
93  else if (ch <= 32)
94  {
95  break;
96  }
97  else if (((ch|32) >= 'a' && (ch|32) <= 'z') || ch == '_')
98  {
99  return true;
100  }
101  else
102  {
103  setError( "expected xml header question mark '?' after open tag angle bracket '<'");
104  return false;
105  }
106  break;
107 
108  case ParseXmlHdr:
109  if (ch <= 32 || ch == '?')
110  {
111  if (m_item != "xml")
112  {
113  setError( "expected '<?xml' as xml header start");
114  return false;
115  }
116  m_item.clear();
117  if (ch == '?') return true; /*...."<?xml?>"*/
118 
119  m_state = FindAttributeName;
120  }
121  else if (((ch|32) >= 'a' && (ch|32) <= 'z') || ch == '_')
122  {
123  m_item.push_back(ch);
124  continue;
125  }
126  else if (ch == '>')
127  {
128  setError( "unexpected close angle bracket '>' in xml header after '<?xml'");
129  return false;
130  }
131  else
132  {
133  setError( "expected '<?xml' as xml header start (invalid character)");
134  return false;
135  }
136  break;
137 
138  case FindAttributeName:
139  if (ch <= 32)
140  {
141  continue;
142  }
143  else if (ch == '>' || ch == '?')
144  {
145  if (ch == '>')
146  {
147  setError( "unexpected close angle bracket '>' in xml header (missing '?')");
148  return false;
149  }
150  return true;
151  }
152  else if (((ch|32) >= 'a' && (ch|32) <= 'z') || ch == '_')
153  {
154  m_item.push_back(ch);
155  m_state = ParseAttributeName;
156  }
157  else
158  {
159  setError( "invalid character in xml header attribute name");
160  return false;
161  }
162  break;
163  case ParseAttributeName:
164  if (ch <= 32 || ch == '=')
165  {
166  if (m_item == "encoding")
167  {
168  m_attributetype = Encoding;
169  }
170  else if (m_item == "version")
171  {
172  m_attributetype = Version;
173  }
174  else if (m_item == "standalone")
175  {
176  m_attributetype = Standalone;
177  }
178  else
179  {
180  setError( "unknown xml header attribute name");
181  return false;
182  }
183  m_item.clear();
184  if (ch == '=')
185  {
186  m_state = FindAttributeValue;
187  continue;
188  }
189  m_state = FindAttributeAssign;
190  }
191  else if (((ch|32) >= 'a' && (ch|32) <= 'z') || ch == '_')
192  {
193  m_item.push_back(ch);
194  continue;
195  }
196  else
197  {
198  setError( "invalid character in xml header attribute name");
199  return false;
200  }
201  break;
202  case FindAttributeAssign:
203  if (ch == '=')
204  {
205  m_state = FindAttributeValue;
206  }
207  else if (ch <= 32)
208  {
209  continue;
210  }
211  else
212  {
213  setError( "expected '=' after xml header attribute name");
214  return false;
215  }
216  break;
217  case FindAttributeValue:
218  if (ch == '"')
219  {
220  m_state = ParseAttributeValueDq;
221  continue;
222  }
223  else if (ch == '\'')
224  {
225  m_state = ParseAttributeValueSq;
226  continue;
227  }
228  else if (ch <= 32)
229  {
230  continue;
231  }
232  else
233  {
234  setError( "expected single or double quote string as xml header attribute value");
235  return false;
236  }
237  break;
238  case ParseAttributeValueSq:
239  if (ch == '\'')
240  {
241  switch (m_attributetype)
242  {
243  case Encoding:
244  m_encoding = m_item;
245  break;
246  case Version:
247  case Standalone:
248  break;
249  }
250  m_item.clear();
251  m_state = FindAttributeName;
252  continue;
253  }
254  else
255  {
256  m_item.push_back( ch);
257  }
258  break;
259  case ParseAttributeValueDq:
260  if (ch == '\"')
261  {
262  switch (m_attributetype)
263  {
264  case Encoding:
265  m_encoding = m_item;
266  break;
267  case Version:
268  case Standalone:
269  break;
270  }
271  m_item.clear();
272  m_state = FindAttributeName;
273  continue;
274  }
275  else
276  {
277  m_item.push_back( ch);
278  }
279  break;
280  }/*switch(..)*/
281  }/*for(;..;..)*/
282  return false;
283  }
284 
287  const char* lasterror() const
288  {
289  return m_lasterror.empty()?0:m_lasterror.c_str();
290  }
291 
294  const char* encoding() const
295  {
296  return m_encoding.empty()?0:m_encoding.c_str();
297  }
298 
301  std::size_t charsConsumed() const
302  {
303  return m_charsConsumed;
304  }
305 
307  void clear()
308  {
309  m_state = Init;
310  m_attributetype = Encoding;
311  m_idx = 0;
312  m_charsConsumed = 0;
313  m_zeroCount = 0;
314  m_item.clear();
315  m_src.clear();
316  m_encoding.clear();
317  m_lasterror.clear();
318  }
319 
320 private:
321  void setError( const std::string& m)
322  {
323  m_lasterror = m;
324  }
325 
326  unsigned char nextChar()
327  {
328  for (; m_zeroCount<4; m_zeroCount++)
329  {
330  if (m_idx >= m_src.size()) return 0;
331  unsigned char ch = m_src[m_idx];
332  ++m_idx;
333  if (ch != 0)
334  {
335  m_zeroCount = 0;
336  if (ch > 32)
337  {
338  ++m_charsConsumed;
339  }
340  return ch;
341  }
342  }
343  throw exception( IllegalXmlHeader);
344  }
345 
/// \brief States of the header parser state machine
/// \remark The numeric values are used as index into the name table of stateName
enum State
{
    Init = 0,                   ///< nothing but white space read yet
    ParseXmlOpen = 1,           ///< '<' read
    ParseXmlHdr = 2,            ///< "<?" read, collecting the header name
    FindAttributeName = 3,      ///< looking for the start of an attribute name or the header end
    ParseAttributeName = 4,     ///< collecting an attribute name
    FindAttributeAssign = 5,    ///< attribute name complete, looking for '='
    FindAttributeValue = 6,     ///< '=' read, looking for the opening quote of the value
    ParseAttributeValueSq = 7,  ///< collecting a single quoted attribute value
    ParseAttributeValueDq = 8   ///< collecting a double quoted attribute value
};
358 
/// \brief Attributes known in the XML header
enum AttributeType
{
    Encoding = 0,    ///< attribute 'encoding'
    Version = 1,     ///< attribute 'version'
    Standalone = 2   ///< attribute 'standalone'
};
365 
366  static const char* stateName( State i)
367  {
368  static const char* ar[] = {"Init","ParseXmlOpen","ParseXmlHdr","FindAttributeName","ParseAttributeName","FindAttributeAssign","FindAttributeValue","ParseAttributeValueSq","ParseAttributeValueDq"};
369  return ar[ (int)i];
370  }
371 
372 private:
373  State m_state; ///< current state of the parser state machine, evaluated and updated by parse()
374  AttributeType m_attributetype; ///< type of the attribute being parsed; set when its name is complete, evaluated when its value is complete
375  std::size_t m_idx; ///< index of the next byte of m_src to be read by nextChar()
376  std::size_t m_charsConsumed; ///< number of bytes with a value greater than 32 read by nextChar()
377  std::size_t m_zeroCount; ///< number of consecutive null bytes read; nextChar() throws when it reaches 4
378  std::string m_item; ///< buffer collecting the current token (header name, attribute name or attribute value)
379  std::string m_src; ///< all input added with putInput()
380  std::string m_encoding; ///< value of the 'encoding' attribute, empty if none has been parsed
381  std::string m_lasterror; ///< message stored by setError(), empty if none
382 };
383 
384 }//namespace
385 #endif
386 
std::size_t charsConsumed() const
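Get the number of characters consumed, counting only bytes with a value greater than 32 (white space, control characters and null bytes are not counted).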
Definition: xmlhdrparser.hpp:301
const char * encoding() const
Get the encoding specified as attribute in the header.
Definition: xmlhdrparser.hpp:294
Base class for structures that can throw exceptions for non recoverable errors.
Definition: exception.hpp:20
Illegal XML header (4 or more null bytes in a row). Usage error.
Definition: exception.hpp:39
Class for parsing the header to get the character set encoding.
Definition: xmlhdrparser.hpp:24
textwolf byte source iterator template
const char * lasterror() const
Get the last error that occurred.
Definition: xmlhdrparser.hpp:287
bool parse()
Call the first/next iteration of parsing the header.
Definition: xmlhdrparser.hpp:65
XmlHdrParser()
Constructor.
Definition: xmlhdrparser.hpp:29
Definition of exceptions with containing error codes thrown by textwolf.
XmlHdrParser(const XmlHdrParser &o)
Copy constructor.
Definition: xmlhdrparser.hpp:38
void putInput(const char *src_, std::size_t srcsize_)
Add another input chunk to process.
Definition: xmlhdrparser.hpp:51
const std::string & consumedData() const
Get the whole original data added with subsequent calls of putInput(const char*,std::size_t) ...
Definition: xmlhdrparser.hpp:58
void clear()
Clear the data, reset the state.
Definition: xmlhdrparser.hpp:307