textwolf  0.2
charset_utf8.hpp
Go to the documentation of this file.
1 /*
2  * Copyright (c) 2014 Patrick P. Frey
3  *
4  * This Source Code Form is subject to the terms of the Mozilla Public
5  * License, v. 2.0. If a copy of the MPL was not distributed with this
6  * file, You can obtain one at http://mozilla.org/MPL/2.0/.
7  */
10 
11 #ifndef __TEXTWOLF_CHARSET_UTF8_HPP__
12 #define __TEXTWOLF_CHARSET_UTF8_HPP__
13 #include "textwolf/char.hpp"
15 #include "textwolf/exception.hpp"
16 #include <cstddef>
17 
18 namespace textwolf {
19 namespace charset {
20 
23 struct UTF8
24 {
/// \brief Highest character code of this encoding (0x7FFFFFFF).
/// value() also returns this constant when it meets a malformed byte sequence.
26  enum {MaxChar=0x7FFFFFFFU};
/// \brief Byte bit patterns used for masking and marking UTF-8 bytes.
/// Each enumerator is named after the binary representation of its value
/// (for example B11000000 == 0xC0).
/// NOTE(review): listing lines 45-49 are not shown here although the cross
/// reference index cites definitions at those lines - confirm against the real header.
27  enum {
// Low-bit masks (payload bits)
28  B11111111=0xFF,
29  B01111111=0x7F,
30  B00111111=0x3F,
31  B00011111=0x1F,
32  B00001111=0x0F,
33  B00000111=0x07,
34  B00000011=0x03,
35  B00000001=0x01,
36  B00000000=0x00,
// High-bit patterns (marker bits)
37  B10000000=0x80,
38  B11000000=0xC0,
39  B11100000=0xE0,
40  B11110000=0xF0,
41  B11111000=0xF8,
42  B11111100=0xFC,
43  B11111110=0xFE,
44 
50  };
51 
/// \class CharLengthTab
/// \brief Table that maps the first UTF-8 character byte to the length of the character in bytes.
/// The base class is instantiated as CharMap<unsigned char, 0>; presumably 0 is the
/// value of bytes that get no entry - confirm in char.hpp.
/// NOTE(review): the constructor header (listing line 56) and the table rows of
/// listing lines 59-65 are missing from this listing - restore them from the real header.
54  struct CharLengthTab :public CharMap<unsigned char, 0>
55  {
57  {
58  (*this)
// Last row: presumably assigns length 8 to the byte range [0xFF,0xFF]
// through CharMap's call operator - confirm in char.hpp.
66  (B11111111,B11111111,8);
67  }
68  };
69 
74  template <class Iterator>
75  static inline unsigned int size( char* buf, unsigned int& bufpos, Iterator& itr)
76  {
77  static CharLengthTab charLengthTab;
78  if (bufpos==0)
79  {
80  buf[0] = *itr;
81  ++itr;
82  ++bufpos;
83  }
84  return charLengthTab[ (unsigned char)buf[ 0]];
85  }
86 
/// \brief Skip to the start of the next character without decoding the current one
/// \param [in,out] buf buffer for the bytes of the current character
/// \param [in,out] bufpos number of bytes already consumed; equals the character size on return
/// \param [in,out] itr input iterator; advanced over the remaining bytes of the character
template <class Iterator>
static inline void skip( char* buf, unsigned int& bufpos, Iterator& itr)
{
	const unsigned int charLength = size( buf, bufpos, itr);
	while (bufpos < charLength)
	{
		// The skipped bytes are not copied into buf, only counted
		++itr;
		++bufpos;
	}
}
97 
/// \brief Get the current character as ASCII, if it is one
/// \param [in,out] buf buffer for the bytes of the current character
/// \param [in,out] bufpos number of bytes already stored in buf
/// \param [in,out] itr input iterator; advanced by one only if the first byte had to be read
/// \return the first byte if its value is at most 127, -1 otherwise
template <class Iterator>
static inline signed char asciichar( char* buf, unsigned int& bufpos, Iterator& itr)
{
	if (bufpos == 0)
	{
		// Nothing buffered yet: take over the first byte from the input
		buf[0] = *itr;
		++itr;
		++bufpos;
	}
	const unsigned char firstByte = static_cast<unsigned char>( buf[0]);
	if (firstByte > 127)
	{
		// High bit set: not a 7-bit ASCII character
		return -1;
	}
	return buf[0];
}
110 
/// \brief Load all bytes of the current character into the buffer
/// \param [in,out] buf buffer that receives the bytes of the current character
/// \param [in,out] bufpos number of bytes already stored in buf; equals the character size on return
/// \param [in,out] itr input iterator; advanced by one for every byte read
template <class Iterator>
static inline void fetchbytes( char* buf, unsigned int& bufpos, Iterator& itr)
{
	// size() reads and buffers the first byte itself when bufpos is still 0
	const unsigned int charLength = size( buf, bufpos, itr);
	while (bufpos < charLength)
	{
		buf[ bufpos] = *itr;
		++itr;
		++bufpos;
	}
}
128 
130  template <class Iterator>
131  UChar value( char* buf, unsigned int& bufpos, Iterator& itr) const
132  {
133  fetchbytes( buf, bufpos, itr);
134 
135  UChar res = (unsigned char)buf[0];
136  if (res > 127)
137  {
138  int gg = bufpos-2;
139  if (gg < 0) return MaxChar;
140 
141  res = ((unsigned char)buf[0])&(B00011111>>gg);
142  for (int ii=0; ii<=gg; ii++)
143  {
144  unsigned char xx = (unsigned char)buf[ii+1];
145  res = (res<<6) | (xx & B00111111);
146  if ((unsigned char)(xx & B11000000) != B10000000)
147  {
148  return MaxChar;
149  }
150  }
151  }
152  return res;
153  }
154 
156  template <class Buffer_>
157  void print( UChar chr, Buffer_& buf) const
158  {
159  unsigned int rt;
160  if (chr <= 127)
161  {
162  buf.push_back( (char)(unsigned char)chr);
163  return;
164  }
165  unsigned int pp,sf;
166  for (pp=1,sf=5; pp<5; pp++,sf+=5)
167  {
168  if (chr < (unsigned int)((1<<6)<<sf)) break;
169  }
170  rt = pp+1;
171  unsigned char HB = (unsigned char)(B11111111 << (8-rt));
172  unsigned char shf = (unsigned char)(pp*6);
173  unsigned int ii;
174  buf.push_back( (char)(((unsigned char)(chr >> shf) & (~HB >> 1)) | HB));
175  for (ii=1,shf-=6; ii<=pp; shf-=6,ii++)
176  {
177  buf.push_back( (char)(unsigned char) (((chr >> shf) & B00111111) | B10000000));
178  }
179  }
180 
182  static bool is_equal( const UTF8&, const UTF8&)
183  {
184  return true;
185  }
186 };
187 
188 }//namespace
189 }//namespace
190 #endif
191 
Definition: charset_utf8.hpp:38
Definition: charset_utf8.hpp:45
Definition: charset_utf8.hpp:26
static void skip(char *buf, unsigned int &bufpos, Iterator &itr)
See template<class Iterator>Interface::skip(char*,unsigned int&,Iterator&)
Definition: charset_utf8.hpp:89
Definition: charset_utf8.hpp:46
static void fetchbytes(char *buf, unsigned int &bufpos, Iterator &itr)
See template<class Iterator>Interface::fetch(char*,unsigned int&,Iterator&)
Definition: charset_utf8.hpp:113
Definition: charset_utf8.hpp:33
Definition: charset_utf8.hpp:48
Definition: charset_utf8.hpp:40
Table that maps the first UTF-8 character byte to the length of the character in bytes.
Definition: charset_utf8.hpp:54
uint32_t UChar
Unicode character type.
Definition: char.hpp:37
Definition: charset_utf8.hpp:47
Definition: charset_utf8.hpp:34
Definition: charset_utf8.hpp:28
Definition of unicode characters.
UChar value(char *buf, unsigned int &bufpos, Iterator &itr) const
See template<class Iterator>Interface::value(char*,unsigned int&,Iterator&)
Definition: charset_utf8.hpp:131
Definition: charset_utf8.hpp:32
static bool is_equal(const UTF8 &, const UTF8 &)
See template<class Buffer>Interface::is_equal( const Interface&, const Interface&) ...
Definition: charset_utf8.hpp:182
Interface that describes what a character set encoding implementation has to define to be used as cha...
Definition: charset_utf8.hpp:42
Definition: charset_utf8.hpp:43
Definition: charset_utf8.hpp:29
void print(UChar chr, Buffer_ &buf) const
See template<class Buffer>Interface::print(UChar,Buffer&)
Definition: charset_utf8.hpp:157
Definition: charset_utf8.hpp:39
Definition of exceptions with containing error codes thrown by textwolf.
Definition: charset_utf8.hpp:41
static unsigned int size(char *buf, unsigned int &bufpos, Iterator &itr)
Get the size of the current character in bytes (variable length encoding)
Definition: charset_utf8.hpp:75
Character map for fast typing of a character byte.
Definition: char.hpp:50
Definition: charset_utf8.hpp:49
Definition: charset_utf8.hpp:31
static signed char asciichar(char *buf, unsigned int &bufpos, Iterator &itr)
See template<class Iterator>Interface::asciichar(char*,unsigned int&,Iterator&)
Definition: charset_utf8.hpp:100
Definition: charset_utf8.hpp:35
CharLengthTab()
Definition: charset_utf8.hpp:56
Definition: charset_utf8.hpp:37
Definition: charset_utf8.hpp:30
character set encoding UTF-8
Definition: charset_utf8.hpp:23
Definition: charset_utf8.hpp:36