textwolf  0.2
charset_utf16.hpp
Go to the documentation of this file.
1 /*
2  * Copyright (c) 2014 Patrick P. Frey
3  *
4  * This Source Code Form is subject to the terms of the Mozilla Public
5  * License, v. 2.0. If a copy of the MPL was not distributed with this
6  * file, You can obtain one at http://mozilla.org/MPL/2.0/.
7  */
10 
11 #ifndef __TEXTWOLF_CHARSET_UTF16_HPP__
12 #define __TEXTWOLF_CHARSET_UTF16_HPP__
13 #include "textwolf/char.hpp"
15 #include "textwolf/exception.hpp"
16 #include <cstddef>
17 
18 namespace textwolf {
19 namespace charset {
20 
31 template <int encoding=ByteOrder::BE>
32 class UTF16
33 {
34 private:
35  enum
36  {
37  LSB=(encoding==ByteOrder::BE), //< least significant byte index (0 or 1)
38  MSB=(encoding==ByteOrder::LE), //< most significant byte index (0 or 1)
39  Print1shift=(encoding==ByteOrder::BE)?8:0, //< value to shift with to get the 1st character to print
40  Print2shift=(encoding==ByteOrder::LE)?8:0 //< value to shift with to get the 2nd character to print
41  };
42 
43 public:
44  enum
45  {
46  MaxChar=0x10FFFFU //< maximum character in alphabet
47  };
48 
49 public:
51  template <class Iterator>
52  static inline void fetchbytes( char* buf, unsigned int& bufpos, Iterator& itr)
53  {
54  if (bufpos<2)
55  {
56  if (bufpos<1)
57  {
58  buf[0] = *itr;
59  ++itr;
60  ++bufpos;
61  }
62  buf[1] = *itr;
63  ++itr;
64  ++bufpos;
65  }
66  }
67 
72  template <class Iterator>
73  static inline unsigned int size( char* buf, unsigned int& bufpos, Iterator& itr)
74  {
75  fetchbytes( buf, bufpos, itr);
76 
77  UChar rt = (unsigned char)buf[ MSB];
78  if ((rt - 0xD8) > 0x03)
79  {
80  return 2;
81  }
82  else
83  {
84  return 4;
85  }
86  }
87 
89  template <class Iterator>
90  static inline void skip( char* buf, unsigned int& bufpos, Iterator& itr)
91  {
92  unsigned int bufsize = size( buf, bufpos, itr);
93  for (;bufpos < bufsize; ++bufpos)
94  {
95  ++itr;
96  }
97  }
98 
100  template <class Iterator>
101  static inline signed char asciichar( char* buf, unsigned int& bufpos, Iterator& itr)
102  {
103  UChar ch = value_impl( buf, bufpos, itr);
104  return (ch > 127)?-1:(char)ch;
105  }
106 
108  template <class Iterator>
109  static UChar value_impl( char* buf, unsigned int& bufpos, Iterator& itr)
110  {
111  unsigned int bufsize = size( buf, bufpos, itr);
112 
113  UChar rt = (unsigned char)buf[ MSB];
114  rt = (rt << 8) + (unsigned char)buf[ LSB];
115 
116  if (bufsize == 4)
117  {
118  // 2 teilig
119  while (bufpos < bufsize)
120  {
121  buf[bufpos] = *itr;
122  ++itr;
123  ++bufpos;
124  }
125  rt -= 0xD800;
126  rt *= 0x400;
127  unsigned short lo = (unsigned char)buf[ 2+MSB];
128  if ((lo - 0xDC) > 0x03) return 0xFFFF;
129  lo = (lo << 8) + (unsigned char)buf[ 2+LSB];
130  return rt + lo - 0xDC00 + 0x010000;
131  }
132  return rt;
133  }
134 
135  template <class Iterator>
136  inline UChar value( char* buf, unsigned int& bufpos, Iterator& itr) const
137  {
138  return value_impl( buf, bufpos, itr);
139  }
140 
142  template <class Buffer_>
143  void print( UChar ch, Buffer_& buf) const
144  {
145  if (ch <= 0xFFFF)
146  {
147  if ((ch - 0xD800) < 0x400)
148  {
149  //... reserved for encoding of characters in range [0xFFFF..0x10FFFF]
150  }
151  else
152  {
153  buf.push_back( (char)(unsigned char)((ch >> Print1shift) & 0xFF));
154  buf.push_back( (char)(unsigned char)((ch >> Print2shift) & 0xFF));
155  return;
156  }
157  }
158  else if (ch <= 0x10FFFF)
159  {
160  ch -= 0x10000;
161  unsigned short hi = (unsigned short )((ch / 0x400) + 0xD800);
162  unsigned short lo = (unsigned short )((ch % 0x400) + 0xDC00);
163  buf.push_back( (char)(unsigned char)((hi >> Print1shift) & 0xFF));
164  buf.push_back( (char)(unsigned char)((hi >> Print2shift) & 0xFF));
165  buf.push_back( (char)(unsigned char)((lo >> Print1shift) & 0xFF));
166  buf.push_back( (char)(unsigned char)((lo >> Print2shift) & 0xFF));
167  return;
168  }
169  char tb[ 32];
170  char* cc = tb;
171  Encoder::encode( ch, tb, sizeof(tb));
172  while (*cc)
173  {
174  buf.push_back( (char)(unsigned char)(((UChar)*cc >> Print1shift) & 0xFF));
175  buf.push_back( (char)(unsigned char)(((UChar)*cc >> Print2shift) & 0xFF));
176  ++cc;
177  }
178  }
179 
181  static inline bool is_equal( const UTF16&, const UTF16&)
182  {
183  return true;
184  }
185 };
186 
189 struct UTF16LE :public UTF16<ByteOrder::LE> {};
192 struct UTF16BE :public UTF16<ByteOrder::BE> {};
193 
194 }//namespace
195 }//namespace
196 #endif
197 
UTF-16 little endian character set encoding.
Definition: charset_utf16.hpp:189
static void fetchbytes(char *buf, unsigned int &bufpos, Iterator &itr)
See template<class Iterator>Interface::fetchbytes(char*,unsigned int&,Iterator&)
Definition: charset_utf16.hpp:52
static UChar value_impl(char *buf, unsigned int &bufpos, Iterator &itr)
See template<class Iterator>Interface::value(char*,unsigned int&,Iterator&)
Definition: charset_utf16.hpp:109
static bool encode(UChar chr, char *bufptr, std::size_t bufsize)
Write the character 'chr' in encoded form as nul-terminated string to a buffer.
Definition: charset_interface.hpp:29
Definition: charset_interface.hpp:119
static void skip(char *buf, unsigned int &bufpos, Iterator &itr)
See template<class Iterator>Interface::skip(char*,unsigned int&,Iterator&)
Definition: charset_utf16.hpp:90
Definition: charset_interface.hpp:120
uint32_t UChar
Unicode character type.
Definition: char.hpp:37
Character set UTF16 (little/big endian)
Definition: charset_utf16.hpp:32
Definition of unicode characters.
static unsigned int size(char *buf, unsigned int &bufpos, Iterator &itr)
Get the size of the current character in bytes (variable length encoding)
Definition: charset_utf16.hpp:73
Interface that describes what a character set encoding implementation has to define to be used as cha...
static bool is_equal(const UTF16 &, const UTF16 &)
See template<class Buffer>Interface::is_equal( const Interface&, const Interface&) ...
Definition: charset_utf16.hpp:181
UChar value(char *buf, unsigned int &bufpos, Iterator &itr) const
Definition: charset_utf16.hpp:136
Definition of exceptions with containing error codes thrown by textwolf.
Definition: charset_utf16.hpp:46
void print(UChar ch, Buffer_ &buf) const
See template<class Buffer>Interface::print(UChar,Buffer&)
Definition: charset_utf16.hpp:143
UTF-16 big endian character set encoding.
Definition: charset_utf16.hpp:192
static signed char asciichar(char *buf, unsigned int &bufpos, Iterator &itr)
See template<class Iterator>Interface::asciichar(char*,unsigned int&,Iterator&)
Definition: charset_utf16.hpp:101