31 #ifndef UTF8_FOR_CPP_CORE_H_2675DCD0_9480_4c0c_B92A_CC14C027B731
32 #define UTF8_FOR_CPP_CORE_H_2675DCD0_9480_4c0c_B92A_CC14C027B731
34 #include <boost/cstdint.hpp>
// Re-export the Boost fixed-width unsigned integer types under their short
// names so the rest of this header can write uint8_t / uint16_t / uint32_t
// without the boost:: qualifier.
43 typedef boost::uint8_t uint8_t;
44 typedef boost::uint16_t uint16_t;
45 typedef boost::uint32_t uint32_t;
// Inclusive bounds of the two surrogate ranges. is_surrogate() treats the
// whole span LEAD_SURROGATE_MIN..TRAIL_SURROGATE_MAX as one interval.
const uint16_t LEAD_SURROGATE_MIN  = 0xd800u;
const uint16_t LEAD_SURROGATE_MAX  = 0xdbffu;
const uint16_t TRAIL_SURROGATE_MIN = 0xdc00u;
const uint16_t TRAIL_SURROGATE_MAX = 0xdfffu;

// Derived constants (0xd7c0 and 0xfca02400 respectively).
// NOTE(review): nothing in this part of the file uses them - presumably they
// belong to UTF-16 <-> UTF-32 conversion code elsewhere; confirm.
const uint16_t LEAD_OFFSET =
    static_cast<uint16_t>(LEAD_SURROGATE_MIN - (0x10000 >> 10));
// The arithmetic is unsigned and intentionally wraps modulo 2^32.
const uint32_t SURROGATE_OFFSET =
    0x10000u - (static_cast<uint32_t>(LEAD_SURROGATE_MIN) << 10) - TRAIL_SURROGATE_MIN;

// Largest value accepted by is_code_point_valid().
const uint32_t CODE_POINT_MAX = 0x0010ffffu;
// Reduce an octet-like value to its low eight bits.
//
// The AND with 0xff happens before the narrowing cast, so a negative
// (sign-extended) char argument produces the same 0..255 result as the
// equivalent unsigned bit pattern.
template <typename octet_type>
inline uint8_t mask8(octet_type oc)
{
    const uint8_t low_octet = static_cast<uint8_t>(0xff & oc);
    return low_octet;
}
// Reduce a 16-bit-unit-like value to its low sixteen bits.
//
// As with the 8-bit variant, the AND is applied before the narrowing cast so
// that sign extension of a negative argument cannot leak into the result.
template <typename u16_type>
inline uint16_t mask16(u16_type oc)
{
    const uint16_t low_word = static_cast<uint16_t>(0xffff & oc);
    return low_word;
}
// True when the two most significant bits of the octet are 1 and 0, i.e. the
// low eight bits of `oc` match the pattern 10xxxxxx.
//
// Only bits 7..6 are inspected; any higher bits of a wider argument, or the
// sign extension of a negative char, are masked away.
template <typename octet_type>
inline bool is_trail(octet_type oc)
{
    return (0xc0 & oc) == 0x80;
}
79 template <
typename u16>
80 inline bool is_surrogate(u16 cp)
82 return (cp >= LEAD_SURROGATE_MIN && cp <= TRAIL_SURROGATE_MAX);
85 template <
typename u32>
86 inline bool is_code_point_valid(u32 cp)
88 return (cp <= CODE_POINT_MAX && !is_surrogate(cp) && cp != 0xfffe && cp != 0xffff);
// Classifies the lead octet found at `lead_it` by its high-order bits.
// Only the octet under `lead_it` is read; the iterator is taken by value, so
// the caller's iterator is not advanced.
// The result type is the iterator's difference_type.
// NOTE(review): the first `if` of the chain and the statement executed by each
// branch are not visible in this excerpt - presumably each branch returns the
// sequence length in octets; confirm against the full source.
91 template <
typename octet_iterator>
92 inline typename std::iterator_traits<octet_iterator>::difference_type
93 sequence_length(octet_iterator lead_it)
// Normalise the octet to 0..255 so the shifts below are not affected by the
// signedness of the iterator's value type.
95 uint8_t lead = mask8(*lead_it);
// Top three bits are 110.
98 else if ((lead >> 5) == 0x6)
// Top four bits are 1110.
100 else if ((lead >> 4) == 0xe)
// Top five bits are 11110.
102 else if ((lead >> 3) == 0x1e)
// Result codes produced by validate_next(). Values are implicit (0..5) and
// their order is part of the interface.
enum utf_error {
    OK,                  // the value find_invalid() tests for to keep scanning
    NOT_ENOUGH_ROOM,     // fewer octets remain before `end` than the sequence length
    INVALID_LEAD,        // NOTE(review): return site not visible here - presumably a lead octet that starts no sequence
    INCOMPLETE_SEQUENCE, // returned on the paths guarded by the is_trail() checks
    OVERLONG_SEQUENCE,   // returned from the branches comparing the value against 0x800 / 0x10000
    INVALID_CODE_POINT   // is_code_point_valid() rejected the decoded value
};
// Validates the UTF-8 sequence that starts at `it`.
//   it         - in/out position. It must be dereferenceable on entry, because
//                its octet is read unconditionally. The visible error paths
//                move it backwards with std::advance and a negative count,
//                which requires at least a bidirectional iterator.
//   end        - end of the input; used with std::distance to check that
//                `length` octets are available.
//   code_point - may be null (the two-argument overload passes 0).
//                NOTE(review): the store through this pointer is not visible
//                in this excerpt - presumably it receives the decoded value on
//                success; confirm.
// Returns one of the utf_error codes.
// NOTE(review): several lines of this function (branch headers, braces and the
// success path) are missing from this excerpt, so the comments below describe
// only the statements that are visible.
110 template <
typename octet_iterator>
111 utf_error validate_next(octet_iterator& it, octet_iterator end, uint32_t* code_point)
// Seed the accumulator with the lead octet, masked to 0..255.
113 uint32_t cp = mask8(*it);
115 typedef typename std::iterator_traits<octet_iterator>::difference_type octet_difference_type;
// Length reported by sequence_length() for the lead octet.
116 octet_difference_type length = sequence_length(it);
127 return NOT_ENOUGH_ROOM;
// Fewer than `length` octets remain before `end`.
131 if (std::distance(it, end) < length)
132 return NOT_ENOUGH_ROOM;
// (cp << 6) & 0x7ff keeps the low five bits of the lead octet in bits 10..6;
// the next octet supplies bits 5..0. Presumably the two-octet case - the
// branch header is not visible.
140 if (is_trail(*(++it))) {
141 cp = ((cp << 6) & 0x7ff) + ((*it) & 0x3f);
145 return INCOMPLETE_SEQUENCE;
// (cp << 12) & 0xffff keeps the low four bits of the lead octet in bits 15..12;
// the second octet contributes bits 11..6. Presumably the three-octet case.
149 if (is_trail(*(++it))) {
150 cp = ((cp << 12) & 0xffff) + ((mask8(*it) << 6) & 0xfff);
151 if (is_trail(*(++it))) {
// `it` was pre-incremented twice above; step it back by the same amount
// before reporting the error.
155 std::advance(it, -2);
156 return INCOMPLETE_SEQUENCE;
161 return INCOMPLETE_SEQUENCE;
// (cp << 18) & 0x1fffff keeps the low three bits of the lead octet in bits
// 20..18; the second octet contributes bits 17..12 and the third bits 11..6.
// Presumably the four-octet case.
165 if (is_trail(*(++it))) {
166 cp = ((cp << 18) & 0x1fffff) + ((mask8(*it) << 12) & 0x3ffff);
167 if (is_trail(*(++it))) {
168 cp += (mask8(*it) << 6) & 0xfff;
169 if (is_trail(*(++it))) {
// Undo the three pre-increments before reporting the error.
173 std::advance(it, -3);
174 return INCOMPLETE_SEQUENCE;
// Undo the two pre-increments before reporting the error.
178 std::advance(it, -2);
179 return INCOMPLETE_SEQUENCE;
184 return INCOMPLETE_SEQUENCE;
// Reject values that is_code_point_valid() does not accept.
// NOTE(review): the loop body is not visible - presumably it steps `it` back
// length - 1 positions, one at a time; confirm.
189 if (!is_code_point_valid(cp)) {
190 for (octet_difference_type i = 0; i < length - 1; ++i)
192 return INVALID_CODE_POINT;
// Overlong checks: each visible branch moves `it` back by length - 1 positions
// and reports OVERLONG_SEQUENCE. The inner conditions that pair a value range
// with a sequence length are not visible in this excerpt.
200 std::advance(it, -(length-1));
201 return OVERLONG_SEQUENCE;
204 else if (cp < 0x800) {
206 std::advance(it, -(length-1));
207 return OVERLONG_SEQUENCE;
210 else if (cp < 0x10000) {
212 std::advance(it, -(length-1));
213 return OVERLONG_SEQUENCE;
221 template <
typename octet_iterator>
222 inline utf_error validate_next(octet_iterator& it, octet_iterator end) {
223 return validate_next(it, end, 0);
// The three octets of the UTF-8 encoded byte order mark, in stream order.
// is_bom() compares its input against these, element by element.
const uint8_t bom[] = {
    0xef,
    0xbb,
    0xbf
};
// Walks [start, end) one sequence at a time using internal::validate_next.
// NOTE(review): the return statements are not visible in this excerpt.
// is_valid() treats a result equal to `end` as "nothing invalid found", so
// presumably this returns the position of the first sequence that fails
// validation, or `end` if none does; confirm against the full source.
233 template <
typename octet_iterator>
234 octet_iterator find_invalid(octet_iterator start, octet_iterator end)
// Work on a copy of `start`; validate_next() takes it by reference.
236 octet_iterator result = start;
237 while (result != end) {
238 internal::utf_error err_code = internal::validate_next(result, end);
// Any code other than OK ends the scan (the statement taken on that path is
// not visible here).
239 if (err_code != internal::OK)
// True when find_invalid() reports `end` for the range [start, end), i.e. it
// found nothing to flag before reaching the end of the input.
template <typename octet_iterator>
inline bool is_valid(octet_iterator start, octet_iterator end)
{
    octet_iterator first_flagged = find_invalid(start, end);
    return first_flagged == end;
}
// Compares the octets starting at `it` with bom[0], bom[1] and bom[2], in that
// order, each masked to 0..255 first.
// No end iterator is taken, so the caller must guarantee that every octet read
// here is dereferenceable. Because && short-circuits, reading stops at the
// first mismatch. `it` is taken by value, so the caller's iterator is not
// advanced.
// NOTE(review): the statement wrapping the && chain is not visible in this
// excerpt - presumably `return ( ... );`; confirm.
251 template <
typename octet_iterator>
252 inline bool is_bom (octet_iterator it)
255 (internal::mask8(*it++)) == bom[0] &&
256 (internal::mask8(*it++)) == bom[1] &&
257 (internal::mask8(*it)) == bom[2]
262 #endif // header guard