Add a new exception class and a new set of policies.
Implements append(). This is an intermediate commit.
This commit is contained in:
parent
9713f6125c
commit
aae1711e99
5 changed files with 128 additions and 76 deletions
|
@ -56,7 +56,7 @@ namespace utf8 {
|
||||||
return static_cast<uint8_t>(0xff & oc);
|
return static_cast<uint8_t>(0xff & oc);
|
||||||
}
|
}
|
||||||
template<>
|
template<>
|
||||||
inline uint8_ mask8<uint8_t>(uint8_t oc) {
|
inline uint8_t mask8<uint8_t>(uint8_t oc) {
|
||||||
return oc;
|
return oc;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -193,21 +193,6 @@ namespace utf8 {
|
||||||
octet_value_type faulty_part;
|
octet_value_type faulty_part;
|
||||||
};
|
};
|
||||||
|
|
||||||
// Unicode constants
|
|
||||||
enum {
|
|
||||||
// Leading (high) surrogates: 0xd800 - 0xdbff
|
|
||||||
// Trailing (low) surrogates: 0xdc00 - 0xdfff
|
|
||||||
LEAD_SURROGATE_MIN = 0xd800u,
|
|
||||||
LEAD_SURROGATE_MAX = 0xdbffu,
|
|
||||||
TRAIL_SURROGATE_MIN = 0xdc00u,
|
|
||||||
TRAIL_SURROGATE_MAX = 0xdfffu,
|
|
||||||
LEAD_OFFSET = LEAD_SURROGATE_MIN - (0x10000 >> 10),
|
|
||||||
SURROGATE_OFFSET = 0x10000u - (LEAD_SURROGATE_MIN << 10) - TRAIL_SURROGATE_MIN,
|
|
||||||
|
|
||||||
// Maximum valid value for a Unicode code point
|
|
||||||
CODE_POINT_MAX = 0x0010ffffu
|
|
||||||
};
|
|
||||||
|
|
||||||
template <typename u16>
|
template <typename u16>
|
||||||
inline bool is_lead_surrogate(u16 cp) {
|
inline bool is_lead_surrogate(u16 cp) {
|
||||||
return (cp >= LEAD_SURROGATE_MIN && cp <= LEAD_SURROGATE_MAX);
|
return (cp >= LEAD_SURROGATE_MIN && cp <= LEAD_SURROGATE_MAX);
|
||||||
|
@ -220,12 +205,12 @@ namespace utf8 {
|
||||||
|
|
||||||
template <typename u16>
|
template <typename u16>
|
||||||
inline bool is_surrogate(u16 cp) {
|
inline bool is_surrogate(u16 cp) {
|
||||||
return (cp >= LEAD_SURROGATE_MIN && cp <= TRAIL_SURROGATE_MAX);
|
return cp >= LEAD_SURROGATE_MIN && cp <= TRAIL_SURROGATE_MAX;
|
||||||
}
|
}
|
||||||
|
|
||||||
template <typename u32>
|
template <typename u32>
|
||||||
inline bool is_code_point_valid(u32 cp) {
|
inline bool is_code_point_valid(u32 cp) {
|
||||||
return (cp <= CODE_POINT_MAX && !utf8::internal::is_surrogate(cp));
|
return cp <= CODE_POINT_MAX && not utf8::internal::is_surrogate(cp);
|
||||||
}
|
}
|
||||||
|
|
||||||
template <typename difference_type, typename octet_type>
|
template <typename difference_type, typename octet_type>
|
||||||
|
@ -244,7 +229,7 @@ namespace utf8 {
|
||||||
}
|
}
|
||||||
|
|
||||||
template <typename octet_difference_type>
|
template <typename octet_difference_type>
|
||||||
inline bool is_overlong_sequence(utf8::code_point cp, octet_difference_type length) {
|
bool is_overlong_sequence(utf8::code_point cp, octet_difference_type length) {
|
||||||
if (cp < 0x80) {
|
if (cp < 0x80) {
|
||||||
if (length != 1)
|
if (length != 1)
|
||||||
return true;
|
return true;
|
||||||
|
@ -262,47 +247,45 @@ namespace utf8 {
|
||||||
}
|
}
|
||||||
} // namespace internal
|
} // namespace internal
|
||||||
|
|
||||||
// Byte order mark
|
|
||||||
const uint8_t bom[] = {0xef, 0xbb, 0xbf};
|
|
||||||
|
|
||||||
template <typename octet_iterator>
|
template <typename octet_iterator>
|
||||||
octet_iterator find_invalid(octet_iterator start, octet_iterator end)
|
octet_iterator find_invalid(octet_iterator start, octet_iterator end)
|
||||||
{
|
{
|
||||||
//octet_iterator result = start;
|
typedef typename std::iterator_traits<octet_iterator>::difference_type octet_difference_type;
|
||||||
//while (result != end) {
|
utf8::internal::SequenceReader<octet_iterator> seq;
|
||||||
// utf8::internal::utf_error err_code = utf8::internal::validate_next(result);
|
|
||||||
// if (err_code != internal::UTF8_OK)
|
while (start != end) {
|
||||||
// return result;
|
const octet_difference_type length = utf8::internal::sequence_length<octet_difference_type>(*start);
|
||||||
//}
|
const utf8::code_point cp = seq.get_sequence(start, length);
|
||||||
//return result;
|
if (seq.has_error())
|
||||||
//TODO: implement
|
return start;
|
||||||
|
++start;
|
||||||
|
}
|
||||||
|
|
||||||
return start;
|
return start;
|
||||||
}
|
}
|
||||||
|
|
||||||
template <typename octet_iterator>
|
template <typename octet_iterator>
|
||||||
inline bool is_valid(octet_iterator start, octet_iterator end)
|
bool is_valid(octet_iterator start, octet_iterator end)
|
||||||
{
|
{
|
||||||
return (utf8::find_invalid(start, end) == end);
|
return (utf8::find_invalid(start, end) == end);
|
||||||
}
|
}
|
||||||
|
|
||||||
template <typename octet_iterator>
|
template <typename octet_iterator>
|
||||||
inline bool starts_with_bom (octet_iterator it, octet_iterator end)
|
bool starts_with_bom (octet_iterator it, octet_iterator end) {
|
||||||
{
|
|
||||||
return (
|
return (
|
||||||
((it != end) && (utf8::internal::mask8(*it++)) == bom[0]) &&
|
it != end && utf8::internal::mask8(*it++) == BomByte0 &&
|
||||||
((it != end) && (utf8::internal::mask8(*it++)) == bom[1]) &&
|
it != end && utf8::internal::mask8(*it++) == BomByte1 &&
|
||||||
((it != end) && (utf8::internal::mask8(*it)) == bom[2])
|
it != end && utf8::internal::mask8(*it) == BomByte2
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
//Deprecated in release 2.3
|
//Deprecated in release 2.3
|
||||||
template <typename octet_iterator>
|
template <typename octet_iterator>
|
||||||
inline bool is_bom (octet_iterator it)
|
bool is_bom (octet_iterator it) {
|
||||||
{
|
|
||||||
return (
|
return (
|
||||||
(utf8::internal::mask8(*it++)) == bom[0] &&
|
utf8::internal::mask8(*it++) == BomByte0 &&
|
||||||
(utf8::internal::mask8(*it++)) == bom[1] &&
|
utf8::internal::mask8(*it++) == BomByte1 &&
|
||||||
(utf8::internal::mask8(*it)) == bom[2]
|
utf8::internal::mask8(*it) == BomByte2
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
} // namespace utf8
|
} // namespace utf8
|
||||||
|
|
|
@ -45,8 +45,8 @@ namespace utf8 {
|
||||||
template <typename C>
|
template <typename C>
|
||||||
struct utf_policy_throw : private utf_policy_replace<C> {
|
struct utf_policy_throw : private utf_policy_replace<C> {
|
||||||
enum { is_safe = 1 };
|
enum { is_safe = 1 };
|
||||||
uint8_t operator() ( utf8::code_point cp, C value, utf8::ErrorTypes ty, uint8_t faulty_pos ) const {
|
C operator() ( utf8::code_point cp, C value, utf8::ErrorTypes ty, uint8_t faulty_pos ) const {
|
||||||
throw utf8::exception<C>(cp, ty, value, faulty_pos);
|
throw utf8::utf_exception<C>(cp, ty, value, faulty_pos);
|
||||||
return utf_policy_replace<C>::operator()(value);
|
return utf_policy_replace<C>::operator()(value);
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
@ -76,12 +76,34 @@ namespace utf8 {
|
||||||
struct utf_policy_default : public utf_policy_throw<typename std::iterator_traits<I>::value_type> {
|
struct utf_policy_default : public utf_policy_throw<typename std::iterator_traits<I>::value_type> {
|
||||||
};
|
};
|
||||||
|
|
||||||
//namespace internal {
|
/// Error policy to protect against generically invalid codepoints, not tied to any specific utf encoding
|
||||||
// template <typename octet_iterator>
|
struct error_policy_ignore {
|
||||||
// class utf_validation {
|
code_point operator() ( code_point cp, ErrorTypes ) const {
|
||||||
// public:
|
return cp;
|
||||||
// void operator() ( octet_iterator it ) const;
|
}
|
||||||
//} //namespace internal
|
};
|
||||||
|
|
||||||
|
struct error_policy_assert {
|
||||||
|
code_point operator() ( code_point cp, ErrorTypes ) const {
|
||||||
|
assert(false);
|
||||||
|
return cp;
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
template <code_point R>
|
||||||
|
struct error_policy_replace {
|
||||||
|
code_point operator() ( code_point, ErrorTypes ) const {
|
||||||
|
return R;
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
struct error_policy_throw {
|
||||||
|
code_point operator() ( code_point cp, ErrorTypes typ ) const {
|
||||||
|
throw code_point_exception(cp, typ);
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
typedef error_policy_throw error_policy_default;
|
||||||
} //namespace utf8
|
} //namespace utf8
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
|
|
@ -60,7 +60,7 @@ namespace utf8 {
|
||||||
};
|
};
|
||||||
|
|
||||||
template <typename C>
|
template <typename C>
|
||||||
class exception : public ::std::exception {
|
class utf_exception : public ::std::exception {
|
||||||
public:
|
public:
|
||||||
enum UtfCategories {
|
enum UtfCategories {
|
||||||
UtfCategory_8 = 1,
|
UtfCategory_8 = 1,
|
||||||
|
@ -68,7 +68,7 @@ namespace utf8 {
|
||||||
UtfCategory_32
|
UtfCategory_32
|
||||||
};
|
};
|
||||||
|
|
||||||
exception ( utf8::code_point codep, ErrorTypes err_type, C value, uint8_t faulty_pos ) :
|
utf_exception ( utf8::code_point codep, ErrorTypes err_type, C value, uint8_t faulty_pos ) :
|
||||||
cp(codep),
|
cp(codep),
|
||||||
utf_category(static_cast<UtfCategories>(internal::GetMinBitSizeClass<sizeof(C)>::Result)),
|
utf_category(static_cast<UtfCategories>(internal::GetMinBitSizeClass<sizeof(C)>::Result)),
|
||||||
error_type(err_type),
|
error_type(err_type),
|
||||||
|
@ -102,6 +102,28 @@ namespace utf8 {
|
||||||
const C faulty_value;
|
const C faulty_value;
|
||||||
const uint8_t faulty_part;
|
const uint8_t faulty_part;
|
||||||
};
|
};
|
||||||
|
|
||||||
|
class code_point_exception : public ::std::exception {
|
||||||
|
public:
|
||||||
|
code_point_exception ( code_point codep, ErrorTypes err_type) :
|
||||||
|
cp(codep),
|
||||||
|
error_type(err_type)
|
||||||
|
{
|
||||||
|
}
|
||||||
|
|
||||||
|
virtual const char* what ( void ) const noexcept {
|
||||||
|
switch (cp) {
|
||||||
|
case ErrorType_InvalidLead: return "Invalid lead";
|
||||||
|
case ErrorType_OverlongSequence: return "Overlong sequence";
|
||||||
|
case ErrorType_InvalidCodePoint: return "Invalid codepoint";
|
||||||
|
default: return "Unknown error";
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
private:
|
||||||
|
const code_point cp;
|
||||||
|
const ErrorTypes error_type;
|
||||||
|
};
|
||||||
} //namespace utf8
|
} //namespace utf8
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
|
|
@ -35,39 +35,39 @@ DEALINGS IN THE SOFTWARE.
|
||||||
#include "global.hpp"
|
#include "global.hpp"
|
||||||
#include <iterator>
|
#include <iterator>
|
||||||
|
|
||||||
namespace utf8
|
namespace utf8 {
|
||||||
{
|
|
||||||
/// The library API - functions intended to be called by the users
|
/// The library API - functions intended to be called by the users
|
||||||
template <typename octet_iterator>
|
template <typename octet_iterator, typename error_policy=error_policy_default>
|
||||||
octet_iterator append(utf8::code_point cp, octet_iterator result)
|
octet_iterator append(utf8::code_point cp, octet_iterator result) {
|
||||||
{
|
if (not utf8::internal::is_code_point_valid(cp)) {
|
||||||
//TODO: implement
|
error_policy()(cp, ErrorType_InvalidCodePoint);
|
||||||
//if (!utf8::internal::is_code_point_valid(cp))
|
return result;
|
||||||
// throw invalid_code_point(cp);
|
}
|
||||||
|
|
||||||
//if (cp < 0x80) // one octet
|
if (cp < 0x80) // one octet
|
||||||
// *(result++) = static_cast<uint8_t>(cp);
|
*(result++) = static_cast<uint8_t>(cp);
|
||||||
//else if (cp < 0x800) { // two octets
|
else if (cp < 0x800) { // two octets
|
||||||
// *(result++) = static_cast<uint8_t>((cp >> 6) | 0xc0);
|
*(result++) = static_cast<uint8_t>((cp >> 6) | 0xc0);
|
||||||
// *(result++) = static_cast<uint8_t>((cp & 0x3f) | 0x80);
|
*(result++) = static_cast<uint8_t>((cp & 0x3f) | 0x80);
|
||||||
//}
|
}
|
||||||
//else if (cp < 0x10000) { // three octets
|
else if (cp < 0x10000) { // three octets
|
||||||
// *(result++) = static_cast<uint8_t>((cp >> 12) | 0xe0);
|
*(result++) = static_cast<uint8_t>((cp >> 12) | 0xe0);
|
||||||
// *(result++) = static_cast<uint8_t>(((cp >> 6) & 0x3f) | 0x80);
|
*(result++) = static_cast<uint8_t>(((cp >> 6) & 0x3f) | 0x80);
|
||||||
// *(result++) = static_cast<uint8_t>((cp & 0x3f) | 0x80);
|
*(result++) = static_cast<uint8_t>((cp & 0x3f) | 0x80);
|
||||||
//}
|
}
|
||||||
//else { // four octets
|
else { // four octets
|
||||||
// *(result++) = static_cast<uint8_t>((cp >> 18) | 0xf0);
|
*(result++) = static_cast<uint8_t>((cp >> 18) | 0xf0);
|
||||||
// *(result++) = static_cast<uint8_t>(((cp >> 12) & 0x3f) | 0x80);
|
*(result++) = static_cast<uint8_t>(((cp >> 12) & 0x3f) | 0x80);
|
||||||
// *(result++) = static_cast<uint8_t>(((cp >> 6) & 0x3f) | 0x80);
|
*(result++) = static_cast<uint8_t>(((cp >> 6) & 0x3f) | 0x80);
|
||||||
// *(result++) = static_cast<uint8_t>((cp & 0x3f) | 0x80);
|
*(result++) = static_cast<uint8_t>((cp & 0x3f) | 0x80);
|
||||||
//}
|
}
|
||||||
return result;
|
return result;
|
||||||
}
|
}
|
||||||
|
|
||||||
template <typename octet_iterator, typename output_iterator>
|
template <typename octet_iterator, typename output_iterator>
|
||||||
output_iterator replace_invalid(octet_iterator start, octet_iterator end, output_iterator out, utf8::code_point replacement)
|
output_iterator replace_invalid(octet_iterator start, octet_iterator end, output_iterator out, utf8::code_point replacement)
|
||||||
{
|
{
|
||||||
|
assert(false);
|
||||||
//TODO: implement
|
//TODO: implement
|
||||||
//while (start != end) {
|
//while (start != end) {
|
||||||
// octet_iterator sequence_start = start;
|
// octet_iterator sequence_start = start;
|
||||||
|
@ -170,6 +170,7 @@ namespace utf8
|
||||||
template <typename u16bit_iterator, typename octet_iterator>
|
template <typename u16bit_iterator, typename octet_iterator>
|
||||||
octet_iterator utf16to8 (u16bit_iterator start, u16bit_iterator end, octet_iterator result)
|
octet_iterator utf16to8 (u16bit_iterator start, u16bit_iterator end, octet_iterator result)
|
||||||
{
|
{
|
||||||
|
assert(false);
|
||||||
//TODO: implement
|
//TODO: implement
|
||||||
//while (start != end) {
|
//while (start != end) {
|
||||||
// utf8::code_point cp = utf8::internal::mask16(*start++);
|
// utf8::code_point cp = utf8::internal::mask16(*start++);
|
||||||
|
|
|
@ -35,6 +35,30 @@ namespace utf8 {
|
||||||
enum {
|
enum {
|
||||||
InvalidCodePoint = static_cast<code_point>(-1)
|
InvalidCodePoint = static_cast<code_point>(-1)
|
||||||
};
|
};
|
||||||
|
|
||||||
|
// Byte order mark
|
||||||
|
enum {
|
||||||
|
BomByte0 = 0xef,
|
||||||
|
BomByte1 = 0xbb,
|
||||||
|
BomByte2 = 0xbf
|
||||||
|
};
|
||||||
|
|
||||||
|
namespace internal {
|
||||||
|
// Unicode constants
|
||||||
|
enum {
|
||||||
|
// Leading (high) surrogates: 0xd800 - 0xdbff
|
||||||
|
// Trailing (low) surrogates: 0xdc00 - 0xdfff
|
||||||
|
LEAD_SURROGATE_MIN = 0xd800u,
|
||||||
|
LEAD_SURROGATE_MAX = 0xdbffu,
|
||||||
|
TRAIL_SURROGATE_MIN = 0xdc00u,
|
||||||
|
TRAIL_SURROGATE_MAX = 0xdfffu,
|
||||||
|
LEAD_OFFSET = LEAD_SURROGATE_MIN - (0x10000 >> 10),
|
||||||
|
SURROGATE_OFFSET = 0x10000u - (LEAD_SURROGATE_MIN << 10) - TRAIL_SURROGATE_MIN,
|
||||||
|
|
||||||
|
// Maximum valid value for a Unicode code point
|
||||||
|
CODE_POINT_MAX = 0x0010ffffu
|
||||||
|
};
|
||||||
|
} //namespace internal
|
||||||
} //namespace utf8
|
} //namespace utf8
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue