From aae1711e997864159c041de921e2ca390b28ae7c Mon Sep 17 00:00:00 2001 From: King_DuckZ Date: Mon, 30 Jun 2014 21:54:23 +0200 Subject: [PATCH] Add a new exception class and a new set of policies. Implements append(). This is an intermediate commit. --- src/utf8/core.hpp | 65 ++++++++++++++----------------------- src/utf8/error_policies.hpp | 38 +++++++++++++++++----- src/utf8/exception.hpp | 26 +++++++++++++-- src/utf8/functions.hpp | 51 +++++++++++++++-------------- src/utf8/global.hpp | 24 ++++++++++++++ 5 files changed, 128 insertions(+), 76 deletions(-) diff --git a/src/utf8/core.hpp b/src/utf8/core.hpp index 4ae5eed..73505f0 100644 --- a/src/utf8/core.hpp +++ b/src/utf8/core.hpp @@ -56,7 +56,7 @@ namespace utf8 { return static_cast(0xff & oc); } template<> - inline uint8_ mask8(uint8_t oc) { + inline uint8_t mask8(uint8_t oc) { return oc; } @@ -193,21 +193,6 @@ namespace utf8 { octet_value_type faulty_part; }; - // Unicode constants - enum { - // Leading (high) surrogates: 0xd800 - 0xdbff - // Trailing (low) surrogates: 0xdc00 - 0xdfff - LEAD_SURROGATE_MIN = 0xd800u, - LEAD_SURROGATE_MAX = 0xdbffu, - TRAIL_SURROGATE_MIN = 0xdc00u, - TRAIL_SURROGATE_MAX = 0xdfffu, - LEAD_OFFSET = LEAD_SURROGATE_MIN - (0x10000 >> 10), - SURROGATE_OFFSET = 0x10000u - (LEAD_SURROGATE_MIN << 10) - TRAIL_SURROGATE_MIN, - - // Maximum valid value for a Unicode code point - CODE_POINT_MAX = 0x0010ffffu - }; - template inline bool is_lead_surrogate(u16 cp) { return (cp >= LEAD_SURROGATE_MIN && cp <= LEAD_SURROGATE_MAX); @@ -220,12 +205,12 @@ namespace utf8 { template inline bool is_surrogate(u16 cp) { - return (cp >= LEAD_SURROGATE_MIN && cp <= TRAIL_SURROGATE_MAX); + return cp >= LEAD_SURROGATE_MIN && cp <= TRAIL_SURROGATE_MAX; } template inline bool is_code_point_valid(u32 cp) { - return (cp <= CODE_POINT_MAX && !utf8::internal::is_surrogate(cp)); + return cp <= CODE_POINT_MAX && not utf8::internal::is_surrogate(cp); } template @@ -244,7 +229,7 @@ namespace utf8 { } template - inline 
bool is_overlong_sequence(utf8::code_point cp, octet_difference_type length) { + bool is_overlong_sequence(utf8::code_point cp, octet_difference_type length) { if (cp < 0x80) { if (length != 1) return true; @@ -262,47 +247,45 @@ namespace utf8 { } } // namespace internal - // Byte order mark - const uint8_t bom[] = {0xef, 0xbb, 0xbf}; - template octet_iterator find_invalid(octet_iterator start, octet_iterator end) { - //octet_iterator result = start; - //while (result != end) { - // utf8::internal::utf_error err_code = utf8::internal::validate_next(result); - // if (err_code != internal::UTF8_OK) - // return result; - //} - //return result; - //TODO: implement + typedef typename std::iterator_traits::difference_type octet_difference_type; + utf8::internal::SequenceReader seq; + + while (start != end) { + const octet_difference_type length = utf8::internal::sequence_length(*start); + const utf8::code_point cp = seq.get_sequence(start, length); + if (seq.has_error()) + return start; + ++start; + } + return start; } template - inline bool is_valid(octet_iterator start, octet_iterator end) + bool is_valid(octet_iterator start, octet_iterator end) { return (utf8::find_invalid(start, end) == end); } template - inline bool starts_with_bom (octet_iterator it, octet_iterator end) - { + bool starts_with_bom (octet_iterator it, octet_iterator end) { return ( - ((it != end) && (utf8::internal::mask8(*it++)) == bom[0]) && - ((it != end) && (utf8::internal::mask8(*it++)) == bom[1]) && - ((it != end) && (utf8::internal::mask8(*it)) == bom[2]) + it != end && utf8::internal::mask8(*it++) == BomByte0 && + it != end && utf8::internal::mask8(*it++) == BomByte1 && + it != end && utf8::internal::mask8(*it) == BomByte2 ); } //Deprecated in release 2.3 template - inline bool is_bom (octet_iterator it) - { + bool is_bom (octet_iterator it) { return ( - (utf8::internal::mask8(*it++)) == bom[0] && - (utf8::internal::mask8(*it++)) == bom[1] && - (utf8::internal::mask8(*it)) == bom[2] + 
utf8::internal::mask8(*it++) == BomByte0 && + utf8::internal::mask8(*it++) == BomByte1 && + utf8::internal::mask8(*it) == BomByte2 ); } } // namespace utf8 diff --git a/src/utf8/error_policies.hpp b/src/utf8/error_policies.hpp index 407c90e..0bb11e2 100644 --- a/src/utf8/error_policies.hpp +++ b/src/utf8/error_policies.hpp @@ -45,8 +45,8 @@ namespace utf8 { template struct utf_policy_throw : private utf_policy_replace { enum { is_safe = 1 }; - uint8_t operator() ( utf8::code_point cp, C value, utf8::ErrorTypes ty, uint8_t faulty_pos ) const { - throw utf8::exception(cp, ty, value, faulty_pos); + C operator() ( utf8::code_point cp, C value, utf8::ErrorTypes ty, uint8_t faulty_pos ) const { + throw utf8::utf_exception(cp, ty, value, faulty_pos); return utf_policy_replace::operator()(value); } }; @@ -76,12 +76,34 @@ namespace utf8 { struct utf_policy_default : public utf_policy_throw::value_type> { }; - //namespace internal { - // template - // class utf_validation { - // public: - // void operator() ( octet_iterator it ) const; - //} //namespace internal + /// Error policy to protect against generically invalid codepoints, not tied to any specific utf encoding + struct error_policy_ignore { + code_point operator() ( code_point cp, ErrorTypes ) const { + return cp; + } + }; + + struct error_policy_assert { + code_point operator() ( code_point cp, ErrorTypes ) const { + assert(false); + return cp; + } + }; + + template + struct error_policy_replace { + code_point operator() ( code_point, ErrorTypes ) const { + return R; + } + }; + + struct error_policy_throw { + code_point operator() ( code_point cp, ErrorTypes typ ) const { + throw code_point_exception(cp, typ); + } + }; + + typedef error_policy_throw error_policy_default; } //namespace utf8 #endif diff --git a/src/utf8/exception.hpp b/src/utf8/exception.hpp index 5d1d82b..8bf43d8 100644 --- a/src/utf8/exception.hpp +++ b/src/utf8/exception.hpp @@ -60,7 +60,7 @@ namespace utf8 { }; template - class exception : public 
::std::exception { + class utf_exception : public ::std::exception { public: enum UtfCategories { UtfCategory_8 = 1, @@ -68,7 +68,7 @@ namespace utf8 { UtfCategory_32 }; - exception ( utf8::code_point codep, ErrorTypes err_type, C value, uint8_t faulty_pos ) : + utf_exception ( utf8::code_point codep, ErrorTypes err_type, C value, uint8_t faulty_pos ) : cp(codep), utf_category(static_cast(internal::GetMinBitSizeClass::Result)), error_type(err_type), @@ -102,6 +102,28 @@ namespace utf8 { const C faulty_value; const uint8_t faulty_part; }; + + class code_point_exception : public ::std::exception { /* Stores a code point together with an ErrorTypes value; what() returns a fixed message chosen by that error type. */ + public: + code_point_exception ( code_point codep, ErrorTypes err_type) : + cp(codep), + error_type(err_type) + { + } + + virtual const char* what ( void ) const noexcept { + switch (error_type) { /* dispatch on the error kind: cp holds the offending code point value, not an ErrorTypes enumerator */ + case ErrorType_InvalidLead: return "Invalid lead"; + case ErrorType_OverlongSequence: return "Overlong sequence"; + case ErrorType_InvalidCodePoint: return "Invalid codepoint"; + default: return "Unknown error"; + } + } + + private: + const code_point cp; + const ErrorTypes error_type; + }; } //namespace utf8 #endif diff --git a/src/utf8/functions.hpp b/src/utf8/functions.hpp index 23152b6..760aa43 100644 --- a/src/utf8/functions.hpp +++ b/src/utf8/functions.hpp @@ -35,39 +35,39 @@ DEALINGS IN THE SOFTWARE. 
#include "global.hpp" #include -namespace utf8 -{ +namespace utf8 { /// The library API - functions intended to be called by the users - template - octet_iterator append(utf8::code_point cp, octet_iterator result) - { - //TODO: implement - //if (!utf8::internal::is_code_point_valid(cp)) - // throw invalid_code_point(cp); + template + octet_iterator append(utf8::code_point cp, octet_iterator result) { + if (not utf8::internal::is_code_point_valid(cp)) { + error_policy()(cp, ErrorType_InvalidCodePoint); + return result; + } - //if (cp < 0x80) // one octet - // *(result++) = static_cast(cp); - //else if (cp < 0x800) { // two octets - // *(result++) = static_cast((cp >> 6) | 0xc0); - // *(result++) = static_cast((cp & 0x3f) | 0x80); - //} - //else if (cp < 0x10000) { // three octets - // *(result++) = static_cast((cp >> 12) | 0xe0); - // *(result++) = static_cast(((cp >> 6) & 0x3f) | 0x80); - // *(result++) = static_cast((cp & 0x3f) | 0x80); - //} - //else { // four octets - // *(result++) = static_cast((cp >> 18) | 0xf0); - // *(result++) = static_cast(((cp >> 12) & 0x3f) | 0x80); - // *(result++) = static_cast(((cp >> 6) & 0x3f) | 0x80); - // *(result++) = static_cast((cp & 0x3f) | 0x80); - //} + if (cp < 0x80) // one octet + *(result++) = static_cast(cp); + else if (cp < 0x800) { // two octets + *(result++) = static_cast((cp >> 6) | 0xc0); + *(result++) = static_cast((cp & 0x3f) | 0x80); + } + else if (cp < 0x10000) { // three octets + *(result++) = static_cast((cp >> 12) | 0xe0); + *(result++) = static_cast(((cp >> 6) & 0x3f) | 0x80); + *(result++) = static_cast((cp & 0x3f) | 0x80); + } + else { // four octets + *(result++) = static_cast((cp >> 18) | 0xf0); + *(result++) = static_cast(((cp >> 12) & 0x3f) | 0x80); + *(result++) = static_cast(((cp >> 6) & 0x3f) | 0x80); + *(result++) = static_cast((cp & 0x3f) | 0x80); + } return result; } template output_iterator replace_invalid(octet_iterator start, octet_iterator end, output_iterator out, utf8::code_point 
replacement) { + assert(false); //TODO: implement //while (start != end) { // octet_iterator sequence_start = start; @@ -170,6 +170,7 @@ namespace utf8 template octet_iterator utf16to8 (u16bit_iterator start, u16bit_iterator end, octet_iterator result) { + assert(false); //TODO: implement //while (start != end) { // utf8::code_point cp = utf8::internal::mask16(*start++); diff --git a/src/utf8/global.hpp b/src/utf8/global.hpp index dcca73e..05d053a 100644 --- a/src/utf8/global.hpp +++ b/src/utf8/global.hpp @@ -35,6 +35,30 @@ namespace utf8 { enum { InvalidCodePoint = static_cast(-1) }; + + // Byte order mark + enum { + BomByte0 = 0xef, + BomByte1 = 0xbb, + BomByte2 = 0xbf + }; + + namespace internal { + // Unicode constants + enum { + // Leading (high) surrogates: 0xd800 - 0xdbff + // Trailing (low) surrogates: 0xdc00 - 0xdfff + LEAD_SURROGATE_MIN = 0xd800u, + LEAD_SURROGATE_MAX = 0xdbffu, + TRAIL_SURROGATE_MIN = 0xdc00u, + TRAIL_SURROGATE_MAX = 0xdfffu, + LEAD_OFFSET = LEAD_SURROGATE_MIN - (0x10000 >> 10), + SURROGATE_OFFSET = 0x10000u - (LEAD_SURROGATE_MIN << 10) - TRAIL_SURROGATE_MIN, + + // Maximum valid value for a Unicode code point + CODE_POINT_MAX = 0x0010ffffu + }; + } //namespace internal } //namespace utf8 #endif