diff --git a/src/utf8/core.hpp b/src/utf8/core.hpp index 02b114d..4ae5eed 100644 --- a/src/utf8/core.hpp +++ b/src/utf8/core.hpp @@ -1,4 +1,5 @@ // Copyright 2006 Nemanja Trifunovic +// Copyright 2014 Michele Santullo /* Permission is hereby granted, free of charge, to any person or organization @@ -28,283 +29,282 @@ DEALINGS IN THE SOFTWARE. #ifndef UTF8_FOR_CPP_CORE_H_2675DCD0_9480_4c0c_B92A_CC14C027B731 #define UTF8_FOR_CPP_CORE_H_2675DCD0_9480_4c0c_B92A_CC14C027B731 +#include "error_policies.hpp" +#include "global.hpp" #include -namespace utf8 -{ - // The typedefs for 8-bit, 16-bit and 32-bit unsigned integers - // You may need to change them to match your system. - // These typedefs have the same names as ones from cstdint, or boost/cstdint - typedef unsigned char uint8_t; - typedef unsigned short uint16_t; - typedef unsigned int uint32_t; +#if defined(__GNUC__) +# define pure_function __attribute__((pure)) +#else +# error "Unknown compiler - if your compiler doesn't support pure functions just declare pure_function as an empty macro for your specific compiler" +#endif -// Helper code - not intended to be directly called by the library users. May be changed at any time -namespace internal -{ - // Unicode constants - // Leading (high) surrogates: 0xd800 - 0xdbff - // Trailing (low) surrogates: 0xdc00 - 0xdfff - const uint16_t LEAD_SURROGATE_MIN = 0xd800u; - const uint16_t LEAD_SURROGATE_MAX = 0xdbffu; - const uint16_t TRAIL_SURROGATE_MIN = 0xdc00u; - const uint16_t TRAIL_SURROGATE_MAX = 0xdfffu; - const uint16_t LEAD_OFFSET = LEAD_SURROGATE_MIN - (0x10000 >> 10); - const uint32_t SURROGATE_OFFSET = 0x10000u - (LEAD_SURROGATE_MIN << 10) - TRAIL_SURROGATE_MIN; +namespace utf8 { + // Helper code - not intended to be directly called by the library users. 
May be changed at any time + namespace internal { + template bool is_trail(octet_type oc) pure_function; + template uint8_t mask8(octet_type oc) pure_function; + template inline uint16_t mask16(u16_type oc) pure_function; - // Maximum valid value for a Unicode code point - const uint32_t CODE_POINT_MAX = 0x0010ffffu; + template + bool is_trail(octet_type oc) { + return ((utf8::internal::mask8(oc) >> 6) == 0x2); + } - template - inline uint8_t mask8(octet_type oc) - { - return static_cast(0xff & oc); - } - template - inline uint16_t mask16(u16_type oc) - { - return static_cast(0xffff & oc); - } - template - inline bool is_trail(octet_type oc) - { - return ((utf8::internal::mask8(oc) >> 6) == 0x2); - } + template + inline uint8_t mask8(octet_type oc) { + return static_cast(0xff & oc); + } + template<> + inline uint8_ mask8(uint8_t oc) { + return oc; + } - template - inline bool is_lead_surrogate(u16 cp) - { - return (cp >= LEAD_SURROGATE_MIN && cp <= LEAD_SURROGATE_MAX); - } + template + uint16_t mask16(u16_type oc) { + return static_cast(0xffff & oc); + } - template - inline bool is_trail_surrogate(u16 cp) - { - return (cp >= TRAIL_SURROGATE_MIN && cp <= TRAIL_SURROGATE_MAX); - } + /// Helper for get_sequence_x + template + class SequenceReader { + public: + enum SequenceErrors { + SequenceError_None, + SequenceError_BadLength, + SequenceError_BadByte2, + SequenceError_BadByte3, + SequenceError_BadByte4, + SequenceError_NothingRead + }; - template - inline bool is_surrogate(u16 cp) - { - return (cp >= LEAD_SURROGATE_MIN && cp <= TRAIL_SURROGATE_MAX); - } + typedef typename std::iterator_traits::difference_type octet_difference_type; + typedef typename std::iterator_traits::value_type octet_value_type; - template - inline bool is_code_point_valid(u32 cp) - { - return (cp <= CODE_POINT_MAX && !utf8::internal::is_surrogate(cp)); - } + SequenceReader ( void ) : + seq_error(SequenceError_NothingRead) + { + } - template - inline typename 
std::iterator_traits::difference_type - sequence_length(octet_iterator lead_it) - { - uint8_t lead = utf8::internal::mask8(*lead_it); - if (lead < 0x80) - return 1; - else if ((lead >> 5) == 0x6) - return 2; - else if ((lead >> 4) == 0xe) - return 3; - else if ((lead >> 3) == 0x1e) - return 4; - else - return 0; - } + utf8::code_point get_sequence(octet_iterator& it, octet_difference_type seq_len) { + switch (seq_len) { + case 1: return get_sequence_1(it); + case 2: return get_sequence_2(it); + case 3: return get_sequence_3(it); + case 4: return get_sequence_4(it); + //this shouldn't really happen, this is an internal function + //and a correct length should be passed always + default: + seq_error = SequenceError_BadLength; + return cp; + } + } - template - inline bool is_overlong_sequence(uint32_t cp, octet_difference_type length) - { - if (cp < 0x80) { - if (length != 1) - return true; - } - else if (cp < 0x800) { - if (length != 2) - return true; - } - else if (cp < 0x10000) { - if (length != 3) - return true; - } + /// get_sequence_x functions decode utf-8 sequences of the length x + utf8::code_point get_sequence_1(const octet_iterator& it) { + seq_error = SequenceError_None; + return (cp = utf8::internal::mask8(*it)); + } - return false; - } + utf8::code_point get_sequence_2(octet_iterator& it) { + cp = utf8::internal::mask8(*it); - enum utf_error {UTF8_OK, INVALID_LEAD, INCOMPLETE_SEQUENCE, OVERLONG_SEQUENCE, INVALID_CODE_POINT}; + ++it; + if (not utf8::internal::is_trail(*it)) { + faulty_part = *it; + seq_error = SequenceError_BadByte2; + return cp; + } - /// Helper for get_sequence_x - template - utf_error increase_safely(octet_iterator& it) - { - if (!utf8::internal::is_trail(*it)) - return INCOMPLETE_SEQUENCE; + seq_error = SequenceError_None; + cp = ((cp << 6) & 0x7ff) + ((*it) & 0x3f); + return cp; + } - return UTF8_OK; - } + utf8::code_point get_sequence_3(octet_iterator& it) { + cp = utf8::internal::mask8(*it); - #define 
UTF8_CPP_INCREASE_AND_RETURN_ON_ERROR(IT) {utf_error ret = increase_safely(IT); if (ret != UTF8_OK) return ret;} + ++it; + if (not utf8::internal::is_trail(*it)) { + seq_error = SequenceError_BadByte2; + faulty_part = *it; + return cp; + } - /// get_sequence_x functions decode utf-8 sequences of the length x - template - void get_sequence_1(octet_iterator& it, uint32_t& code_point) - { - code_point = utf8::internal::mask8(*it); - } + cp = ((cp << 12) & 0xffff) + ((utf8::internal::mask8(*it) << 6) & 0xfff); - template - utf_error get_sequence_2(octet_iterator& it, uint32_t& code_point) - { - code_point = utf8::internal::mask8(*it); + ++it; + if (not utf8::internal::is_trail(*it)) { + faulty_part = *it; + seq_error = SequenceError_BadByte3; + return cp; + } + seq_error = SequenceError_None; + cp += *it & 0x3f; + return cp; + } - UTF8_CPP_INCREASE_AND_RETURN_ON_ERROR(it) + utf8::code_point get_sequence_4(octet_iterator& it) { + utf8::code_point cp = utf8::internal::mask8(*it); - code_point = ((code_point << 6) & 0x7ff) + ((*it) & 0x3f); + ++it; + if (not utf8::internal::is_trail(*it)) { + seq_error = SequenceError_BadByte2; + faulty_part = *it; + return cp; + } - return UTF8_OK; - } + cp = ((cp << 18) & 0x1fffff) + ((utf8::internal::mask8(*it) << 12) & 0x3ffff); - template - utf_error get_sequence_3(octet_iterator& it, uint32_t& code_point) - { - code_point = utf8::internal::mask8(*it); + ++it; + if (not utf8::internal::is_trail(*it)) { + faulty_part = *it; + seq_error = SequenceError_BadByte3; + return cp; + } - UTF8_CPP_INCREASE_AND_RETURN_ON_ERROR(it) + cp += (utf8::internal::mask8(*it) << 6) & 0xfff; - code_point = ((code_point << 12) & 0xffff) + ((utf8::internal::mask8(*it) << 6) & 0xfff); + ++it; + if (not utf8::internal::is_trail(*it)) { + faulty_part = *it; + seq_error = SequenceError_BadByte4; + return cp; + } + seq_error = SequenceError_None; + cp += *it & 0x3f; + return cp; + } - UTF8_CPP_INCREASE_AND_RETURN_ON_ERROR(it) + bool has_error ( void ) const { 
return seq_error != SequenceError_None; } + SequenceErrors error ( void ) const { return seq_error; } + utf8::code_point code ( void ) const { return cp; } + octet_value_type faulty ( void ) const { return faulty_part; } + uint8_t part ( void ) const { + if (SequenceError_BadByte2 == seq_error or SequenceError_BadByte3 == seq_error or SequenceError_BadByte4 == seq_error) + return static_cast(seq_error - SequenceError_BadByte2) + 2; + else + return 0; + } - code_point += (*it) & 0x3f; + private: + utf8::code_point cp; + SequenceErrors seq_error; + octet_value_type faulty_part; + }; - return UTF8_OK; - } + // Unicode constants + enum { + // Leading (high) surrogates: 0xd800 - 0xdbff + // Trailing (low) surrogates: 0xdc00 - 0xdfff + LEAD_SURROGATE_MIN = 0xd800u, + LEAD_SURROGATE_MAX = 0xdbffu, + TRAIL_SURROGATE_MIN = 0xdc00u, + TRAIL_SURROGATE_MAX = 0xdfffu, + LEAD_OFFSET = LEAD_SURROGATE_MIN - (0x10000 >> 10), + SURROGATE_OFFSET = 0x10000u - (LEAD_SURROGATE_MIN << 10) - TRAIL_SURROGATE_MIN, - template - utf_error get_sequence_4(octet_iterator& it, uint32_t& code_point) - { - code_point = utf8::internal::mask8(*it); + // Maximum valid value for a Unicode code point + CODE_POINT_MAX = 0x0010ffffu + }; - UTF8_CPP_INCREASE_AND_RETURN_ON_ERROR(it) + template + inline bool is_lead_surrogate(u16 cp) { + return (cp >= LEAD_SURROGATE_MIN && cp <= LEAD_SURROGATE_MAX); + } - code_point = ((code_point << 18) & 0x1fffff) + ((utf8::internal::mask8(*it) << 12) & 0x3ffff); + template + inline bool is_trail_surrogate(u16 cp) { + return (cp >= TRAIL_SURROGATE_MIN && cp <= TRAIL_SURROGATE_MAX); + } - UTF8_CPP_INCREASE_AND_RETURN_ON_ERROR(it) + template + inline bool is_surrogate(u16 cp) { + return (cp >= LEAD_SURROGATE_MIN && cp <= TRAIL_SURROGATE_MAX); + } - code_point += (utf8::internal::mask8(*it) << 6) & 0xfff; + template + inline bool is_code_point_valid(u32 cp) { + return (cp <= CODE_POINT_MAX && !utf8::internal::is_surrogate(cp)); + } - UTF8_CPP_INCREASE_AND_RETURN_ON_ERROR(it) 
+ template + inline difference_type sequence_length(octet_type lead_char) { + const uint8_t lead = utf8::internal::mask8(lead_char); + if (lead < 0x80) + return 1; + else if ((lead >> 5) == 0x6) + return 2; + else if ((lead >> 4) == 0xe) + return 3; + else if ((lead >> 3) == 0x1e) + return 4; + else + return 0; + } - code_point += (*it) & 0x3f; + template + inline bool is_overlong_sequence(utf8::code_point cp, octet_difference_type length) { + if (cp < 0x80) { + if (length != 1) + return true; + } + else if (cp < 0x800) { + if (length != 2) + return true; + } + else if (cp < 0x10000) { + if (length != 3) + return true; + } - return UTF8_OK; - } + return false; + } + } // namespace internal - #undef UTF8_CPP_INCREASE_AND_RETURN_ON_ERROR + // Byte order mark + const uint8_t bom[] = {0xef, 0xbb, 0xbf}; - template - utf_error validate_next(octet_iterator& it, uint32_t& code_point) - { - // Save the original value of it so we can go back in case of failure - // Of course, it does not make much sense with i.e. 
stream iterators - octet_iterator original_it = it; + template + octet_iterator find_invalid(octet_iterator start, octet_iterator end) + { + //octet_iterator result = start; + //while (result != end) { + // utf8::internal::utf_error err_code = utf8::internal::validate_next(result); + // if (err_code != internal::UTF8_OK) + // return result; + //} + //return result; + //TODO: implement + return start; + } - uint32_t cp = 0; - // Determine the sequence length based on the lead octet - typedef typename std::iterator_traits::difference_type octet_difference_type; - const octet_difference_type length = utf8::internal::sequence_length(it); + template + inline bool is_valid(octet_iterator start, octet_iterator end) + { + return (utf8::find_invalid(start, end) == end); + } - // Get trail octets and calculate the code point - utf_error err = UTF8_OK; - switch (length) { - case 0: - return INVALID_LEAD; - case 1: - utf8::internal::get_sequence_1(it, cp); - break; - case 2: - err = utf8::internal::get_sequence_2(it, cp); - break; - case 3: - err = utf8::internal::get_sequence_3(it, cp); - break; - case 4: - err = utf8::internal::get_sequence_4(it, cp); - break; - } + template + inline bool starts_with_bom (octet_iterator it, octet_iterator end) + { + return ( + ((it != end) && (utf8::internal::mask8(*it++)) == bom[0]) && + ((it != end) && (utf8::internal::mask8(*it++)) == bom[1]) && + ((it != end) && (utf8::internal::mask8(*it)) == bom[2]) + ); + } - if (err == UTF8_OK) { - // Decoding succeeded. Now, security checks... - if (utf8::internal::is_code_point_valid(cp)) { - if (!utf8::internal::is_overlong_sequence(cp, length)){ - // Passed! Return here. 
- code_point = cp; - ++it; - return UTF8_OK; - } - else - err = OVERLONG_SEQUENCE; - } - else - err = INVALID_CODE_POINT; - } - - // Failure branch - restore the original value of the iterator - it = original_it; - return err; - } - - template - inline utf_error validate_next(octet_iterator& it) { - uint32_t ignored; - return utf8::internal::validate_next(it, ignored); - } - -} // namespace internal - - /// The library API - functions intended to be called by the users - - // Byte order mark - const uint8_t bom[] = {0xef, 0xbb, 0xbf}; - - template - octet_iterator find_invalid(octet_iterator start, octet_iterator end) - { - octet_iterator result = start; - while (result != end) { - utf8::internal::utf_error err_code = utf8::internal::validate_next(result); - if (err_code != internal::UTF8_OK) - return result; - } - return result; - } - - template - inline bool is_valid(octet_iterator start, octet_iterator end) - { - return (utf8::find_invalid(start, end) == end); - } - - template - inline bool starts_with_bom (octet_iterator it, octet_iterator end) - { - return ( - ((it != end) && (utf8::internal::mask8(*it++)) == bom[0]) && - ((it != end) && (utf8::internal::mask8(*it++)) == bom[1]) && - ((it != end) && (utf8::internal::mask8(*it)) == bom[2]) - ); - } - - //Deprecated in release 2.3 - template - inline bool is_bom (octet_iterator it) - { - return ( - (utf8::internal::mask8(*it++)) == bom[0] && - (utf8::internal::mask8(*it++)) == bom[1] && - (utf8::internal::mask8(*it)) == bom[2] - ); - } + //Deprecated in release 2.3 + template + inline bool is_bom (octet_iterator it) + { + return ( + (utf8::internal::mask8(*it++)) == bom[0] && + (utf8::internal::mask8(*it++)) == bom[1] && + (utf8::internal::mask8(*it)) == bom[2] + ); + } } // namespace utf8 #endif // header guard diff --git a/src/utf8/error_policies.hpp b/src/utf8/error_policies.hpp new file mode 100644 index 0000000..407c90e --- /dev/null +++ b/src/utf8/error_policies.hpp @@ -0,0 +1,87 @@ +// Copyright 2014 
Michele Santullo + +/* +Permission is hereby granted, free of charge, to any person or organization +obtaining a copy of the software and accompanying documentation covered by +this license (the "Software") to use, reproduce, display, distribute, +execute, and transmit the Software, and to prepare derivative works of the +Software, and to permit third-parties to whom the Software is furnished to +do so, all subject to the following: + +The copyright notices in the Software and this entire statement, including +the above license grant, this restriction and the following disclaimer, +must be included in all copies of the Software, in whole or in part, and +all derivative works of the Software, unless such copies or derivative +works are solely in the form of machine-executable object code generated by +a source language processor. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT +SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE +FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE, +ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +DEALINGS IN THE SOFTWARE. +*/ + +#ifndef idF5D925E65E0C4C2A983DAC728A7D5B77 +#define idF5D925E65E0C4C2A983DAC728A7D5B77 + +#include "exception.hpp" +#include "global.hpp" +#include + +namespace utf8 { + /// Error policy to protect against out-of-bounds iterators. 
+ + /// Error policy to protect against invalid utf sequences + template + struct utf_policy_replace { + C operator() ( C value ) const { + return static_cast('?'); + } + }; + + template + struct utf_policy_throw : private utf_policy_replace { + enum { is_safe = 1 }; + uint8_t operator() ( utf8::code_point cp, C value, utf8::ErrorTypes ty, uint8_t faulty_pos ) const { + throw utf8::exception(cp, ty, value, faulty_pos); + return utf_policy_replace::operator()(value); + } + }; + + template + struct utf_policy_assert : private utf_policy_replace { +#if defined(NDEBUG) + enum { is_safe = 0 }; +#else + enum { is_safe = 1 }; +#endif + C operator() ( C value ) const { + assert(false); + return utf_policy_replace::operator()(value); + } + }; + + template + struct utf_policy_ignore { + enum { is_safe = 0 }; + C operator() ( C value ) const { + return value; + } + }; + + template + struct utf_policy_default : public utf_policy_throw::value_type> { + }; + + //namespace internal { + // template + // class utf_validation { + // public: + // void operator() ( octet_iterator it ) const; + //} //namespace internal +} //namespace utf8 + +#endif diff --git a/src/utf8/exception.hpp b/src/utf8/exception.hpp index 39b9d28..5d1d82b 100644 --- a/src/utf8/exception.hpp +++ b/src/utf8/exception.hpp @@ -1,4 +1,5 @@ // Copyright 2006 Nemanja Trifunovic +// Copyright 2014 Michele Santullo /* Permission is hereby granted, free of charge, to any person or organization @@ -28,40 +29,78 @@ DEALINGS IN THE SOFTWARE.
#define id71B1E0983F3D4F7BAD0C091C4569AB37 #include +#include "global.hpp" namespace utf8 { - // Base for the exceptions that may be thrown from the library - class exception : public ::std::exception { - }; + namespace internal { + template + class GetMinBitSizeClass { + private: + GetMinBitSizeClass ( void ); //Not implemented + + template + struct CalcImpl { + enum { Result = 0 }; + }; + template + struct CalcImpl { + enum { Result = 1 + CalcImpl<(Val>>1)>::Result }; + }; + public: + enum { Result = CalcImpl::Result }; + }; + } //namespace internal // Exceptions that may be thrown from the library functions. - class invalid_code_point : public exception { - uint32_t cp; - public: - invalid_code_point(uint32_t cp) : cp(cp) {} - virtual const char* what() const throw() { return "Invalid code point"; } - uint32_t code_point() const {return cp;} + enum ErrorTypes { + ErrorType_InvalidLead, + ErrorType_IncompleteSequence, + ErrorType_OverlongSequence, + ErrorType_InvalidCodePoint }; - class invalid_utf8 : public exception { - uint8_t u8; + template + class exception : public ::std::exception { public: - invalid_utf8 (uint8_t u) : u8(u) {} - virtual const char* what() const throw() { return "Invalid UTF-8"; } - uint8_t utf8_octet() const {return u8;} - }; + enum UtfCategories { + UtfCategory_8 = 1, + UtfCategory_16, + UtfCategory_32 + }; - class invalid_utf16 : public exception { - uint16_t u16; - public: - invalid_utf16 (uint16_t u) : u16(u) {} - virtual const char* what() const throw() { return "Invalid UTF-16"; } - uint16_t utf16_word() const {return u16;} - }; + exception ( utf8::code_point codep, ErrorTypes err_type, C value, uint8_t faulty_pos ) : + cp(codep), + utf_category(static_cast(internal::GetMinBitSizeClass::Result)), + error_type(err_type), + faulty_value(value), + faulty_part(faulty_pos) + { + static_assert( + static_cast(internal::GetMinBitSizeClass::Result) == UtfCategory_8 or + static_cast(internal::GetMinBitSizeClass::Result) == UtfCategory_16 or + 
static_cast(internal::GetMinBitSizeClass::Result) == UtfCategory_32, "Invalid size for template parameter"); + } + virtual const char* what() const noexcept { + switch (error_type) { + case ErrorType_IncompleteSequence: + switch (utf_category) { + case UtfCategory_32: return "Invalid code point"; + case UtfCategory_8: return "Invalid UTF-8"; + case UtfCategory_16: return "Invalid UTF-16"; + default: return "Error in unknown sequence type"; + } + case ErrorType_InvalidLead: return "Invalid lead"; + case ErrorType_OverlongSequence: return "Overlong sequence"; + case ErrorType_InvalidCodePoint: return "Invalid codepoint"; + default: return "Unknown error"; + } + } - class not_enough_room : public exception { - public: - virtual const char* what() const throw() { return "Not enough space"; } + const utf8::code_point cp; + const UtfCategories utf_category; + const ErrorTypes error_type; + const C faulty_value; + const uint8_t faulty_part; }; } //namespace utf8 diff --git a/src/utf8/functions.hpp b/src/utf8/functions.hpp index 045f422..23152b6 100644 --- a/src/utf8/functions.hpp +++ b/src/utf8/functions.hpp @@ -31,98 +31,117 @@ DEALINGS IN THE SOFTWARE. 
#include "core.hpp" #include "exception.hpp" +#include "error_policies.hpp" +#include "global.hpp" +#include namespace utf8 { /// The library API - functions intended to be called by the users template - octet_iterator append(uint32_t cp, octet_iterator result) + octet_iterator append(utf8::code_point cp, octet_iterator result) { - if (!utf8::internal::is_code_point_valid(cp)) - throw invalid_code_point(cp); + //TODO: implement + //if (!utf8::internal::is_code_point_valid(cp)) + // throw invalid_code_point(cp); - if (cp < 0x80) // one octet - *(result++) = static_cast(cp); - else if (cp < 0x800) { // two octets - *(result++) = static_cast((cp >> 6) | 0xc0); - *(result++) = static_cast((cp & 0x3f) | 0x80); - } - else if (cp < 0x10000) { // three octets - *(result++) = static_cast((cp >> 12) | 0xe0); - *(result++) = static_cast(((cp >> 6) & 0x3f) | 0x80); - *(result++) = static_cast((cp & 0x3f) | 0x80); - } - else { // four octets - *(result++) = static_cast((cp >> 18) | 0xf0); - *(result++) = static_cast(((cp >> 12) & 0x3f) | 0x80); - *(result++) = static_cast(((cp >> 6) & 0x3f) | 0x80); - *(result++) = static_cast((cp & 0x3f) | 0x80); - } + //if (cp < 0x80) // one octet + // *(result++) = static_cast(cp); + //else if (cp < 0x800) { // two octets + // *(result++) = static_cast((cp >> 6) | 0xc0); + // *(result++) = static_cast((cp & 0x3f) | 0x80); + //} + //else if (cp < 0x10000) { // three octets + // *(result++) = static_cast((cp >> 12) | 0xe0); + // *(result++) = static_cast(((cp >> 6) & 0x3f) | 0x80); + // *(result++) = static_cast((cp & 0x3f) | 0x80); + //} + //else { // four octets + // *(result++) = static_cast((cp >> 18) | 0xf0); + // *(result++) = static_cast(((cp >> 12) & 0x3f) | 0x80); + // *(result++) = static_cast(((cp >> 6) & 0x3f) | 0x80); + // *(result++) = static_cast((cp & 0x3f) | 0x80); + //} return result; } template - output_iterator replace_invalid(octet_iterator start, octet_iterator end, output_iterator out, uint32_t replacement) + 
output_iterator replace_invalid(octet_iterator start, octet_iterator end, output_iterator out, utf8::code_point replacement) { - while (start != end) { - octet_iterator sequence_start = start; - internal::utf_error err_code = utf8::internal::validate_next(start); - switch (err_code) { - case internal::UTF8_OK : - for (octet_iterator it = sequence_start; it != start; ++it) - *out++ = *it; - break; - case internal::INVALID_LEAD: - out = utf8::append (replacement, out); - ++start; - break; - case internal::INCOMPLETE_SEQUENCE: - case internal::OVERLONG_SEQUENCE: - case internal::INVALID_CODE_POINT: - out = utf8::append (replacement, out); - ++start; - // just one replacement mark for the sequence - while (start != end && utf8::internal::is_trail(*start)) - ++start; - break; - } - } + //TODO: implement + //while (start != end) { + // octet_iterator sequence_start = start; + // internal::utf_error err_code = utf8::internal::validate_next(start); + // switch (err_code) { + // case internal::UTF8_OK : + // for (octet_iterator it = sequence_start; it != start; ++it) + // *out++ = *it; + // break; + // case internal::INVALID_LEAD: + // out = utf8::append (replacement, out); + // ++start; + // break; + // case internal::INCOMPLETE_SEQUENCE: + // case internal::OVERLONG_SEQUENCE: + // case internal::INVALID_CODE_POINT: + // out = utf8::append (replacement, out); + // ++start; + // // just one replacement mark for the sequence + // while (start != end && utf8::internal::is_trail(*start)) + // ++start; + // break; + // } + //} return out; } template inline output_iterator replace_invalid(octet_iterator start, octet_iterator end, output_iterator out) { - static const uint32_t replacement_marker = utf8::internal::mask16(0xfffd); + static const utf8::code_point replacement_marker = utf8::internal::mask16(0xfffd); return utf8::replace_invalid(start, end, out, replacement_marker); } - template - uint32_t next(octet_iterator& it) - { - uint32_t cp = 0; - internal::utf_error err_code 
= utf8::internal::validate_next(it, cp); - switch (err_code) { - case internal::UTF8_OK : - break; - case internal::INVALID_LEAD : - case internal::INCOMPLETE_SEQUENCE : - case internal::OVERLONG_SEQUENCE : - throw invalid_utf8(*it); - case internal::INVALID_CODE_POINT : - throw invalid_code_point(cp); + template > + utf8::code_point next(octet_iterator& it) { + typedef typename std::iterator_traits::difference_type octet_difference_type; + typedef typename std::iterator_traits::value_type octet_value_type; + + // Determine the sequence length based on the lead octet + const octet_value_type lead_char(*it); + const octet_difference_type length = utf8::internal::sequence_length(lead_char); + + // Get trail octets and calculate the code point + utf8::internal::SequenceReader seq; + const utf8::code_point cp = seq.get_sequence(it, length); + if (seq.has_error()) { + return invalid_utf_policy()(seq.code(), seq.faulty(), utf8::ErrorType_InvalidCodePoint, seq.part()); + } + + // Decoding succeeded. Now, security checks... + if (utf8::internal::is_code_point_valid(cp)) { + if (not utf8::internal::is_overlong_sequence(cp, length)){ + // Passed! Return here. 
+ ++it; + return cp; + } + else { + return invalid_utf_policy()(cp, 0, utf8::ErrorType_OverlongSequence, 0); + } + } + else { + return invalid_utf_policy()(cp, 0, utf8::ErrorType_InvalidCodePoint, 0); } - return cp; } template - uint32_t peek_next(octet_iterator it) + utf8::code_point peek_next(octet_iterator it) { return utf8::next(it); } template - uint32_t prior(octet_iterator& it) + utf8::code_point prior(octet_iterator& it) { octet_iterator end = it; // Go back until we hit either a lead octet or start @@ -130,11 +149,12 @@ namespace utf8 return utf8::peek_next(it); } - template - void advance (octet_iterator& it, distance_type n) - { + template > + code_point advance (octet_iterator& it, distance_type n) { + code_point ret = InvalidCodePoint; for (distance_type i = 0; i < n; ++i) - utf8::next(it); + ret = utf8::next(it); + return ret; } template @@ -150,27 +170,28 @@ namespace utf8 template octet_iterator utf16to8 (u16bit_iterator start, u16bit_iterator end, octet_iterator result) { - while (start != end) { - uint32_t cp = utf8::internal::mask16(*start++); - // Take care of surrogate pairs first - if (utf8::internal::is_lead_surrogate(cp)) { - if (start != end) { - uint32_t trail_surrogate = utf8::internal::mask16(*start++); - if (utf8::internal::is_trail_surrogate(trail_surrogate)) - cp = (cp << 10) + trail_surrogate + internal::SURROGATE_OFFSET; - else - throw invalid_utf16(static_cast(trail_surrogate)); - } - else - throw invalid_utf16(static_cast(cp)); + //TODO: implement + //while (start != end) { + // utf8::code_point cp = utf8::internal::mask16(*start++); + // // Take care of surrogate pairs first + // if (utf8::internal::is_lead_surrogate(cp)) { + // if (start != end) { + // utf8::code_point trail_surrogate = utf8::internal::mask16(*start++); + // if (utf8::internal::is_trail_surrogate(trail_surrogate)) + // cp = (cp << 10) + trail_surrogate + internal::SURROGATE_OFFSET; + // else + // throw invalid_utf16(static_cast(trail_surrogate)); + // } + // 
else + // throw invalid_utf16(static_cast(cp)); - } - // Lone trail surrogate - else if (utf8::internal::is_trail_surrogate(cp)) - throw invalid_utf16(static_cast(cp)); + // } + // // Lone trail surrogate + // else if (utf8::internal::is_trail_surrogate(cp)) + // throw invalid_utf16(static_cast(cp)); - result = utf8::append(cp, result); - } + // result = utf8::append(cp, result); + //} return result; } @@ -178,7 +199,7 @@ namespace utf8 u16bit_iterator utf8to16 (octet_iterator start, octet_iterator end, u16bit_iterator result) { while (start != end) { - uint32_t cp = utf8::next(start); + utf8::code_point cp = utf8::next(start); if (cp > 0xffff) { //make a surrogate pair *result++ = static_cast((cp >> 10) + internal::LEAD_OFFSET); *result++ = static_cast((cp & 0x3ff) + internal::TRAIL_SURROGATE_MIN); diff --git a/src/utf8/global.hpp b/src/utf8/global.hpp new file mode 100644 index 0000000..dcca73e --- /dev/null +++ b/src/utf8/global.hpp @@ -0,0 +1,40 @@ +// Copyright 2014 Michele Santullo + +/* +Permission is hereby granted, free of charge, to any person or organization +obtaining a copy of the software and accompanying documentation covered by +this license (the "Software") to use, reproduce, display, distribute, +execute, and transmit the Software, and to prepare derivative works of the +Software, and to permit third-parties to whom the Software is furnished to +do so, all subject to the following: + +The copyright notices in the Software and this entire statement, including +the above license grant, this restriction and the following disclaimer, +must be included in all copies of the Software, in whole or in part, and +all derivative works of the Software, unless such copies or derivative +works are solely in the form of machine-executable object code generated by +a source language processor. 
+ +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT +SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE +FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE, +ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +DEALINGS IN THE SOFTWARE. +*/ + +#ifndef idACBCC4F2EA7148C2B3F22FBCA60E3F21 +#define idACBCC4F2EA7148C2B3F22FBCA60E3F21 + +#include +#include + +namespace utf8 { + typedef uint32_t code_point; + enum { + InvalidCodePoint = static_cast(-1) + }; +} //namespace utf8 + +#endif diff --git a/src/utf8/iterator.hpp b/src/utf8/iterator.hpp index b51b2cb..2b253bf 100644 --- a/src/utf8/iterator.hpp +++ b/src/utf8/iterator.hpp @@ -31,102 +31,13 @@ DEALINGS IN THE SOFTWARE. #include namespace utf8 { - // Error policies for the iterator class - template - class range_policy_throw { - public: - range_policy_throw ( const range_policy_throw& ) = delete; - range_policy_throw ( void ) = delete; - range_policy_throw ( range_policy_throw&& ) = delete; - range_policy_throw& operator= ( const range_policy_throw& ) = delete; - - range_policy_throw ( const I& range_start, const I& range_end ) : - m_range_start(range_start), - m_range_end(range_end) - { - } - - void operator() ( const I& it ) const { - if (it < m_range_start || it > m_range_end) - throw std::out_of_range("Invalid utf-8 iterator position"); - } - void operator() ( const I& range_start, const I& range_end ) { - if (m_range_start != range_start || m_range_end != range_end) - throw std::logic_error("Comparing utf-8 iterators defined with different ranges"); - } - - private: - I m_range_start; - I m_range_end; - }; -// template -// class range_policy_assert { -// public: -// range_policy_assert ( -// static void check_in_range(const I& it, const I& range_start, const I& 
range_end) -// { -//#if defined(NDEBUG) -// (void)it; -// (void)range_start; -// (void)range_end; -//#else -// assert(it >= range_start && it <= range_end); -//#endif -// } -// static void check_same_range(const I& range_start_a, const I& range_start_b, const I& range_end_a, const I& range_end_b) -// { -//#if defined(NDEBUG) -// (void)range_start_a; -// (void)range_start_b; -// (void)range_end_a; -// (void)range_end_b; -//#else -// assert(range_start_a == range_start_b && range_end_a == range_end_b); -//#endif -// } -// }; - - template - struct utf_policy_replace { - C operator() ( C value ) const { - return static_cast('?'); - } - }; - - template struct utf_policy_throw; - template <> struct utf_policy_throw : private utf_policy_replace { - char operator() ( uint8_t value ) const { - throw utf8::invalid_utf8(value); - return utf_policy_replace::operator()(value); - } - }; - template <> struct utf_policy_throw : private utf_policy_replace { - char operator() ( uint16_t value ) const { - throw utf8::invalid_utf16(value); - return utf_policy_replace::operator()(value); - } - }; - template <> struct utf_policy_throw : private utf_policy_replace { - char operator() ( uint32_t value ) const { - throw utf8::invalid_code_point(value); - return utf_policy_replace::operator()(value); - } - }; - - template - struct utf_policy_assert : private utf_policy_replace { - char operator() ( uint8_t value ) const { - assert(false); - return utf_policy_replace::operator()(value); - } - }; - /// The iterator class + template < typename octet_iterator, - typename utf_error_policy=utf_policy_throw::value_type> + typename utf_error_policy=utf_policy_default > - class iterator : public std::iterator ::iterator_category, uint32_t> { + class iterator : public std::iterator ::iterator_category, typename std::iterator_traits::value_type, typename std::iterator_traits::difference_type, typename std::iterator_traits::pointer, typename std::iterator_traits::reference> { public: typedef typename 
std::iterator_traits::difference_type difference_type; @@ -144,43 +55,43 @@ namespace utf8 { ~iterator ( void ) noexcept(noexcept(std::declval().~octet_iterator())) { } octet_iterator base () const { return m_it; } - uint32_t operator* () const { - octet_iterator temp = m_it; - return utf8::next(temp); + const uint32_t& operator* () const { + return m_codepoint; } bool operator== (const iterator& rhs) const { - return (m_it == rhs.m_it); + return m_codepoint == rhs.m_codepoint && m_it == rhs.m_it; } bool operator!= (const iterator& rhs) const { return !(operator== (rhs)); } iterator& operator++ () { - utf8::next(m_it); + m_codepoint = utf8::next(m_it); return *this; } iterator operator++ (int) { iterator temp = *this; - utf8::next(m_it); + m_codepoint = utf8::next(m_it); return temp; } iterator& operator-- () { - utf8::prior(m_it); + m_codepoint = utf8::prior(m_it); return *this; } iterator operator-- (int) { iterator temp = *this; - utf8::prior(m_it); + m_codepoint = utf8::prior(m_it); return temp; } difference_type operator- (const iterator& rhs) const { return m_it - rhs.m_it; } iterator& operator+= (difference_type inc) { - utf8::advance(m_it, inc); + m_codepoint = utf8::advance(m_it, inc); } public: octet_iterator m_it; + uint32_t m_codepoint; }; // class iterator } // namespace utf8