Add a new exception class and a new set of policies.

Implements append().
This is an intermediate commit.
This commit is contained in:
King_DuckZ 2014-06-30 21:54:23 +02:00
parent 9713f6125c
commit aae1711e99
5 changed files with 128 additions and 76 deletions

View file

@ -56,7 +56,7 @@ namespace utf8 {
return static_cast<uint8_t>(0xff & oc);
}
template<>
inline uint8_ mask8<uint8_t>(uint8_t oc) {
inline uint8_t mask8<uint8_t>(uint8_t oc) {
return oc;
}
@ -193,21 +193,6 @@ namespace utf8 {
octet_value_type faulty_part;
};
// Unicode constants
enum {
// Leading (high) surrogates: 0xd800 - 0xdbff
// Trailing (low) surrogates: 0xdc00 - 0xdfff
LEAD_SURROGATE_MIN = 0xd800u,
LEAD_SURROGATE_MAX = 0xdbffu,
TRAIL_SURROGATE_MIN = 0xdc00u,
TRAIL_SURROGATE_MAX = 0xdfffu,
LEAD_OFFSET = LEAD_SURROGATE_MIN - (0x10000 >> 10),
SURROGATE_OFFSET = 0x10000u - (LEAD_SURROGATE_MIN << 10) - TRAIL_SURROGATE_MIN,
// Maximum valid value for a Unicode code point
CODE_POINT_MAX = 0x0010ffffu
};
template <typename u16>
inline bool is_lead_surrogate(u16 cp) {
return (cp >= LEAD_SURROGATE_MIN && cp <= LEAD_SURROGATE_MAX);
@ -220,12 +205,12 @@ namespace utf8 {
template <typename u16>
inline bool is_surrogate(u16 cp) {
return (cp >= LEAD_SURROGATE_MIN && cp <= TRAIL_SURROGATE_MAX);
return cp >= LEAD_SURROGATE_MIN && cp <= TRAIL_SURROGATE_MAX;
}
template <typename u32>
inline bool is_code_point_valid(u32 cp) {
return (cp <= CODE_POINT_MAX && !utf8::internal::is_surrogate(cp));
return cp <= CODE_POINT_MAX && not utf8::internal::is_surrogate(cp);
}
template <typename difference_type, typename octet_type>
@ -244,7 +229,7 @@ namespace utf8 {
}
template <typename octet_difference_type>
inline bool is_overlong_sequence(utf8::code_point cp, octet_difference_type length) {
bool is_overlong_sequence(utf8::code_point cp, octet_difference_type length) {
if (cp < 0x80) {
if (length != 1)
return true;
@ -262,47 +247,45 @@ namespace utf8 {
}
} // namespace internal
// Byte order mark
const uint8_t bom[] = {0xef, 0xbb, 0xbf};
/// Walks the octet range [start, end) looking for the first sequence that the
/// sequence reader flags as erroneous.
///
/// At every step the expected sequence length is derived from the current
/// octet via utf8::internal::sequence_length, then the sequence is read
/// through utf8::internal::SequenceReader::get_sequence.
///
/// @param start iterator to the first octet to inspect
/// @param end   past-the-end iterator of the range
/// @return the value of `start` at the moment the reader reports an error, or
///         `end` when the loop runs to completion without one
template <typename octet_iterator>
octet_iterator find_invalid(octet_iterator start, octet_iterator end)
{
// Previous validate_next()-based draft, kept commented out:
//octet_iterator result = start;
//while (result != end) {
// utf8::internal::utf_error err_code = utf8::internal::validate_next(result);
// if (err_code != internal::UTF8_OK)
// return result;
//}
//return result;
//TODO: implement
typedef typename std::iterator_traits<octet_iterator>::difference_type octet_difference_type;
utf8::internal::SequenceReader<octet_iterator> seq;
while (start != end) {
const octet_difference_type length = utf8::internal::sequence_length<octet_difference_type>(*start);
// The decoded value is not used below; only the reader's error state is checked.
const utf8::code_point cp = seq.get_sequence(start, length);
if (seq.has_error())
return start;
// NOTE(review): whether get_sequence() advances `start` is not visible here.
// If it takes the iterator by value, this steps one octet instead of `length`;
// if it takes it by reference, this skips one extra octet. Confirm against SequenceReader.
++start;
}
return start;
}
template <typename octet_iterator>
inline bool is_valid(octet_iterator start, octet_iterator end)
bool is_valid(octet_iterator start, octet_iterator end)
{
return (utf8::find_invalid(start, end) == end);
}
template <typename octet_iterator>
inline bool starts_with_bom (octet_iterator it, octet_iterator end)
{
bool starts_with_bom (octet_iterator it, octet_iterator end) {
return (
((it != end) && (utf8::internal::mask8(*it++)) == bom[0]) &&
((it != end) && (utf8::internal::mask8(*it++)) == bom[1]) &&
((it != end) && (utf8::internal::mask8(*it)) == bom[2])
it != end && utf8::internal::mask8(*it++) == BomByte0 &&
it != end && utf8::internal::mask8(*it++) == BomByte1 &&
it != end && utf8::internal::mask8(*it) == BomByte2
);
}
//Deprecated in release 2.3
template <typename octet_iterator>
inline bool is_bom (octet_iterator it)
{
bool is_bom (octet_iterator it) {
return (
(utf8::internal::mask8(*it++)) == bom[0] &&
(utf8::internal::mask8(*it++)) == bom[1] &&
(utf8::internal::mask8(*it)) == bom[2]
utf8::internal::mask8(*it++) == BomByte0 &&
utf8::internal::mask8(*it++) == BomByte1 &&
utf8::internal::mask8(*it) == BomByte2
);
}
} // namespace utf8

View file

@ -45,8 +45,8 @@ namespace utf8 {
template <typename C>
struct utf_policy_throw : private utf_policy_replace<C> {
enum { is_safe = 1 };
uint8_t operator() ( utf8::code_point cp, C value, utf8::ErrorTypes ty, uint8_t faulty_pos ) const {
throw utf8::exception<C>(cp, ty, value, faulty_pos);
C operator() ( utf8::code_point cp, C value, utf8::ErrorTypes ty, uint8_t faulty_pos ) const {
throw utf8::utf_exception<C>(cp, ty, value, faulty_pos);
return utf_policy_replace<C>::operator()(value);
}
};
@ -76,12 +76,34 @@ namespace utf8 {
struct utf_policy_default : public utf_policy_throw<typename std::iterator_traits<I>::value_type> {
};
//namespace internal {
// template <typename octet_iterator>
// class utf_validation {
// public:
// void operator() ( octet_iterator it ) const;
//} //namespace internal
/// Error policies protecting against generically invalid code points; none of
/// them is tied to a specific utf encoding.

/// Policy that tolerates the error: the offending code point is handed back
/// untouched and the error kind is disregarded.
struct error_policy_ignore {
	code_point operator() ( code_point offending, ErrorTypes /*kind*/ ) const {
		return offending;
	}
};
/// Policy that treats an invalid code point as a programming error: it trips
/// an unconditional assert. When assertions are compiled out the offending
/// code point is returned unchanged.
struct error_policy_assert {
	code_point operator() ( code_point offending, ErrorTypes /*kind*/ ) const {
		assert(false);
		return offending;
	}
};
/// Policy that answers every error with the fixed code point R, regardless of
/// the offending value or the error kind.
/// @tparam R code point returned in place of the invalid one
template <code_point R>
struct error_policy_replace {
	code_point operator() ( code_point /*offending*/, ErrorTypes /*kind*/ ) const {
		return R;
	}
};
/// Policy that reports the error by throwing a code_point_exception built from
/// the offending code point and the error kind. It never returns normally.
struct error_policy_throw {
	code_point operator() ( code_point offending, ErrorTypes kind ) const {
		throw code_point_exception(offending, kind);
	}
};
/// Policy selected when none is named explicitly (it is the default
/// error_policy template argument of append()): invalid code points throw.
typedef error_policy_throw error_policy_default;
} //namespace utf8
#endif

View file

@ -60,7 +60,7 @@ namespace utf8 {
};
template <typename C>
class exception : public ::std::exception {
class utf_exception : public ::std::exception {
public:
enum UtfCategories {
UtfCategory_8 = 1,
@ -68,7 +68,7 @@ namespace utf8 {
UtfCategory_32
};
exception ( utf8::code_point codep, ErrorTypes err_type, C value, uint8_t faulty_pos ) :
utf_exception ( utf8::code_point codep, ErrorTypes err_type, C value, uint8_t faulty_pos ) :
cp(codep),
utf_category(static_cast<UtfCategories>(internal::GetMinBitSizeClass<sizeof(C)>::Result)),
error_type(err_type),
@ -102,6 +102,28 @@ namespace utf8 {
const C faulty_value;
const uint8_t faulty_part;
};
class code_point_exception : public ::std::exception {
public:
code_point_exception ( code_point codep, ErrorTypes err_type) :
cp(codep),
error_type(err_type)
{
}
virtual const char* what ( void ) const noexcept {
switch (cp) {
case ErrorType_InvalidLead: return "Invalid lead";
case ErrorType_OverlongSequence: return "Overlong sequence";
case ErrorType_InvalidCodePoint: return "Invalid codepoint";
default: return "Unknown error";
}
}
private:
const code_point cp;
const ErrorTypes error_type;
};
} //namespace utf8
#endif

View file

@ -35,39 +35,39 @@ DEALINGS IN THE SOFTWARE.
#include "global.hpp"
#include <iterator>
namespace utf8
{
namespace utf8 {
/// The library API - functions intended to be called by the users
template <typename octet_iterator>
octet_iterator append(utf8::code_point cp, octet_iterator result)
{
//TODO: implement
//if (!utf8::internal::is_code_point_valid(cp))
// throw invalid_code_point(cp);
/// Encodes a code point as UTF-8 and writes the resulting octets to `result`.
///
/// @tparam octet_iterator output iterator receiving the encoded octets
/// @tparam error_policy   functor invoked as
///                        error_policy()(cp, ErrorType_InvalidCodePoint) when
///                        `cp` fails is_code_point_valid(); it may throw,
///                        assert, or return a code point to encode instead
/// @param cp     code point to encode
/// @param result position to start writing at
/// @return iterator just past the last octet written, or `result` unchanged
///         when nothing was written
template <typename octet_iterator, typename error_policy=error_policy_default>
octet_iterator append(utf8::code_point cp, octet_iterator result) {
	if (not utf8::internal::is_code_point_valid(cp)) {
		// Use the value handed back by the policy: discarding it would make a
		// replacing policy indistinguishable from an ignoring one.
		cp = error_policy()(cp, ErrorType_InvalidCodePoint);
		// A policy that returns the offending value (or another invalid one)
		// still results in nothing being written.
		if (not utf8::internal::is_code_point_valid(cp))
			return result;
	}

	if (cp < 0x80) {                 // one octet: 0xxxxxxx
		*(result++) = static_cast<uint8_t>(cp);
	}
	else if (cp < 0x800) {           // two octets: 110xxxxx 10xxxxxx
		*(result++) = static_cast<uint8_t>((cp >> 6) | 0xc0);
		*(result++) = static_cast<uint8_t>((cp & 0x3f) | 0x80);
	}
	else if (cp < 0x10000) {         // three octets: 1110xxxx 10xxxxxx 10xxxxxx
		*(result++) = static_cast<uint8_t>((cp >> 12) | 0xe0);
		*(result++) = static_cast<uint8_t>(((cp >> 6) & 0x3f) | 0x80);
		*(result++) = static_cast<uint8_t>((cp & 0x3f) | 0x80);
	}
	else {                           // four octets: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
		*(result++) = static_cast<uint8_t>((cp >> 18) | 0xf0);
		*(result++) = static_cast<uint8_t>(((cp >> 12) & 0x3f) | 0x80);
		*(result++) = static_cast<uint8_t>(((cp >> 6) & 0x3f) | 0x80);
		*(result++) = static_cast<uint8_t>((cp & 0x3f) | 0x80);
	}
	return result;
}
template <typename octet_iterator, typename output_iterator>
output_iterator replace_invalid(octet_iterator start, octet_iterator end, output_iterator out, utf8::code_point replacement)
{
assert(false);
//TODO: implement
//while (start != end) {
// octet_iterator sequence_start = start;
@ -170,6 +170,7 @@ namespace utf8
template <typename u16bit_iterator, typename octet_iterator>
octet_iterator utf16to8 (u16bit_iterator start, u16bit_iterator end, octet_iterator result)
{
assert(false);
//TODO: implement
//while (start != end) {
// utf8::code_point cp = utf8::internal::mask16(*start++);

View file

@ -35,6 +35,30 @@ namespace utf8 {
// InvalidCodePoint is the value -1 converted to code_point.
// NOTE(review): presumably a marker for "no valid code point"; its users are not visible here.
enum {
InvalidCodePoint = static_cast<code_point>(-1)
};
// Byte order mark
// The three octets of the mark, in order. starts_with_bom() and is_bom()
// compare them one by one against the first three octets of the input.
enum {
BomByte0 = 0xef,
BomByte1 = 0xbb,
BomByte2 = 0xbf
};
namespace internal {
// Unicode constants
// Used by the surrogate and code point validity predicates (is_lead_surrogate,
// is_surrogate, is_code_point_valid).
enum {
// Leading (high) surrogates: 0xd800 - 0xdbff
// Trailing (low) surrogates: 0xdc00 - 0xdfff
LEAD_SURROGATE_MIN = 0xd800u,
LEAD_SURROGATE_MAX = 0xdbffu,
TRAIL_SURROGATE_MIN = 0xdc00u,
TRAIL_SURROGATE_MAX = 0xdfffu,
// 0xd800 - 0x40
// NOTE(review): LEAD_OFFSET and SURROGATE_OFFSET are presumably for UTF-16 surrogate pair conversion; their users are not visible here.
LEAD_OFFSET = LEAD_SURROGATE_MIN - (0x10000 >> 10),
// Built from unsigned literals, so the subtraction is expected to wrap around instead of going negative.
SURROGATE_OFFSET = 0x10000u - (LEAD_SURROGATE_MIN << 10) - TRAIL_SURROGATE_MIN,
// Maximum valid value for a Unicode code point
CODE_POINT_MAX = 0x0010ffffu
};
} //namespace internal
} //namespace utf8
#endif