Add a new exception class and a new set of policies.
Implements append(). This is an intermediate commit.
This commit is contained in:
parent
9713f6125c
commit
aae1711e99
5 changed files with 128 additions and 76 deletions
|
@ -56,7 +56,7 @@ namespace utf8 {
|
|||
return static_cast<uint8_t>(0xff & oc);
|
||||
}
|
||||
template<>
|
||||
inline uint8_ mask8<uint8_t>(uint8_t oc) {
|
||||
inline uint8_t mask8<uint8_t>(uint8_t oc) {
|
||||
return oc;
|
||||
}
|
||||
|
||||
|
@ -193,21 +193,6 @@ namespace utf8 {
|
|||
octet_value_type faulty_part;
|
||||
};
|
||||
|
||||
// Unicode constants
|
||||
enum {
|
||||
// Leading (high) surrogates: 0xd800 - 0xdbff
|
||||
// Trailing (low) surrogates: 0xdc00 - 0xdfff
|
||||
LEAD_SURROGATE_MIN = 0xd800u,
|
||||
LEAD_SURROGATE_MAX = 0xdbffu,
|
||||
TRAIL_SURROGATE_MIN = 0xdc00u,
|
||||
TRAIL_SURROGATE_MAX = 0xdfffu,
|
||||
LEAD_OFFSET = LEAD_SURROGATE_MIN - (0x10000 >> 10),
|
||||
SURROGATE_OFFSET = 0x10000u - (LEAD_SURROGATE_MIN << 10) - TRAIL_SURROGATE_MIN,
|
||||
|
||||
// Maximum valid value for a Unicode code point
|
||||
CODE_POINT_MAX = 0x0010ffffu
|
||||
};
|
||||
|
||||
template <typename u16>
|
||||
inline bool is_lead_surrogate(u16 cp) {
|
||||
return (cp >= LEAD_SURROGATE_MIN && cp <= LEAD_SURROGATE_MAX);
|
||||
|
@ -220,12 +205,12 @@ namespace utf8 {
|
|||
|
||||
template <typename u16>
|
||||
inline bool is_surrogate(u16 cp) {
|
||||
return (cp >= LEAD_SURROGATE_MIN && cp <= TRAIL_SURROGATE_MAX);
|
||||
return cp >= LEAD_SURROGATE_MIN && cp <= TRAIL_SURROGATE_MAX;
|
||||
}
|
||||
|
||||
template <typename u32>
|
||||
inline bool is_code_point_valid(u32 cp) {
|
||||
return (cp <= CODE_POINT_MAX && !utf8::internal::is_surrogate(cp));
|
||||
return cp <= CODE_POINT_MAX && not utf8::internal::is_surrogate(cp);
|
||||
}
|
||||
|
||||
template <typename difference_type, typename octet_type>
|
||||
|
@ -244,7 +229,7 @@ namespace utf8 {
|
|||
}
|
||||
|
||||
template <typename octet_difference_type>
|
||||
inline bool is_overlong_sequence(utf8::code_point cp, octet_difference_type length) {
|
||||
bool is_overlong_sequence(utf8::code_point cp, octet_difference_type length) {
|
||||
if (cp < 0x80) {
|
||||
if (length != 1)
|
||||
return true;
|
||||
|
@ -262,47 +247,45 @@ namespace utf8 {
|
|||
}
|
||||
} // namespace internal
|
||||
|
||||
// Byte order mark
|
||||
const uint8_t bom[] = {0xef, 0xbb, 0xbf};
|
||||
|
||||
template <typename octet_iterator>
|
||||
octet_iterator find_invalid(octet_iterator start, octet_iterator end)
|
||||
{
|
||||
//octet_iterator result = start;
|
||||
//while (result != end) {
|
||||
// utf8::internal::utf_error err_code = utf8::internal::validate_next(result);
|
||||
// if (err_code != internal::UTF8_OK)
|
||||
// return result;
|
||||
//}
|
||||
//return result;
|
||||
//TODO: implement
|
||||
typedef typename std::iterator_traits<octet_iterator>::difference_type octet_difference_type;
|
||||
utf8::internal::SequenceReader<octet_iterator> seq;
|
||||
|
||||
while (start != end) {
|
||||
const octet_difference_type length = utf8::internal::sequence_length<octet_difference_type>(*start);
|
||||
const utf8::code_point cp = seq.get_sequence(start, length);
|
||||
if (seq.has_error())
|
||||
return start;
|
||||
++start;
|
||||
}
|
||||
|
||||
return start;
|
||||
}
|
||||
|
||||
template <typename octet_iterator>
|
||||
inline bool is_valid(octet_iterator start, octet_iterator end)
|
||||
bool is_valid(octet_iterator start, octet_iterator end)
|
||||
{
|
||||
return (utf8::find_invalid(start, end) == end);
|
||||
}
|
||||
|
||||
template <typename octet_iterator>
|
||||
inline bool starts_with_bom (octet_iterator it, octet_iterator end)
|
||||
{
|
||||
bool starts_with_bom (octet_iterator it, octet_iterator end) {
|
||||
return (
|
||||
((it != end) && (utf8::internal::mask8(*it++)) == bom[0]) &&
|
||||
((it != end) && (utf8::internal::mask8(*it++)) == bom[1]) &&
|
||||
((it != end) && (utf8::internal::mask8(*it)) == bom[2])
|
||||
it != end && utf8::internal::mask8(*it++) == BomByte0 &&
|
||||
it != end && utf8::internal::mask8(*it++) == BomByte1 &&
|
||||
it != end && utf8::internal::mask8(*it) == BomByte2
|
||||
);
|
||||
}
|
||||
|
||||
//Deprecated in release 2.3
|
||||
template <typename octet_iterator>
|
||||
inline bool is_bom (octet_iterator it)
|
||||
{
|
||||
bool is_bom (octet_iterator it) {
|
||||
return (
|
||||
(utf8::internal::mask8(*it++)) == bom[0] &&
|
||||
(utf8::internal::mask8(*it++)) == bom[1] &&
|
||||
(utf8::internal::mask8(*it)) == bom[2]
|
||||
utf8::internal::mask8(*it++) == BomByte0 &&
|
||||
utf8::internal::mask8(*it++) == BomByte1 &&
|
||||
utf8::internal::mask8(*it) == BomByte2
|
||||
);
|
||||
}
|
||||
} // namespace utf8
|
||||
|
|
|
@ -45,8 +45,8 @@ namespace utf8 {
|
|||
template <typename C>
|
||||
struct utf_policy_throw : private utf_policy_replace<C> {
|
||||
enum { is_safe = 1 };
|
||||
uint8_t operator() ( utf8::code_point cp, C value, utf8::ErrorTypes ty, uint8_t faulty_pos ) const {
|
||||
throw utf8::exception<C>(cp, ty, value, faulty_pos);
|
||||
C operator() ( utf8::code_point cp, C value, utf8::ErrorTypes ty, uint8_t faulty_pos ) const {
|
||||
throw utf8::utf_exception<C>(cp, ty, value, faulty_pos);
|
||||
return utf_policy_replace<C>::operator()(value);
|
||||
}
|
||||
};
|
||||
|
@ -76,12 +76,34 @@ namespace utf8 {
|
|||
struct utf_policy_default : public utf_policy_throw<typename std::iterator_traits<I>::value_type> {
|
||||
};
|
||||
|
||||
//namespace internal {
|
||||
// template <typename octet_iterator>
|
||||
// class utf_validation {
|
||||
// public:
|
||||
// void operator() ( octet_iterator it ) const;
|
||||
//} //namespace internal
|
||||
/// Error policy to protect against generically invalid codepoints, not tied to any specific utf encoding
|
||||
struct error_policy_ignore {
|
||||
code_point operator() ( code_point cp, ErrorTypes ) const {
|
||||
return cp;
|
||||
}
|
||||
};
|
||||
|
||||
struct error_policy_assert {
|
||||
code_point operator() ( code_point cp, ErrorTypes ) const {
|
||||
assert(false);
|
||||
return cp;
|
||||
}
|
||||
};
|
||||
|
||||
template <code_point R>
|
||||
struct error_policy_replace {
|
||||
code_point operator() ( code_point, ErrorTypes ) const {
|
||||
return R;
|
||||
}
|
||||
};
|
||||
|
||||
struct error_policy_throw {
|
||||
code_point operator() ( code_point cp, ErrorTypes typ ) const {
|
||||
throw code_point_exception(cp, typ);
|
||||
}
|
||||
};
|
||||
|
||||
typedef error_policy_throw error_policy_default;
|
||||
} //namespace utf8
|
||||
|
||||
#endif
|
||||
|
|
|
@ -60,7 +60,7 @@ namespace utf8 {
|
|||
};
|
||||
|
||||
template <typename C>
|
||||
class exception : public ::std::exception {
|
||||
class utf_exception : public ::std::exception {
|
||||
public:
|
||||
enum UtfCategories {
|
||||
UtfCategory_8 = 1,
|
||||
|
@ -68,7 +68,7 @@ namespace utf8 {
|
|||
UtfCategory_32
|
||||
};
|
||||
|
||||
exception ( utf8::code_point codep, ErrorTypes err_type, C value, uint8_t faulty_pos ) :
|
||||
utf_exception ( utf8::code_point codep, ErrorTypes err_type, C value, uint8_t faulty_pos ) :
|
||||
cp(codep),
|
||||
utf_category(static_cast<UtfCategories>(internal::GetMinBitSizeClass<sizeof(C)>::Result)),
|
||||
error_type(err_type),
|
||||
|
@ -102,6 +102,28 @@ namespace utf8 {
|
|||
const C faulty_value;
|
||||
const uint8_t faulty_part;
|
||||
};
|
||||
|
||||
class code_point_exception : public ::std::exception {
|
||||
public:
|
||||
code_point_exception ( code_point codep, ErrorTypes err_type) :
|
||||
cp(codep),
|
||||
error_type(err_type)
|
||||
{
|
||||
}
|
||||
|
||||
virtual const char* what ( void ) const noexcept {
|
||||
switch (cp) {
|
||||
case ErrorType_InvalidLead: return "Invalid lead";
|
||||
case ErrorType_OverlongSequence: return "Overlong sequence";
|
||||
case ErrorType_InvalidCodePoint: return "Invalid codepoint";
|
||||
default: return "Unknown error";
|
||||
}
|
||||
}
|
||||
|
||||
private:
|
||||
const code_point cp;
|
||||
const ErrorTypes error_type;
|
||||
};
|
||||
} //namespace utf8
|
||||
|
||||
#endif
|
||||
|
|
|
@ -35,39 +35,39 @@ DEALINGS IN THE SOFTWARE.
|
|||
#include "global.hpp"
|
||||
#include <iterator>
|
||||
|
||||
namespace utf8
|
||||
{
|
||||
namespace utf8 {
|
||||
/// The library API - functions intended to be called by the users
|
||||
template <typename octet_iterator>
|
||||
octet_iterator append(utf8::code_point cp, octet_iterator result)
|
||||
{
|
||||
//TODO: implement
|
||||
//if (!utf8::internal::is_code_point_valid(cp))
|
||||
// throw invalid_code_point(cp);
|
||||
template <typename octet_iterator, typename error_policy=error_policy_default>
|
||||
octet_iterator append(utf8::code_point cp, octet_iterator result) {
|
||||
if (not utf8::internal::is_code_point_valid(cp)) {
|
||||
error_policy()(cp, ErrorType_InvalidCodePoint);
|
||||
return result;
|
||||
}
|
||||
|
||||
//if (cp < 0x80) // one octet
|
||||
// *(result++) = static_cast<uint8_t>(cp);
|
||||
//else if (cp < 0x800) { // two octets
|
||||
// *(result++) = static_cast<uint8_t>((cp >> 6) | 0xc0);
|
||||
// *(result++) = static_cast<uint8_t>((cp & 0x3f) | 0x80);
|
||||
//}
|
||||
//else if (cp < 0x10000) { // three octets
|
||||
// *(result++) = static_cast<uint8_t>((cp >> 12) | 0xe0);
|
||||
// *(result++) = static_cast<uint8_t>(((cp >> 6) & 0x3f) | 0x80);
|
||||
// *(result++) = static_cast<uint8_t>((cp & 0x3f) | 0x80);
|
||||
//}
|
||||
//else { // four octets
|
||||
// *(result++) = static_cast<uint8_t>((cp >> 18) | 0xf0);
|
||||
// *(result++) = static_cast<uint8_t>(((cp >> 12) & 0x3f) | 0x80);
|
||||
// *(result++) = static_cast<uint8_t>(((cp >> 6) & 0x3f) | 0x80);
|
||||
// *(result++) = static_cast<uint8_t>((cp & 0x3f) | 0x80);
|
||||
//}
|
||||
if (cp < 0x80) // one octet
|
||||
*(result++) = static_cast<uint8_t>(cp);
|
||||
else if (cp < 0x800) { // two octets
|
||||
*(result++) = static_cast<uint8_t>((cp >> 6) | 0xc0);
|
||||
*(result++) = static_cast<uint8_t>((cp & 0x3f) | 0x80);
|
||||
}
|
||||
else if (cp < 0x10000) { // three octets
|
||||
*(result++) = static_cast<uint8_t>((cp >> 12) | 0xe0);
|
||||
*(result++) = static_cast<uint8_t>(((cp >> 6) & 0x3f) | 0x80);
|
||||
*(result++) = static_cast<uint8_t>((cp & 0x3f) | 0x80);
|
||||
}
|
||||
else { // four octets
|
||||
*(result++) = static_cast<uint8_t>((cp >> 18) | 0xf0);
|
||||
*(result++) = static_cast<uint8_t>(((cp >> 12) & 0x3f) | 0x80);
|
||||
*(result++) = static_cast<uint8_t>(((cp >> 6) & 0x3f) | 0x80);
|
||||
*(result++) = static_cast<uint8_t>((cp & 0x3f) | 0x80);
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
template <typename octet_iterator, typename output_iterator>
|
||||
output_iterator replace_invalid(octet_iterator start, octet_iterator end, output_iterator out, utf8::code_point replacement)
|
||||
{
|
||||
assert(false);
|
||||
//TODO: implement
|
||||
//while (start != end) {
|
||||
// octet_iterator sequence_start = start;
|
||||
|
@ -170,6 +170,7 @@ namespace utf8
|
|||
template <typename u16bit_iterator, typename octet_iterator>
|
||||
octet_iterator utf16to8 (u16bit_iterator start, u16bit_iterator end, octet_iterator result)
|
||||
{
|
||||
assert(false);
|
||||
//TODO: implement
|
||||
//while (start != end) {
|
||||
// utf8::code_point cp = utf8::internal::mask16(*start++);
|
||||
|
|
|
@ -35,6 +35,30 @@ namespace utf8 {
|
|||
enum {
|
||||
InvalidCodePoint = static_cast<code_point>(-1)
|
||||
};
|
||||
|
||||
// Byte order mark
|
||||
enum {
|
||||
BomByte0 = 0xef,
|
||||
BomByte1 = 0xbb,
|
||||
BomByte2 = 0xbf
|
||||
};
|
||||
|
||||
namespace internal {
|
||||
// Unicode constants
|
||||
enum {
|
||||
// Leading (high) surrogates: 0xd800 - 0xdbff
|
||||
// Trailing (low) surrogates: 0xdc00 - 0xdfff
|
||||
LEAD_SURROGATE_MIN = 0xd800u,
|
||||
LEAD_SURROGATE_MAX = 0xdbffu,
|
||||
TRAIL_SURROGATE_MIN = 0xdc00u,
|
||||
TRAIL_SURROGATE_MAX = 0xdfffu,
|
||||
LEAD_OFFSET = LEAD_SURROGATE_MIN - (0x10000 >> 10),
|
||||
SURROGATE_OFFSET = 0x10000u - (LEAD_SURROGATE_MIN << 10) - TRAIL_SURROGATE_MIN,
|
||||
|
||||
// Maximum valid value for a Unicode code point
|
||||
CODE_POINT_MAX = 0x0010ffffu
|
||||
};
|
||||
} //namespace internal
|
||||
} //namespace utf8
|
||||
|
||||
#endif
|
||||
|
|
Loading…
Reference in a new issue