Add a new exception class and a new set of policies.

Implements append(). This is an intermediate commit.
2014-06-30 21:54:23 +02:00 · 2014-06-30 21:54:23 +02:00 · aae1711e99
commit aae1711e99
parent 9713f6125c
5 changed files with 128 additions and 76 deletions
--- a/src/utf8/core.hpp
+++ b/src/utf8/core.hpp
@ -56,7 +56,7 @@ namespace utf8 {
 			return static_cast<uint8_t>(0xff & oc);
 		}
 		template<>
-		inline uint8_ mask8<uint8_t>(uint8_t oc) {
+		inline uint8_t mask8<uint8_t>(uint8_t oc) {
 			return oc;
 		}

@ -193,21 +193,6 @@ namespace utf8 {
 			octet_value_type faulty_part;
 		};

-		// Unicode constants
-		enum {
-			// Leading (high) surrogates: 0xd800 - 0xdbff
-			// Trailing (low) surrogates: 0xdc00 - 0xdfff
-			LEAD_SURROGATE_MIN  = 0xd800u,
-			LEAD_SURROGATE_MAX  = 0xdbffu,
-			TRAIL_SURROGATE_MIN = 0xdc00u,
-			TRAIL_SURROGATE_MAX = 0xdfffu,
-			LEAD_OFFSET		 = LEAD_SURROGATE_MIN - (0x10000 >> 10),
-			SURROGATE_OFFSET	= 0x10000u - (LEAD_SURROGATE_MIN << 10) - TRAIL_SURROGATE_MIN,
-
-			// Maximum valid value for a Unicode code point
-			CODE_POINT_MAX	  = 0x0010ffffu
-		};
-
 		template <typename u16>
 		inline bool is_lead_surrogate(u16 cp) {
 			return (cp >= LEAD_SURROGATE_MIN && cp <= LEAD_SURROGATE_MAX);
@ -220,12 +205,12 @@ namespace utf8 {

 		template <typename u16>
 		inline bool is_surrogate(u16 cp) {
-			return (cp >= LEAD_SURROGATE_MIN && cp <= TRAIL_SURROGATE_MAX);
+			return cp >= LEAD_SURROGATE_MIN && cp <= TRAIL_SURROGATE_MAX;
 		}

 		template <typename u32>
 		inline bool is_code_point_valid(u32 cp) {
-			return (cp <= CODE_POINT_MAX && !utf8::internal::is_surrogate(cp));
+			return cp <= CODE_POINT_MAX && not utf8::internal::is_surrogate(cp);
 		}

 		template <typename difference_type, typename octet_type>
@ -244,7 +229,7 @@ namespace utf8 {
 		}

 		template <typename octet_difference_type>
-		inline bool is_overlong_sequence(utf8::code_point cp, octet_difference_type length) {
+		bool is_overlong_sequence(utf8::code_point cp, octet_difference_type length) {
 			if (cp < 0x80) {
 				if (length != 1)
 					return true;
@ -262,47 +247,45 @@ namespace utf8 {
 		}
 	} // namespace internal

-	// Byte order mark
-	const uint8_t bom[] = {0xef, 0xbb, 0xbf};
-
 	template <typename octet_iterator>
 	octet_iterator find_invalid(octet_iterator start, octet_iterator end)
 	{
-		//octet_iterator result = start;
-		//while (result != end) {
-		//	utf8::internal::utf_error err_code = utf8::internal::validate_next(result);
-		//	if (err_code != internal::UTF8_OK)
-		//		return result;
-		//}
-		//return result;
-		//TODO: implement
+        typedef typename std::iterator_traits<octet_iterator>::difference_type octet_difference_type;
+		utf8::internal::SequenceReader<octet_iterator> seq;
+
+		while (start != end) {
+			const octet_difference_type length = utf8::internal::sequence_length<octet_difference_type>(*start);
+			const utf8::code_point cp = seq.get_sequence(start, length);
+			if (seq.has_error())
+				return start;
+			++start;
+		}
+
 		return start;
 	}

 	template <typename octet_iterator>
-	inline bool is_valid(octet_iterator start, octet_iterator end)
+	bool is_valid(octet_iterator start, octet_iterator end)
 	{
 		return (utf8::find_invalid(start, end) == end);
 	}

 	template <typename octet_iterator>
-	inline bool starts_with_bom (octet_iterator it, octet_iterator end)
-	{
+	bool starts_with_bom (octet_iterator it, octet_iterator end) {
 		return (
-			((it != end) && (utf8::internal::mask8(*it++)) == bom[0]) &&
-			((it != end) && (utf8::internal::mask8(*it++)) == bom[1]) &&
-			((it != end) && (utf8::internal::mask8(*it))   == bom[2])
+			it != end && utf8::internal::mask8(*it++) == BomByte0 &&
+			it != end && utf8::internal::mask8(*it++) == BomByte1 &&
+			it != end && utf8::internal::mask8(*it)   == BomByte2
 		   );
 	}

 	//Deprecated in release 2.3
 	template <typename octet_iterator>
-	inline bool is_bom (octet_iterator it)
-	{
+	bool is_bom (octet_iterator it) {
 		return (
-			(utf8::internal::mask8(*it++)) == bom[0] &&
-			(utf8::internal::mask8(*it++)) == bom[1] &&
-			(utf8::internal::mask8(*it))   == bom[2]
+			utf8::internal::mask8(*it++) == BomByte0 &&
+			utf8::internal::mask8(*it++) == BomByte1 &&
+			utf8::internal::mask8(*it)   == BomByte2
 		   );
 	}
 } // namespace utf8
--- a/src/utf8/error_policies.hpp
+++ b/src/utf8/error_policies.hpp
@ -45,8 +45,8 @@ namespace utf8 {
 	template <typename C>
 	struct utf_policy_throw : private utf_policy_replace<C> {
 		enum { is_safe = 1 };
-		uint8_t operator() ( utf8::code_point cp, C value, utf8::ErrorTypes ty, uint8_t faulty_pos ) const {
-			throw utf8::exception<C>(cp, ty, value, faulty_pos);
+		C operator() ( utf8::code_point cp, C value, utf8::ErrorTypes ty, uint8_t faulty_pos ) const {
+			throw utf8::utf_exception<C>(cp, ty, value, faulty_pos);
 			return utf_policy_replace<C>::operator()(value);
 		}
 	};
@ -76,12 +76,34 @@ namespace utf8 {
 	struct utf_policy_default : public utf_policy_throw<typename std::iterator_traits<I>::value_type> {
 	};

-	//namespace internal {
-	//	template <typename octet_iterator>
-	//	class utf_validation {
-	//	public:
-	//		void operator() ( octet_iterator it ) const;
-	//} //namespace internal
+	/// Error policy to protect against generically invalid codepoints, not tied to any specific utf encoding
+	///
+	/// This variant ignores the reported error: the error type is discarded
+	/// and the offending code point is handed back unchanged.
+	struct error_policy_ignore {
+		/// @param cp the code point that was flagged as invalid
+		/// @return cp, unmodified
+		code_point operator() ( code_point cp, ErrorTypes ) const {
+			return cp;
+		}
+	};
+
+	/// Error policy that unconditionally trips assert(false) whenever it is
+	/// invoked; the error type argument is not inspected.
+	struct error_policy_assert {
+		/// @param cp the code point that was flagged as invalid
+		/// @return cp, unmodified, if execution continues past the assertion
+		code_point operator() ( code_point cp, ErrorTypes ) const {
+			assert(false);
+			// NOTE(review): only reached when the assertion does not stop the
+			// program (presumably when assert is compiled out, e.g. NDEBUG --
+			// confirm which assert is in scope in this header).
+			return cp;
+		}
+	};
+
+	/// Error policy that substitutes a fixed code point for the invalid one.
+	///
+	/// @tparam R the replacement code point returned on every invocation
+	template <code_point R>
+	struct error_policy_replace {
+		/// Both arguments (offending code point and error type) are ignored.
+		/// @return R
+		code_point operator() ( code_point, ErrorTypes ) const {
+			return R;
+		}
+	};
+
+	/// Error policy that reports the error by throwing.
+	struct error_policy_throw {
+		/// Never returns normally: the declared code_point return type is
+		/// never produced because the body always throws.
+		/// @param cp  the code point that was flagged as invalid
+		/// @param typ the kind of error that was detected
+		/// @throws code_point_exception constructed from (cp, typ)
+		code_point operator() ( code_point cp, ErrorTypes typ ) const {
+			throw code_point_exception(cp, typ);
+		}
+	};
+
+	/// Default error policy: an alias for error_policy_throw, i.e. errors
+	/// are reported by throwing code_point_exception.
+	typedef error_policy_throw error_policy_default;
 } //namespace utf8

 #endif
--- a/src/utf8/exception.hpp
+++ b/src/utf8/exception.hpp
@ -60,7 +60,7 @@ namespace utf8 {
 	};

 	template <typename C>
-	class exception : public ::std::exception {
+	class utf_exception : public ::std::exception {
 	public:
 		enum UtfCategories {
 			UtfCategory_8 = 1,
@ -68,7 +68,7 @@ namespace utf8 {
 			UtfCategory_32
 		};

-		exception ( utf8::code_point codep, ErrorTypes err_type, C value, uint8_t faulty_pos ) :
+		utf_exception ( utf8::code_point codep, ErrorTypes err_type, C value, uint8_t faulty_pos ) :
 			cp(codep),
 			utf_category(static_cast<UtfCategories>(internal::GetMinBitSizeClass<sizeof(C)>::Result)),
 			error_type(err_type),
@ -102,6 +102,28 @@ namespace utf8 {
 		const C faulty_value;
 		const uint8_t faulty_part;
 	};
+
+	/// Exception reporting an invalid code point independently of any
+	/// particular UTF encoding. It stores the offending code point together
+	/// with the ErrorTypes value describing why it was rejected.
+	class code_point_exception : public ::std::exception {
+	public:
+		/// @param codep    the code point that triggered the error
+		/// @param err_type the category of the detected error
+		code_point_exception ( code_point codep, ErrorTypes err_type) :
+			cp(codep),
+			error_type(err_type)
+		{
+		}
+
+		/// @return a static, human-readable description of the error category;
+		///         "Unknown error" for categories without a dedicated message.
+		virtual const char* what ( void ) const noexcept {
+			// Dispatch on the stored error category, not on the code point:
+			// the case labels below are ErrorTypes enumerators, so switching
+			// on cp would pick a message based on the code point's numeric value.
+			switch (error_type) {
+			case ErrorType_InvalidLead: return "Invalid lead";
+			case ErrorType_OverlongSequence: return "Overlong sequence";
+			case ErrorType_InvalidCodePoint: return "Invalid codepoint";
+			default: return "Unknown error";
+			}
+		}
+
+	private:
+		const code_point cp;          ///< offending code point
+		const ErrorTypes error_type;  ///< category of the error
+	};
 } //namespace utf8

 #endif
--- a/src/utf8/functions.hpp
+++ b/src/utf8/functions.hpp
@ -35,39 +35,39 @@ DEALINGS IN THE SOFTWARE.
 #include "global.hpp"
 #include <iterator>

-namespace utf8
-{
+namespace utf8 {
    /// The library API - functions intended to be called by the users
-    template <typename octet_iterator>
-    octet_iterator append(utf8::code_point cp, octet_iterator result)
-    {
-		//TODO: implement
-        //if (!utf8::internal::is_code_point_valid(cp))
-        //    throw invalid_code_point(cp);
+    template <typename octet_iterator, typename error_policy=error_policy_default>
+    octet_iterator append(utf8::code_point cp, octet_iterator result) {
+        if (not utf8::internal::is_code_point_valid(cp)) {
+			error_policy()(cp, ErrorType_InvalidCodePoint);
+            return result;
+		}

-        //if (cp < 0x80)                        // one octet
-        //    *(result++) = static_cast<uint8_t>(cp);
-        //else if (cp < 0x800) {                // two octets
-        //    *(result++) = static_cast<uint8_t>((cp >> 6)            | 0xc0);
-        //    *(result++) = static_cast<uint8_t>((cp & 0x3f)          | 0x80);
-        //}
-        //else if (cp < 0x10000) {              // three octets
-        //    *(result++) = static_cast<uint8_t>((cp >> 12)           | 0xe0);
-        //    *(result++) = static_cast<uint8_t>(((cp >> 6) & 0x3f)   | 0x80);
-        //    *(result++) = static_cast<uint8_t>((cp & 0x3f)          | 0x80);
-        //}
-        //else {                                // four octets
-        //    *(result++) = static_cast<uint8_t>((cp >> 18)           | 0xf0);
-        //    *(result++) = static_cast<uint8_t>(((cp >> 12) & 0x3f)  | 0x80);
-        //    *(result++) = static_cast<uint8_t>(((cp >> 6) & 0x3f)   | 0x80);
-        //    *(result++) = static_cast<uint8_t>((cp & 0x3f)          | 0x80);
-        //}
+        if (cp < 0x80)                        // one octet
+            *(result++) = static_cast<uint8_t>(cp);
+        else if (cp < 0x800) {                // two octets
+            *(result++) = static_cast<uint8_t>((cp >> 6)            | 0xc0);
+            *(result++) = static_cast<uint8_t>((cp & 0x3f)          | 0x80);
+        }
+        else if (cp < 0x10000) {              // three octets
+            *(result++) = static_cast<uint8_t>((cp >> 12)           | 0xe0);
+            *(result++) = static_cast<uint8_t>(((cp >> 6) & 0x3f)   | 0x80);
+            *(result++) = static_cast<uint8_t>((cp & 0x3f)          | 0x80);
+        }
+        else {                                // four octets
+            *(result++) = static_cast<uint8_t>((cp >> 18)           | 0xf0);
+            *(result++) = static_cast<uint8_t>(((cp >> 12) & 0x3f)  | 0x80);
+            *(result++) = static_cast<uint8_t>(((cp >> 6) & 0x3f)   | 0x80);
+            *(result++) = static_cast<uint8_t>((cp & 0x3f)          | 0x80);
+        }
        return result;
    }

    template <typename octet_iterator, typename output_iterator>
    output_iterator replace_invalid(octet_iterator start, octet_iterator end, output_iterator out, utf8::code_point replacement)
    {
+		assert(false);
 		//TODO: implement
        //while (start != end) {
        //    octet_iterator sequence_start = start;
@ -170,6 +170,7 @@ namespace utf8
    template <typename u16bit_iterator, typename octet_iterator>
    octet_iterator utf16to8 (u16bit_iterator start, u16bit_iterator end, octet_iterator result)
    {
+		assert(false);
 		//TODO: implement
        //while (start != end) {
        //    utf8::code_point cp = utf8::internal::mask16(*start++);
--- a/src/utf8/global.hpp
+++ b/src/utf8/global.hpp
@ -35,6 +35,30 @@ namespace utf8 {
 	enum {
 		InvalidCodePoint = static_cast<code_point>(-1)
 	};
+
+	// Byte order mark
+	// The three octets of the BOM, in the order they appear in the stream;
+	// starts_with_bom() and is_bom() compare the first three input octets
+	// against BomByte0, BomByte1 and BomByte2 respectively.
+	enum {
+		BomByte0 = 0xef,
+		BomByte1 = 0xbb,
+		BomByte2 = 0xbf
+	};
+
+	namespace internal {
+		// Unicode constants
+		// Shared by the range checks is_lead_surrogate(), is_surrogate() and
+		// is_code_point_valid().
+		enum {
+			// Leading (high) surrogates: 0xd800 - 0xdbff
+			// Trailing (low) surrogates: 0xdc00 - 0xdfff
+			LEAD_SURROGATE_MIN  = 0xd800u,
+			LEAD_SURROGATE_MAX  = 0xdbffu,
+			TRAIL_SURROGATE_MIN = 0xdc00u,
+			TRAIL_SURROGATE_MAX = 0xdfffu,
+			// Precomputed offsets derived from the surrogate minima and 0x10000.
+			// NOTE(review): presumably used when converting between UTF-16
+			// surrogate pairs and code points -- no use is visible here; confirm.
+			LEAD_OFFSET		 = LEAD_SURROGATE_MIN - (0x10000 >> 10),
+			SURROGATE_OFFSET	= 0x10000u - (LEAD_SURROGATE_MIN << 10) - TRAIL_SURROGATE_MIN,
+
+			// Maximum valid value for a Unicode code point
+			CODE_POINT_MAX	  = 0x0010ffffu
+		};
+	} //namespace internal
 } //namespace utf8

 #endif