diff --git a/doc/utf8cpp.html b/doc/utf8cpp.html index d97edf4..e6ba254 100644 --- a/doc/utf8cpp.html +++ b/doc/utf8cpp.html @@ -324,7 +324,7 @@ assert (utf16result[3] == 0xdd1e);

In case of an invalid UTF-8 seqence, a utf8::invalid_utf8 exception is thrown. If -last does not point to the past-of-end of a UTF-8 +end does not point to the past-of-end of a UTF-8 seqence, a utf8::not_enough_room exception is thrown.

utf8::utf32to8

@@ -373,7 +373,7 @@ assert (utf32result.size() == 2);

In case of an invalid UTF-8 seqence, a utf8::invalid_utf8 exception is thrown. If -last does not point to the past-of-end of a UTF-8 +end does not point to the past-of-end of a UTF-8 seqence, a utf8::not_enough_room exception is thrown.

utf8::find_invalid

@@ -421,12 +421,49 @@ assert (bvalid == false); find_invalid(start, end) == end;. You may want to use it to make sure that a byte seqence is a valid UTF-8 string without the need to know where it fails if it is not valid.

+

utf8::replace_invalid

+

Replaces all invalid UTF-8 sequences within a string with a replacement marker.

+

template <typename octet_iterator, typename output_iterator> +output_iterator replace_invalid(octet_iterator start, octet_iterator end, output_iterator out, uint32_t replacement); +

+

template <typename octet_iterator, typename output_iterator> +output_iterator replace_invalid(octet_iterator start, octet_iterator end, output_iterator out); +

+

+start: an iterator pointing to the beginning of the +UTF-8 string to look for invalid UTF-8 sequences.
+end: an iterator pointing to past-the-end of the UTF-8 +string to look for invalid UTF-8 sequences.
+out: An output iterator to the range where the result of replacement is stored.
+replacement: A Unicode code point for the replacement marker. The version without this +parameter assumes the value 0xfffd.
+Return value: An iterator pointing to the place after the UTF-8 string with +replaced invalid sequences.

+

Example of use:

+
+char invalid_sequence[] = "a\x80\xe0\xa0\xc0\xaf\xed\xa0\x80z";
+vector<char> replace_invalid_result;
+
+replace_invalid (invalid_sequence, invalid_sequence + sizeof(invalid_sequence), back_inserter(replace_invalid_result), '?');
+
+bvalid = is_valid(replace_invalid_result.begin(), replace_invalid_result.end());
+assert (bvalid);
+char* fixed_invalid_sequence = "a????z";
+assert (std::equal(replace_invalid_result.begin(), replace_invalid_result.end(), fixed_invalid_sequence));
+
+

+replace_invalid does not perform in-place replacement of invalid sequences. Rather, it produces a copy +of the original string with the invalid sequences replaced with a replacement marker. Therefore, out must +not be in the [start, end] range. +

+

If end does not point to the past-of-end of a UTF-8 sequence, a utf8::not_enough_room +exception is thrown.

utf8::is_bom

Checks whether a sequence of three octets is a UTF-8 byte order mark (BOM)

template <typename octet_iterator> bool is_bom (octet_iterator it); -

it Beginning of the 3-octet sequence to check
+

it: beginning of the 3-octet sequence to check
Return value: true if the sequence is UTF-8 byte order mark; false if not.

Example of use:

diff --git a/source/utf8.h b/source/utf8.h
index 286670c..51543bc 100644
--- a/source/utf8.h
+++ b/source/utf8.h
@@ -136,7 +136,7 @@ namespace internal
     enum utf_error {OK, NOT_ENOUGH_ROOM, INVALID_LEAD, INCOMPLETE_SEQUENCE, OVERLONG_SEQUENCE, INVALID_CODE_POINT};
 
     template <typename octet_iterator>
-    utf_error validate_next(octet_iterator& it, octet_iterator end, uint32_t* code_point = 0)
+    utf_error validate_next(octet_iterator& it, octet_iterator end, uint32_t* code_point)
     {
         uint32_t cp = mask8(*it);
         // Check the lead octet
@@ -249,6 +249,11 @@ namespace internal
         return OK;
     }
 
+    template <typename octet_iterator>
+    inline utf_error validate_next(octet_iterator& it, octet_iterator end) {
+        return validate_next(it, end, 0);
+    }
+
 } // namespace internal
 
     /// The library API - functions intended to be called by the users
@@ -256,6 +261,50 @@ namespace internal
     // Byte order mark
     const uint8_t bom[] = {0xef, 0xbb, 0xbf};
 
+    /// Copies the octets of [start, end) to out, emitting `replacement` through
+    /// append() in place of every sequence that validate_next rejects.
+    /// Returns the output iterator as it stands after the last write.
+    /// Throws not_enough_room when validate_next reports NOT_ENOUGH_ROOM.
+    template <typename octet_iterator, typename output_iterator>
+    output_iterator replace_invalid(octet_iterator start, octet_iterator end, output_iterator out, uint32_t replacement)
+    {
+        while (start != end) {
+            octet_iterator sequence_start = start;
+            internal::utf_error err_code = internal::validate_next(start, end);
+            switch (err_code) {
+                case internal::OK :
+                    for (octet_iterator it = sequence_start; it != start; ++it)
+                        *out++ = *it;
+                    break;
+                case internal::NOT_ENOUGH_ROOM:
+                    throw not_enough_room();
+                case internal::INVALID_LEAD:
+                    append (replacement, out);
+                    ++start;
+                    break;
+                case internal::INCOMPLETE_SEQUENCE:
+                case internal::OVERLONG_SEQUENCE:
+                case internal::INVALID_CODE_POINT:
+                    append (replacement, out);
+                    ++start;
+                    // just one replacement mark for the sequence: skip its trail octets,
+                    // testing against end first so that *end is never dereferenced
+                    while (start != end && internal::is_trail(*start))
+                        ++start;
+                    break;
+            }
+        }
+        return out;
+    }
+
+    /// Overload of replace_invalid that uses 0xfffd as the replacement marker.
+    template <typename octet_iterator, typename output_iterator>
+    inline output_iterator replace_invalid(octet_iterator start, octet_iterator end, output_iterator out)
+    {
+        static const uint32_t replacement_marker = internal::mask16(0xfffd);
+        return replace_invalid(start, end, out, replacement_marker);
+    }
+
     template <typename octet_iterator>
     octet_iterator find_invalid(octet_iterator start, octet_iterator end)
     {
@@ -269,13 +318,13 @@ namespace internal
     }
 
     template <typename octet_iterator>
-    bool is_valid(octet_iterator start, octet_iterator end)
+    inline bool is_valid(octet_iterator start, octet_iterator end)
     {
         return (find_invalid(start, end) == end);
     }
 
     template <typename octet_iterator>
-    bool is_bom (octet_iterator it)
+    inline bool is_bom (octet_iterator it)
     {
         return (
             (internal::mask8(*it++)) == bom[0] &&
diff --git a/test_drivers/smoke_test/test.cpp b/test_drivers/smoke_test/test.cpp
index a0a3696..e6243c5 100644
--- a/test_drivers/smoke_test/test.cpp
+++ b/test_drivers/smoke_test/test.cpp
@@ -121,6 +121,15 @@ int main()
     unsigned char byte_order_mark[] = {0xef, 0xbb, 0xbf};
     bool bbom = is_bom(byte_order_mark);
     assert (bbom == true);
+
+    //replace_invalid
+    char invalid_sequence[] = "a\x80\xe0\xa0\xc0\xaf\xed\xa0\x80z";
+    vector<char> replace_invalid_result;
+    replace_invalid (invalid_sequence, invalid_sequence + sizeof(invalid_sequence), back_inserter(replace_invalid_result), '?');
+    bvalid = is_valid(replace_invalid_result.begin(), replace_invalid_result.end());
+    assert (bvalid);
+    const char* fixed_invalid_sequence = "a????z";
+    assert (std::equal(replace_invalid_result.begin(), replace_invalid_result.end(), fixed_invalid_sequence));
 
     //////////////////////////////////////////////////////////
     //// Unchecked variants