diff --git a/doc/utf8cpp.html b/doc/utf8cpp.html index d97edf4..e6ba254 100644 --- a/doc/utf8cpp.html +++ b/doc/utf8cpp.html @@ -324,7 +324,7 @@ assert (utf16result[3] == 0xdd1e);
In case of an invalid UTF-8 sequence, a
utf8::invalid_utf8
exception is thrown. If
-last
does not point to the past-of-end of a UTF-8
+end
does not point to the past-of-end of a UTF-8
sequence, a utf8::not_enough_room
exception is
thrown.
In case of an invalid UTF-8 sequence, a
utf8::invalid_utf8
exception is thrown. If
-last
does not point to the past-of-end of a UTF-8
+end
does not point to the past-of-end of a UTF-8
sequence, a utf8::not_enough_room
exception is
thrown.
find_invalid(start, end) == end;
. You may want to use
it to make sure that a byte sequence is a valid UTF-8 string without
the need to know where it fails if it is not valid.
+Replaces all invalid UTF-8 sequences within a string with a replacement marker.
+template <typename octet_iterator, typename output_iterator>
+output_iterator replace_invalid(octet_iterator start, octet_iterator end, output_iterator out, uint32_t replacement);
+
template <typename octet_iterator, typename output_iterator>
+output_iterator replace_invalid(octet_iterator start, octet_iterator end, output_iterator out);
+
+start
: an iterator pointing to the beginning of the
+UTF-8 string to look for invalid UTF-8 sequences.
+end
: an iterator pointing to past-the-end of the UTF-8
+string to look for invalid UTF-8 sequences.
+out
: An output iterator to the range where the result of replacement is stored.
+replacement
: A Unicode code point for the replacement marker. The version without this
+parameter assumes the value 0xfffd.
+Return value: An iterator pointing to the place after the UTF-8 string with
+replaced invalid sequences.
Example of use:
++char invalid_sequence[] = "a\x80\xe0\xa0\xc0\xaf\xed\xa0\x80z"; +vector<char> replace_invalid_result; + +replace_invalid (invalid_sequence, invalid_sequence + sizeof(invalid_sequence), back_inserter(replace_invalid_result), '?'); + +bvalid = is_valid(replace_invalid_result.begin(), replace_invalid_result.end()); +assert (bvalid); +char* fixed_invalid_sequence = "a????z"; +assert (std::equal(replace_invalid_result.begin(), replace_invalid_result.end(), fixed_invalid_sequence)); ++
+replace_invalid
does not perform in-place replacement of invalid sequences. Rather, it produces a copy
+of the original string with the invalid sequences replaced with a replacement marker. Therefore, out
must
+not be in the [start, end]
range.
+
If end
does not point to the past-of-end of a UTF-8 sequence, a utf8::not_enough_room
+exception is thrown.
Checks whether a sequence of three octets is a UTF-8 byte order mark (BOM)
template <typename octet_iterator> bool is_bom
(octet_iterator it);
-it
Beginning of the 3-octet sequence to check
+
it
: beginning of the 3-octet sequence to check
Return value: true
if the sequence is UTF-8
byte order mark; false
if not.
Example of use:
diff --git a/source/utf8.h b/source/utf8.h index 286670c..51543bc 100644 --- a/source/utf8.h +++ b/source/utf8.h @@ -136,7 +136,7 @@ namespace internal enum utf_error {OK, NOT_ENOUGH_ROOM, INVALID_LEAD, INCOMPLETE_SEQUENCE, OVERLONG_SEQUENCE, INVALID_CODE_POINT}; template