From f0fce39119a0c2eec031404a8d34e389714b1f88 Mon Sep 17 00:00:00 2001
From: ntrifunovic
Date: Sat, 7 Oct 2006 21:25:47 +0000
Subject: [PATCH] Implemented replace_invalid functionality
git-svn-id: http://svn.code.sf.net/p/utfcpp/code@61 a809a056-fc17-0410-9590-b4f493f8b08e
---
v2_0/doc/utf8cpp.html | 43 +++++++++++++++++++++--
v2_0/source/utf8.h | 49 +++++++++++++++++++++++++--
v2_0/test_drivers/smoke_test/test.cpp | 9 +++++
3 files changed, 95 insertions(+), 6 deletions(-)
diff --git a/v2_0/doc/utf8cpp.html b/v2_0/doc/utf8cpp.html
index d97edf4..e6ba254 100644
--- a/v2_0/doc/utf8cpp.html
+++ b/v2_0/doc/utf8cpp.html
@@ -324,7 +324,7 @@ assert (utf16result[3] == 0xdd1e);
In case of an invalid UTF-8 seqence, a
utf8::invalid_utf8
exception is thrown. If
-last
does not point to the past-of-end of a UTF-8
+end
does not point to the past-of-end of a UTF-8
seqence, a utf8::not_enough_room
exception is
thrown.
utf8::utf32to8
@@ -373,7 +373,7 @@ assert (utf32result.size() == 2);
In case of an invalid UTF-8 seqence, a
utf8::invalid_utf8
exception is thrown. If
-last
does not point to the past-of-end of a UTF-8
+end
does not point to the past-of-end of a UTF-8
seqence, a utf8::not_enough_room
exception is
thrown.
utf8::find_invalid
@@ -421,12 +421,49 @@ assert (bvalid == false);
find_invalid(start, end) == end;
. You may want to use
it to make sure that a byte seqence is a valid UTF-8 string without
the need to know where it fails if it is not valid.
+utf8::replace_invalid
+Replaces all invalid UTF-8 sequences within a string with a replacement marker.
+template <typename octet_iterator, typename output_iterator>
+output_iterator replace_invalid(octet_iterator start, octet_iterator end, output_iterator out, uint32_t replacement);
+
+template <typename octet_iterator, typename output_iterator>
+output_iterator replace_invalid(octet_iterator start, octet_iterator end, output_iterator out);
+
+
+start
: an iterator pointing to the beginning of the
+UTF-8 string to look for invalid UTF-8 sequences.
+end
: an iterator pointing to past-the-end of the UTF-8
+string to look for invalid UTF-8 sequences.
+out
: An output iterator to the range where the result of replacement is stored.
+replacement
: A Unicode code point for the replacement marker. The version without this
+parameter assumes the value 0xfffd.
+Return value: An iterator pointing to the place after the UTF-8 string with
+replaced invalid sequences.
+Example of use:
+
+char invalid_sequence[] = "a\x80\xe0\xa0\xc0\xaf\xed\xa0\x80z";
+vector<char> replace_invalid_result;
+
+replace_invalid (invalid_sequence, invalid_sequence + sizeof(invalid_sequence), back_inserter(replace_invalid_result), '?');
+
+bvalid = is_valid(replace_invalid_result.begin(), replace_invalid_result.end());
+assert (bvalid);
+char* fixed_invalid_sequence = "a????z";
+assert (std::equal(replace_invalid_result.begin(), replace_invalid_result.end(), fixed_invalid_sequence));
+
+
+replace_invalid
does not perform in-place replacement of invalid sequences. Rather, it produces a copy
+of the original string with the invalid sequences replaced with a replacement marker. Therefore, out
must
+not be in the [start, end]
range.
+
+If end
does not point to the past-of-end of a UTF-8 sequence, a utf8::not_enough_room
+exception is thrown.
utf8::is_bom
Checks whether a sequence of three octets is a UTF-8 byte order
mark (BOM)
template <typename octet_iterator> bool is_bom
(octet_iterator it);
-it
Beginning of the 3-octet sequence to check
+
it
: beginning of the 3-octet sequence to check
Return value: true
if the sequence is UTF-8
byte order mark; false
if not.
Example of use:
diff --git a/v2_0/source/utf8.h b/v2_0/source/utf8.h
index 286670c..51543bc 100644
--- a/v2_0/source/utf8.h
+++ b/v2_0/source/utf8.h
@@ -136,7 +136,7 @@ namespace internal
enum utf_error {OK, NOT_ENOUGH_ROOM, INVALID_LEAD, INCOMPLETE_SEQUENCE, OVERLONG_SEQUENCE, INVALID_CODE_POINT};
template
- utf_error validate_next(octet_iterator& it, octet_iterator end, uint32_t* code_point = 0)
+ utf_error validate_next(octet_iterator& it, octet_iterator end, uint32_t* code_point)
{
uint32_t cp = mask8(*it);
// Check the lead octet
@@ -249,6 +249,11 @@ namespace internal
return OK;
}
+ /// Convenience overload of validate_next for callers that only need the
+ /// error code: forwards to the three-argument version with a null
+ /// code_point pointer.
+ template <typename octet_iterator>
+ inline utf_error validate_next(octet_iterator& it, octet_iterator end) {
+     return validate_next(it, end, 0);
+ }
+
} // namespace internal
/// The library API - functions intended to be called by the users
@@ -256,6 +261,44 @@ namespace internal
// Byte order mark
const uint8_t bom[] = {0xef, 0xbb, 0xbf};
+ /// Copies the octets in [start, end) to out, writing the UTF-8 encoding of
+ /// `replacement` in place of every sequence that fails validation.
+ ///
+ /// start, end   : the input range to scan.
+ /// out          : output iterator receiving the result.
+ /// replacement  : code point appended in place of each invalid sequence.
+ /// Returns the output iterator positioned after the last octet written.
+ /// Throws not_enough_room when validate_next reports NOT_ENOUGH_ROOM.
+ template <typename octet_iterator, typename output_iterator>
+ output_iterator replace_invalid(octet_iterator start, octet_iterator end, output_iterator out, uint32_t replacement)
+ {
+     while (start != end) {
+         octet_iterator sequence_start = start;
+         internal::utf_error err_code = internal::validate_next(start, end);
+         switch (err_code) {
+             case internal::OK :
+                 // Valid sequence: copy its octets through unchanged.
+                 for (octet_iterator it = sequence_start; it != start; ++it)
+                     *out++ = *it;
+                 break;
+             case internal::NOT_ENOUGH_ROOM:
+                 throw not_enough_room();
+             case internal::INVALID_LEAD:
+                 // Keep the iterator returned by append so that plain
+                 // (non-inserter) output iterators actually advance.
+                 out = append (replacement, out);
+                 ++start;
+                 break;
+             case internal::INCOMPLETE_SEQUENCE:
+             case internal::OVERLONG_SEQUENCE:
+             case internal::INVALID_CODE_POINT:
+                 out = append (replacement, out);
+                 ++start;
+                 // just one replacement mark for the sequence; test for the
+                 // end of input BEFORE dereferencing so we never read *end
+                 while (start != end && internal::is_trail(*start))
+                     ++start;
+                 break;
+         }
+     }
+     return out;
+ }
+
+ /// Overload of replace_invalid that uses the code point 0xfffd as the
+ /// replacement marker and forwards to the four-argument version.
+ template <typename octet_iterator, typename output_iterator>
+ inline output_iterator replace_invalid(octet_iterator start, octet_iterator end, output_iterator out)
+ {
+     static const uint32_t replacement_marker = internal::mask16(0xfffd);
+     return replace_invalid(start, end, out, replacement_marker);
+ }
+
template
octet_iterator find_invalid(octet_iterator start, octet_iterator end)
{
@@ -269,13 +312,13 @@ namespace internal
}
template
- bool is_valid(octet_iterator start, octet_iterator end)
+ inline bool is_valid(octet_iterator start, octet_iterator end)
{
return (find_invalid(start, end) == end);
}
template
- bool is_bom (octet_iterator it)
+ inline bool is_bom (octet_iterator it)
{
return (
(internal::mask8(*it++)) == bom[0] &&
diff --git a/v2_0/test_drivers/smoke_test/test.cpp b/v2_0/test_drivers/smoke_test/test.cpp
index a0a3696..e6243c5 100644
--- a/v2_0/test_drivers/smoke_test/test.cpp
+++ b/v2_0/test_drivers/smoke_test/test.cpp
@@ -121,6 +121,15 @@ int main()
unsigned char byte_order_mark[] = {0xef, 0xbb, 0xbf};
bool bbom = is_bom(byte_order_mark);
assert (bbom == true);
+
+ //replace_invalid
+ char invalid_sequence[] = "a\x80\xe0\xa0\xc0\xaf\xed\xa0\x80z";
+ vector replace_invalid_result;
+ replace_invalid (invalid_sequence, invalid_sequence + sizeof(invalid_sequence), back_inserter(replace_invalid_result), '?');
+ bvalid = is_valid(replace_invalid_result.begin(), replace_invalid_result.end());
+ assert (bvalid);
+ char* fixed_invalid_sequence = "a????z";
+ assert (std::equal(replace_invalid_result.begin(), replace_invalid_result.end(), fixed_invalid_sequence));
//////////////////////////////////////////////////////////
//// Unchecked variants