Implemented replace_invalid functionality
git-svn-id: http://svn.code.sf.net/p/utfcpp/code@61 a809a056-fc17-0410-9590-b4f493f8b08e Conflicts: v2_0/doc/utf8cpp.html v2_0/source/utf8.h v2_0/test_drivers/smoke_test/test.cpp
This commit is contained in:
parent
196a58c77e
commit
8671171bee
3 changed files with 95 additions and 6 deletions
|
@ -324,7 +324,7 @@ assert (utf16result[3] == 0xdd1e);
|
|||
</pre>
|
||||
<p>In case of an invalid UTF-8 sequence, a
|
||||
<code>utf8::invalid_utf8</code> exception is thrown. If
|
||||
<code>last</code> does not point to the past-of-end of a UTF-8
|
||||
<code>end</code> does not point to the past-of-end of a UTF-8
|
||||
sequence, a <code>utf8::not_enough_room</code> exception is
|
||||
thrown.</p>
|
||||
<h4>utf8::utf32to8</h4>
|
||||
|
@ -373,7 +373,7 @@ assert (utf32result.size() == 2);
|
|||
</pre>
|
||||
<p>In case of an invalid UTF-8 sequence, a
|
||||
<code>utf8::invalid_utf8</code> exception is thrown. If
|
||||
<code>last</code> does not point to the past-of-end of a UTF-8
|
||||
<code>end</code> does not point to the past-of-end of a UTF-8
|
||||
sequence, a <code>utf8::not_enough_room</code> exception is
|
||||
thrown.</p>
|
||||
<h4>utf8::find_invalid</h4>
|
||||
|
@ -421,12 +421,49 @@ assert (bvalid == false);
|
|||
<code>find_invalid(start, end) == end;</code>. You may want to use
|
||||
it to make sure that a byte sequence is a valid UTF-8 string without
|
||||
the need to know where it fails if it is not valid.</p>
|
||||
<h4>utf8::replace_invalid</h4>
|
||||
<p>Replaces all invalid UTF-8 sequences within a string with a replacement marker.</p>
|
||||
<p><code>template <typename octet_iterator, typename output_iterator>
|
||||
output_iterator replace_invalid(octet_iterator start, octet_iterator end, output_iterator out, uint32_t replacement);
|
||||
</code></p>
|
||||
<p><code>template <typename octet_iterator, typename output_iterator>
|
||||
output_iterator replace_invalid(octet_iterator start, octet_iterator end, output_iterator out);
|
||||
</code></p>
|
||||
<p>
|
||||
<code>start</code>: an iterator pointing to the beginning of the
|
||||
UTF-8 string to look for invalid UTF-8 sequences.<br>
|
||||
<code>end</code>: an iterator pointing to past-the-end of the UTF-8
|
||||
string to look for invalid UTF-8 sequences.<br>
|
||||
<code>out</code>: An output iterator to the range where the result of replacement is stored.<br>
|
||||
<code>replacement</code>: A Unicode code point for the replacement marker. The version without this
|
||||
parameter assumes the value <code>0xfffd</code><br>
|
||||
<u>Return value</u>: An iterator pointing to the place after the UTF-8 string with
|
||||
replaced invalid sequences.</p>
|
||||
<p>Example of use:</p>
|
||||
<pre>
|
||||
char invalid_sequence[] = "a\x80\xe0\xa0\xc0\xaf\xed\xa0\x80z";
|
||||
vector<char> replace_invalid_result;
|
||||
|
||||
replace_invalid (invalid_sequence, invalid_sequence + sizeof(invalid_sequence), back_inserter(replace_invalid_result), '?');
|
||||
|
||||
bvalid = is_valid(replace_invalid_result.begin(), replace_invalid_result.end());
|
||||
assert (bvalid);
|
||||
char* fixed_invalid_sequence = "a????z";
|
||||
assert (std::equal(replace_invalid_result.begin(), replace_invalid_result.end(), fixed_invalid_sequence));
|
||||
</pre>
|
||||
<p>
|
||||
<code>replace_invalid</code> does not perform in-place replacement of invalid sequences. Rather, it produces a copy
|
||||
of the original string with the invalid sequences replaced with a replacement marker. Therefore, <code>out</code> must
|
||||
not be in the <code>[start, end]</code> range.
|
||||
</p>
|
||||
<p>If <code>end</code> does not point to the past-of-end of a UTF-8 sequence, a <code>utf8::not_enough_room</code>
|
||||
exception is thrown.</p>
|
||||
<h4>utf8::is_bom</h4>
|
||||
<p>Checks whether a sequence of three octets is a UTF-8 byte order
|
||||
mark (BOM)</p>
|
||||
<code>template <typename octet_iterator> bool is_bom
|
||||
(octet_iterator it);</code>
|
||||
<p><code>it</code> Beginning of the 3-octet sequence to check<br>
|
||||
<p><code>it</code>: beginning of the 3-octet sequence to check<br>
|
||||
<u>Return value</u>: <code>true</code> if the sequence is UTF-8
|
||||
byte order mark; <code>false</code> if not.</p>
|
||||
<p>Example of use:</p>
|
||||
|
|
|
@ -136,7 +136,7 @@ namespace internal
|
|||
enum utf_error {OK, NOT_ENOUGH_ROOM, INVALID_LEAD, INCOMPLETE_SEQUENCE, OVERLONG_SEQUENCE, INVALID_CODE_POINT};
|
||||
|
||||
template <typename octet_iterator>
|
||||
utf_error validate_next(octet_iterator& it, octet_iterator end, uint32_t* code_point = 0)
|
||||
utf_error validate_next(octet_iterator& it, octet_iterator end, uint32_t* code_point)
|
||||
{
|
||||
uint32_t cp = mask8(*it);
|
||||
// Check the lead octet
|
||||
|
@ -249,6 +249,11 @@ namespace internal
|
|||
return OK;
|
||||
}
|
||||
|
||||
template <typename octet_iterator>
|
||||
inline utf_error validate_next(octet_iterator& it, octet_iterator end) {
|
||||
return validate_next(it, end, 0);
|
||||
}
|
||||
|
||||
} // namespace internal
|
||||
|
||||
/// The library API - functions intended to be called by the users
|
||||
|
@ -256,6 +261,44 @@ namespace internal
|
|||
// Byte order mark
|
||||
const uint8_t bom[] = {0xef, 0xbb, 0xbf};
|
||||
|
||||
template <typename octet_iterator, typename output_iterator>
|
||||
output_iterator replace_invalid(octet_iterator start, octet_iterator end, output_iterator out, uint32_t replacement)
|
||||
{
|
||||
while (start != end) {
|
||||
octet_iterator sequence_start = start;
|
||||
internal::utf_error err_code = internal::validate_next(start, end);
|
||||
switch (err_code) {
|
||||
case internal::OK :
|
||||
for (octet_iterator it = sequence_start; it != start; ++it)
|
||||
*out++ = *it;
|
||||
break;
|
||||
case internal::NOT_ENOUGH_ROOM:
|
||||
throw not_enough_room();
|
||||
case internal::INVALID_LEAD:
|
||||
append (replacement, out);
|
||||
++start;
|
||||
break;
|
||||
case internal::INCOMPLETE_SEQUENCE:
|
||||
case internal::OVERLONG_SEQUENCE:
|
||||
case internal::INVALID_CODE_POINT:
|
||||
append (replacement, out);
|
||||
++start;
|
||||
// just one replacement mark for the sequence
|
||||
while (internal::is_trail(*start) && start != end)
|
||||
++start;
|
||||
break;
|
||||
}
|
||||
}
|
||||
return out;
|
||||
}
|
||||
|
||||
template <typename octet_iterator, typename output_iterator>
|
||||
inline output_iterator replace_invalid(octet_iterator start, octet_iterator end, output_iterator out)
|
||||
{
|
||||
static const uint32_t replacement_marker = internal::mask16(0xfffd);
|
||||
return replace_invalid(start, end, out, replacement_marker);
|
||||
}
|
||||
|
||||
template <typename octet_iterator>
|
||||
octet_iterator find_invalid(octet_iterator start, octet_iterator end)
|
||||
{
|
||||
|
@ -269,13 +312,13 @@ namespace internal
|
|||
}
|
||||
|
||||
// Reports whether [start, end) is valid UTF-8, i.e. whether find_invalid
// reaches end without stopping on an offending octet.
template <typename octet_iterator>
inline bool is_valid(octet_iterator start, octet_iterator end)
{
    const octet_iterator first_invalid = find_invalid(start, end);
    return first_invalid == end;
}
|
||||
|
||||
template <typename octet_iterator>
|
||||
bool is_bom (octet_iterator it)
|
||||
inline bool is_bom (octet_iterator it)
|
||||
{
|
||||
return (
|
||||
(internal::mask8(*it++)) == bom[0] &&
|
||||
|
|
|
@ -121,6 +121,15 @@ int main()
|
|||
unsigned char byte_order_mark[] = {0xef, 0xbb, 0xbf};
|
||||
bool bbom = is_bom(byte_order_mark);
|
||||
assert (bbom == true);
|
||||
|
||||
//replace_invalid
|
||||
char invalid_sequence[] = "a\x80\xe0\xa0\xc0\xaf\xed\xa0\x80z";
|
||||
vector<char> replace_invalid_result;
|
||||
replace_invalid (invalid_sequence, invalid_sequence + sizeof(invalid_sequence), back_inserter(replace_invalid_result), '?');
|
||||
bvalid = is_valid(replace_invalid_result.begin(), replace_invalid_result.end());
|
||||
assert (bvalid);
|
||||
char* fixed_invalid_sequence = "a????z";
|
||||
assert (std::equal(replace_invalid_result.begin(), replace_invalid_result.end(), fixed_invalid_sequence));
|
||||
|
||||
//////////////////////////////////////////////////////////
|
||||
//// Unchecked variants
|
||||
|
|
Loading…
Reference in a new issue