Implemented replace_invalid functionality

git-svn-id: http://svn.code.sf.net/p/utfcpp/code@61 a809a056-fc17-0410-9590-b4f493f8b08e
This commit is contained in:
ntrifunovic 2006-10-07 21:25:47 +00:00
parent 8af502d493
commit f0fce39119
3 changed files with 95 additions and 6 deletions

View file

@ -324,7 +324,7 @@ assert (utf16result[3] == 0xdd1e);
</pre>
<p>In case of an invalid UTF-8 seqence, a
<code>utf8::invalid_utf8</code> exception is thrown. If
<code>last</code> does not point to the past-of-end of a UTF-8
<code>end</code> does not point to the past-of-end of a UTF-8
seqence, a <code>utf8::not_enough_room</code> exception is
thrown.</p>
<h4>utf8::utf32to8</h4>
@ -373,7 +373,7 @@ assert (utf32result.size() == 2);
</pre>
<p>In case of an invalid UTF-8 seqence, a
<code>utf8::invalid_utf8</code> exception is thrown. If
<code>last</code> does not point to the past-of-end of a UTF-8
<code>end</code> does not point to the past-of-end of a UTF-8
seqence, a <code>utf8::not_enough_room</code> exception is
thrown.</p>
<h4>utf8::find_invalid</h4>
@ -421,12 +421,49 @@ assert (bvalid == false);
<code>find_invalid(start, end) == end;</code>. You may want to use
it to make sure that a byte seqence is a valid UTF-8 string without
the need to know where it fails if it is not valid.</p>
<h4>utf8::replace_invalid</h4>
<p>Replaces all invalid UTF-8 sequences within a string with a replacement marker.</p>
<p><code>template &lt;typename octet_iterator, typename output_iterator&gt;
output_iterator replace_invalid(octet_iterator start, octet_iterator end, output_iterator out, uint32_t replacement);
</code></p>
<p><code>template &lt;typename octet_iterator, typename output_iterator&gt;
output_iterator replace_invalid(octet_iterator start, octet_iterator end, output_iterator out);
</code></p>
<p>
<code>start</code>: an iterator pointing to the beginning of the
UTF-8 string to look for invalid UTF-8 sequences.<br>
<code>end</code>: an iterator pointing to pass-the-end of the UTF-8
string to look for invalid UTF-8 sequences.<br>
<code>out</code>: An output iterator to the range where the result of replacement is stored.<br>
<code>replacement</code>: A Unicode code point for the replacement marker. The version without this
parameter assumes the value <code>0xfffd</code><br>
<u>Return value</u>: An iterator pointing to the place after the UTF-8 string with
replaced invalid sequences.</p>
<p>Example of use:</p>
<pre>
char invalid_sequence[] = "a\x80\xe0\xa0\xc0\xaf\xed\xa0\x80z";
vector&lt;char&gt; replace_invalid_result;
replace_invalid (invalid_sequence, invalid_sequence + sizeof(invalid_sequence), back_inserter(replace_invalid_result), '?');
bvalid = is_valid(replace_invalid_result.begin(), replace_invalid_result.end());
assert (bvalid);
char* fixed_invalid_sequence = "a????z";
assert (std::equal(replace_invalid_result.begin(), replace_invalid_result.end(), fixed_invalid_sequence));
</pre>
<p>
<code>replace_invalid</code> does not perform in-place replacement of invalid sequences. Rather, it produces a copy
of the original string with the invalid sequences replaced with a replacement marker. Therefore, <code>out</code> must
not be in the <code>[start, end]</code> range.
</p>
<p>If <code>end</code> does not point to the past-of-end of a UTF-8 sequence, a <code>utf8::not_enough_room</code>
exception is thrown.</p>
<h4>utf8::is_bom</h4>
<p>Checks whether a sequence of three octets is a UTF-8 byte order
mark (BOM)</p>
<code>template &lt;typename octet_iterator&gt; bool is_bom
(octet_iterator it);</code>
<p><code>it</code> Beginning of the 3-octet sequence to check<br>
<p><code>it</code>: beginning of the 3-octet sequence to check<br>
<u>Return value</u>: <code>true</code> if the sequence is UTF-8
byte order mark; <code>false</code> if not.</p>
<p>Example of use:</p>

View file

@ -136,7 +136,7 @@ namespace internal
enum utf_error {OK, NOT_ENOUGH_ROOM, INVALID_LEAD, INCOMPLETE_SEQUENCE, OVERLONG_SEQUENCE, INVALID_CODE_POINT};
template <typename octet_iterator>
utf_error validate_next(octet_iterator& it, octet_iterator end, uint32_t* code_point = 0)
utf_error validate_next(octet_iterator& it, octet_iterator end, uint32_t* code_point)
{
uint32_t cp = mask8(*it);
// Check the lead octet
@ -249,6 +249,11 @@ namespace internal
return OK;
}
template <typename octet_iterator>
inline utf_error validate_next(octet_iterator& it, octet_iterator end) {
return validate_next(it, end, 0);
}
} // namespace internal
/// The library API - functions intended to be called by the users
@ -256,6 +261,44 @@ namespace internal
// Byte order mark
const uint8_t bom[] = {0xef, 0xbb, 0xbf};
template <typename octet_iterator, typename output_iterator>
output_iterator replace_invalid(octet_iterator start, octet_iterator end, output_iterator out, uint32_t replacement)
{
while (start != end) {
octet_iterator sequence_start = start;
internal::utf_error err_code = internal::validate_next(start, end);
switch (err_code) {
case internal::OK :
for (octet_iterator it = sequence_start; it != start; ++it)
*out++ = *it;
break;
case internal::NOT_ENOUGH_ROOM:
throw not_enough_room();
case internal::INVALID_LEAD:
append (replacement, out);
++start;
break;
case internal::INCOMPLETE_SEQUENCE:
case internal::OVERLONG_SEQUENCE:
case internal::INVALID_CODE_POINT:
append (replacement, out);
++start;
// just one replacement mark for the sequence
while (internal::is_trail(*start) && start != end)
++start;
break;
}
}
return out;
}
template <typename octet_iterator, typename output_iterator>
inline output_iterator replace_invalid(octet_iterator start, octet_iterator end, output_iterator out)
{
static const uint32_t replacement_marker = internal::mask16(0xfffd);
return replace_invalid(start, end, out, replacement_marker);
}
template <typename octet_iterator>
octet_iterator find_invalid(octet_iterator start, octet_iterator end)
{
@ -269,13 +312,13 @@ namespace internal
}
template <typename octet_iterator>
bool is_valid(octet_iterator start, octet_iterator end)
inline bool is_valid(octet_iterator start, octet_iterator end)
{
return (find_invalid(start, end) == end);
}
template <typename octet_iterator>
bool is_bom (octet_iterator it)
inline bool is_bom (octet_iterator it)
{
return (
(internal::mask8(*it++)) == bom[0] &&

View file

@ -121,6 +121,15 @@ int main()
unsigned char byte_order_mark[] = {0xef, 0xbb, 0xbf};
bool bbom = is_bom(byte_order_mark);
assert (bbom == true);
//replace_invalid
char invalid_sequence[] = "a\x80\xe0\xa0\xc0\xaf\xed\xa0\x80z";
vector<char> replace_invalid_result;
replace_invalid (invalid_sequence, invalid_sequence + sizeof(invalid_sequence), back_inserter(replace_invalid_result), '?');
bvalid = is_valid(replace_invalid_result.begin(), replace_invalid_result.end());
assert (bvalid);
char* fixed_invalid_sequence = "a????z";
assert (std::equal(replace_invalid_result.begin(), replace_invalid_result.end(), fixed_invalid_sequence));
//////////////////////////////////////////////////////////
//// Unchecked variants