Implemented replace_invalid functionality

git-svn-id: http://svn.code.sf.net/p/utfcpp/code@61 a809a056-fc17-0410-9590-b4f493f8b08e
2006-10-07 21:25:47 +00:00 · 2006-10-07 21:25:47 +00:00 · f0fce39119
commit f0fce39119
parent 8af502d493
3 changed files with 95 additions and 6 deletions
--- a/v2_0/doc/utf8cpp.html
+++ b/v2_0/doc/utf8cpp.html
@ -324,7 +324,7 @@ assert (utf16result[3] == 0xdd1e);
 </pre>
 <p>In case of an invalid UTF-8 seqence, a
 <code>utf8::invalid_utf8</code> exception is thrown. If
-<code>last</code> does not point to the past-of-end of a UTF-8
+<code>end</code> does not point to the past-of-end of a UTF-8
 seqence, a <code>utf8::not_enough_room</code> exception is
 thrown.</p>
 <h4>utf8::utf32to8</h4>
@ -373,7 +373,7 @@ assert (utf32result.size() == 2);
 </pre>
 <p>In case of an invalid UTF-8 seqence, a
 <code>utf8::invalid_utf8</code> exception is thrown. If
-<code>last</code> does not point to the past-of-end of a UTF-8
+<code>end</code> does not point to the past-of-end of a UTF-8
 seqence, a <code>utf8::not_enough_room</code> exception is
 thrown.</p>
 <h4>utf8::find_invalid</h4>
@ -421,12 +421,49 @@ assert (bvalid == false);
 <code>find_invalid(start, end) == end;</code>. You may want to use
 it to make sure that a byte seqence is a valid UTF-8 string without
 the need to know where it fails if it is not valid.</p>
+<h4>utf8::replace_invalid</h4>
+<p>Replaces all invalid UTF-8 sequences within a string with a replacement marker.</p>
+<p><code>template &lt;typename octet_iterator, typename output_iterator&gt;
+output_iterator replace_invalid(octet_iterator start, octet_iterator end, output_iterator out, uint32_t replacement);
+</code></p>
+<p><code>template &lt;typename octet_iterator, typename output_iterator&gt;
+output_iterator replace_invalid(octet_iterator start, octet_iterator end, output_iterator out);
+</code></p>
+<p>
+<code>start</code>: an iterator pointing to the beginning of the
+UTF-8 string to look for invalid UTF-8 sequences.<br>
+<code>end</code>: an iterator pointing to past-the-end of the UTF-8
+string to look for invalid UTF-8 sequences.<br>
+<code>out</code>: An output iterator to the range where the result of replacement is stored.<br>
+<code>replacement</code>: A Unicode code point for the replacement marker. The version without this
+parameter assumes the value <code>0xfffd</code>.<br>
+<u>Return value</u>: An iterator pointing to the place after the UTF-8 string with
+replaced invalid sequences.</p>
+<p>Example of use:</p>
+<pre>
+char invalid_sequence[] = "a\x80\xe0\xa0\xc0\xaf\xed\xa0\x80z";
+vector&lt;char&gt; replace_invalid_result;
+
+replace_invalid (invalid_sequence, invalid_sequence + sizeof(invalid_sequence), back_inserter(replace_invalid_result), '?');
+
+bvalid = is_valid(replace_invalid_result.begin(), replace_invalid_result.end());
+assert (bvalid);
+char* fixed_invalid_sequence = "a????z";
+assert (std::equal(replace_invalid_result.begin(), replace_invalid_result.end(), fixed_invalid_sequence));
+</pre>
+<p>
+<code>replace_invalid</code> does not perform in-place replacement of invalid sequences. Rather, it produces a copy
+of the original string with the invalid sequences replaced with a replacement marker. Therefore, <code>out</code> must 
+not be in the <code>[start, end)</code> range.
+</p>
+<p>If <code>end</code> does not point to the past-of-end of a UTF-8 sequence, a <code>utf8::not_enough_room</code>
+exception is thrown.</p>
 <h4>utf8::is_bom</h4>
 <p>Checks whether a sequence of three octets is a UTF-8 byte order
 mark (BOM)</p>
 <code>template &lt;typename octet_iterator&gt; bool is_bom
 (octet_iterator it);</code>
-<p><code>it</code> Beginning of the 3-octet sequence to check<br>
+<p><code>it</code>: beginning of the 3-octet sequence to check<br>
 <u>Return value</u>: <code>true</code> if the sequence is UTF-8
 byte order mark; <code>false</code> if not.</p>
 <p>Example of use:</p>
--- a/v2_0/source/utf8.h
+++ b/v2_0/source/utf8.h
@ -136,7 +136,7 @@ namespace internal
    enum utf_error {OK, NOT_ENOUGH_ROOM, INVALID_LEAD, INCOMPLETE_SEQUENCE, OVERLONG_SEQUENCE, INVALID_CODE_POINT};

    template <typename octet_iterator>
-    utf_error validate_next(octet_iterator& it, octet_iterator end, uint32_t* code_point = 0)
+    utf_error validate_next(octet_iterator& it, octet_iterator end, uint32_t* code_point)
    {
        uint32_t cp = mask8(*it);
        // Check the lead octet
@ -249,6 +249,11 @@ namespace internal
        return OK;    
    }

+    // Two-argument form of validate_next: forwards to the three-argument
+    // version, passing a null pointer for its code_point argument.
+    template <typename octet_iterator>
+    inline utf_error validate_next(octet_iterator& it, octet_iterator end)
+    {
+        uint32_t* const no_code_point = 0;
+        return validate_next(it, end, no_code_point);
+    }
+
 } // namespace internal 
    
    /// The library API - functions intended to be called by the users
@ -256,6 +261,44 @@ namespace internal
    // Byte order mark
    const uint8_t bom[] = {0xef, 0xbb, 0xbf}; 

+    /// Copies the octets in [start, end) to out, substituting a replacement
+    /// marker for every sequence that internal::validate_next rejects.
+    /// Sequences reported as OK are copied octet by octet, unchanged.
+    /// start:       beginning of the input octet range
+    /// end:         one past the last octet of the input range
+    /// out:         output iterator that receives the result
+    /// replacement: code point passed to append() once per rejected sequence
+    /// Returns out, advanced past the last element written.
+    /// Throws not_enough_room when validation reports NOT_ENOUGH_ROOM.
+    template <typename octet_iterator, typename output_iterator>
+    output_iterator replace_invalid(octet_iterator start, octet_iterator end, output_iterator out, uint32_t replacement)
+    {
+        while (start != end) {
+            octet_iterator sequence_start = start;
+            internal::utf_error err_code = internal::validate_next(start, end);
+            switch (err_code) {
+                case internal::OK :
+                    // Valid sequence: copy it through verbatim.
+                    for (octet_iterator it = sequence_start; it != start; ++it)
+                        *out++ = *it;
+                    break;
+                case internal::NOT_ENOUGH_ROOM:
+                    throw not_enough_room();
+                case internal::INVALID_LEAD:
+                    append (replacement, out);
+                    ++start;
+                    break;
+                case internal::INCOMPLETE_SEQUENCE:
+                case internal::OVERLONG_SEQUENCE:
+                case internal::INVALID_CODE_POINT:
+                    append (replacement, out);
+                    ++start;
+                    // just one replacement mark for the sequence
+                    // Check the bound first so that start is never
+                    // dereferenced once it has reached end.
+                    while (start != end && internal::is_trail(*start))
+                        ++start;
+                    break;
+            }
+        }   
+        return out;
+    }
+
+    // Overload of replace_invalid that supplies 0xfffd as the replacement
+    // marker and delegates to the four-argument version.
+    template <typename octet_iterator, typename output_iterator>
+    inline output_iterator replace_invalid(octet_iterator start, octet_iterator end, output_iterator out)
+    {
+        static const uint32_t default_marker = internal::mask16(0xfffd);
+        return replace_invalid(start, end, out, default_marker);
+    }
+
    template <typename octet_iterator>
    octet_iterator find_invalid(octet_iterator start, octet_iterator end)
    {
@ -269,13 +312,13 @@ namespace internal
    }

    template <typename octet_iterator>
-    bool is_valid(octet_iterator start, octet_iterator end)
+    inline bool is_valid(octet_iterator start, octet_iterator end)
    {
        return (find_invalid(start, end) == end);
    }

    template <typename octet_iterator>
-    bool is_bom (octet_iterator it)
+    inline bool is_bom (octet_iterator it)
    {
        return (
            (internal::mask8(*it++)) == bom[0] &&
--- a/v2_0/test_drivers/smoke_test/test.cpp
+++ b/v2_0/test_drivers/smoke_test/test.cpp
@ -121,6 +121,15 @@ int main()
    unsigned char byte_order_mark[] = {0xef, 0xbb, 0xbf};
    bool bbom = is_bom(byte_order_mark);
    assert (bbom == true);
+    
+    //replace_invalid
+    char invalid_sequence[] = "a\x80\xe0\xa0\xc0\xaf\xed\xa0\x80z";
+    vector<char> replace_invalid_result;
+    replace_invalid (invalid_sequence, invalid_sequence + sizeof(invalid_sequence), back_inserter(replace_invalid_result), '?');
+    bvalid = is_valid(replace_invalid_result.begin(), replace_invalid_result.end());
+    assert (bvalid);
+    char* fixed_invalid_sequence = "a????z";
+    assert (std::equal(replace_invalid_result.begin(), replace_invalid_result.end(), fixed_invalid_sequence));

    //////////////////////////////////////////////////////////
    //// Unchecked variants