Feature request 2857462: Proposed minor extension: safe version of is_bom

git-svn-id: http://svn.code.sf.net/p/utfcpp/code@111 a809a056-fc17-0410-9590-b4f493f8b08e
2009-12-20 22:46:01 +00:00 · 2009-12-20 22:46:01 +00:00 · 4bfad91501
commit 4bfad91501
parent 2ca22999da
3 changed files with 30 additions and 4 deletions
--- a/doc/utf8cpp.html
+++ b/doc/utf8cpp.html
@ -908,7 +908,8 @@ assert (std::equal(replace_invalid_result.begin(), replace_invalid_result.end(),
      utf8::is_bom
    </h4>
    <p class="version">
-    Available in version 1.0 and later.
+    Available in version 1.0 and later (the deprecated one argument version) and in version 2.3
    and later (the two argument version).
    </p>
    <p>
      Checks whether a sequence of three octets is a UTF-8 byte order mark (BOM)
@ -916,10 +917,14 @@ assert (std::equal(replace_invalid_result.begin(), replace_invalid_result.end(),
 <pre>
 <span class="keyword">template</span> &lt;<span class=
 "keyword">typename</span> octet_iterator&gt; 
-<span class="keyword">bool</span> is_bom (octet_iterator it);
+<span class="keyword">bool</span> is_bom (octet_iterator it, octet_iterator end);
 <span class="keyword">template</span> &lt;<span class=
 "keyword">typename</span> octet_iterator&gt; 
 <span class="keyword">bool</span> is_bom (octet_iterator it); <span class="comment"> // Deprecated</span>
 </pre>
    <p>
      <code>it</code>: beginning of the 3-octet sequence to check<br>
      <code>end</code>: past-the-end position of the sequence to check<br>
       <span class="return_value">Return value</span>: <code>true</code> if the sequence
      is a UTF-8 byte order mark; <code>false</code> if not.
    </p>
@ -930,7 +935,7 @@ assert (std::equal(replace_invalid_result.begin(), replace_invalid_result.end(),
 <span class="keyword">unsigned char</span> byte_order_mark[] = {<span class=
 "literal">0xef</span>, <span class="literal">0xbb</span>, <span class=
 "literal">0xbf</span>};
-<span class="keyword">bool</span> bbom = is_bom(byte_order_mark);
+<span class="keyword">bool</span> bbom = is_bom(byte_order_mark, byte_order_mark + <span class="keyword">sizeof</span>(byte_order_mark));
 assert (bbom == <span class="literal">true</span>);
 </pre>
    <p>
@ -938,6 +943,11 @@ assert (bbom == <span class="literal">true</span>);
      they form the UTF-8 BOM, we want to skip them before processing the actual UTF-8
      encoded text.
    </p>
    <p>
      The older version of the function that takes only one argument is unsafe: if a sequence is 
      shorter than three bytes, an invalid iterator will be dereferenced. Therefore it is deprecated
      in favor of the safer version that takes the end of the sequence as an argument.
    </p>
    <h3 id="typesutf8">
      Types From utf8 Namespace
    </h3>
--- a/source/utf8/core.h
+++ b/source/utf8/core.h
@ -331,6 +331,17 @@ namespace internal
        return (find_invalid(start, end) == end);
    }
    // Bounds-checked BOM test: reports whether the range [it, end) begins
    // with the three octets bom[0], bom[1], bom[2].
    //   it  : start of the octet sequence to examine
    //   end : past-the-end position of that sequence
    // Returns false as soon as the range runs out or an octet differs; `end`
    // itself is never dereferenced, so ranges shorter than three octets are safe.
    template <typename octet_iterator>
    inline bool is_bom (octet_iterator it, octet_iterator end)
    {
        // First octet: read it and step past it in one go.
        if (it == end)
            return false;
        if (!(internal::mask8(*it++) == bom[0]))
            return false;

        // Second octet: same pattern.
        if (it == end)
            return false;
        if (!(internal::mask8(*it++) == bom[1]))
            return false;

        // Third octet: only inspected, the iterator is not advanced any further.
        if (it == end)
            return false;
        return internal::mask8(*it) == bom[2];
    }
 	//Deprecated in release 2.3 
    template <typename octet_iterator>
    inline bool is_bom (octet_iterator it)
    {
--- a/test_drivers/smoke_test/test.cpp
+++ b/test_drivers/smoke_test/test.cpp
@ -141,8 +141,13 @@ int main()
    //is_bom
    unsigned char byte_order_mark[] = {0xef, 0xbb, 0xbf};
-    bool bbom = is_bom(byte_order_mark);
+    bool bbom = is_bom(byte_order_mark, byte_order_mark + sizeof(byte_order_mark));
    assert (bbom == true);
 	bool no_bbom = is_bom(threechars, threechars + sizeof(threechars));
 	assert (no_bbom == false);
 	bool unsafe_bbom = is_bom(byte_order_mark);
    assert (unsafe_bbom == true);
    //replace_invalid
    char invalid_sequence[] = "a\x80\xe0\xa0\xc0\xaf\xed\xa0\x80z";