Feature request 2857462: Proposed minor extension: safe version of is_bom

git-svn-id: http://svn.code.sf.net/p/utfcpp/code@111 a809a056-fc17-0410-9590-b4f493f8b08e
2009-12-20 22:46:01 +00:00 · 2009-12-20 22:46:01 +00:00 · 656f3847e8
commit 656f3847e8
parent ac756dc9d6
3 changed files with 30 additions and 4 deletions
--- a/v2_0/doc/utf8cpp.html
+++ b/v2_0/doc/utf8cpp.html
@ -908,7 +908,8 @@ assert (std::equal(replace_invalid_result.begin(), replace_invalid_result.end(),
      utf8::is_bom
    </h4>
    <p class="version">
-    Available in version 1.0 and later.
+    Available in version 1.0 and later (the deprecated one-argument version) and in version 2.3
+    and later (the two-argument version).
    </p>
    <p>
      Checks whether a sequence of three octets is a UTF-8 byte order mark (BOM)
@ -916,10 +917,14 @@ assert (std::equal(replace_invalid_result.begin(), replace_invalid_result.end(),
 <pre>
 <span class="keyword">template</span> &lt;<span class=
 "keyword">typename</span> octet_iterator&gt; 
-<span class="keyword">bool</span> is_bom (octet_iterator it);
+<span class="keyword">bool</span> is_bom (octet_iterator it, octet_iterator end);
+<span class="keyword">template</span> &lt;<span class=
+"keyword">typename</span> octet_iterator&gt; 
+<span class="keyword">bool</span> is_bom (octet_iterator it); <span class="comment"> // Deprecated</span>
 </pre>
    <p>
      <code>it</code>: beginning of the 3-octet sequence to check<br>
+      <code>end</code>: past-the-end position of the sequence to check<br>
       <span class="return_value">Return value</span>: <code>true</code> if the sequence
      is UTF-8 byte order mark; <code>false</code> if not.
    </p>
@ -930,7 +935,7 @@ assert (std::equal(replace_invalid_result.begin(), replace_invalid_result.end(),
 <span class="keyword">unsigned char</span> byte_order_mark[] = {<span class=
 "literal">0xef</span>, <span class="literal">0xbb</span>, <span class=
 "literal">0xbf</span>};
-<span class="keyword">bool</span> bbom = is_bom(byte_order_mark);
+<span class="keyword">bool</span> bbom = is_bom(byte_order_mark, byte_order_mark + <span class="keyword">sizeof</span>(byte_order_mark));
 assert (bbom == <span class="literal">true</span>);
 </pre>
    <p>
@ -938,6 +943,11 @@ assert (bbom == <span class="literal">true</span>);
      they form the UTF-8 BOM, we want to skip them before processing the actual UTF-8
      encoded text.
    </p>
+    <p>
+      The older version of the function that takes only one argument is unsafe: if a sequence is 
+      shorter than three bytes, an invalid iterator will be dereferenced. Therefore it is deprecated
+      in favor of the safer version that takes the end of sequence as an argument.
+    </p>
    <h3 id="typesutf8">
      Types From utf8 Namespace
    </h3>
--- a/v2_0/source/utf8/core.h
+++ b/v2_0/source/utf8/core.h
@ -331,6 +331,17 @@ namespace internal
        return (find_invalid(start, end) == end);
    }

+    template <typename octet_iterator> // Range-checked BOM test: true only if [it, end) begins with the three octets bom[0], bom[1], bom[2].
+    inline bool is_bom (octet_iterator it, octet_iterator end) // it: first octet to examine (taken by value, so the caller's iterator is not advanced); end: one past the last octet
+    {
+        return ( // short-circuit && stops at the first failed test, so an iterator equal to end is never dereferenced
+            ((it != end) && (internal::mask8(*it++)) == bom[0]) && // first octet exists and matches bom[0]; post-increment moves to the second
+            ((it != end) && (internal::mask8(*it++)) == bom[1]) && // second octet exists and matches bom[1]; post-increment moves to the third
+            ((it != end) && (internal::mask8(*it))   == bom[2])    // third octet exists and matches bom[2]; no further advance needed
+           );
+    }
+	
+	// Deprecated in release 2.3: performs no range check; prefer is_bom(it, end).
    template <typename octet_iterator>
    inline bool is_bom (octet_iterator it)
    {
--- a/v2_0/test_drivers/smoke_test/test.cpp
+++ b/v2_0/test_drivers/smoke_test/test.cpp
@ -141,8 +141,13 @@ int main()

    //is_bom
    unsigned char byte_order_mark[] = {0xef, 0xbb, 0xbf};
-    bool bbom = is_bom(byte_order_mark);
+    bool bbom = is_bom(byte_order_mark, byte_order_mark + sizeof(byte_order_mark));
    assert (bbom == true);
+	bool no_bbom = is_bom(threechars, threechars + sizeof(threechars));
+	assert (no_bbom == false);
+	bool unsafe_bbom = is_bom(byte_order_mark);
+    assert (unsafe_bbom == true);
+
    
    //replace_invalid
    char invalid_sequence[] = "a\x80\xe0\xa0\xc0\xaf\xed\xa0\x80z";