Feature request 2857462: Proposed minor extension: safe version of is_bom

git-svn-id: http://svn.code.sf.net/p/utfcpp/code@111 a809a056-fc17-0410-9590-b4f493f8b08e
This commit is contained in:
ntrifunovic 2009-12-20 22:46:01 +00:00 committed by King_DuckZ
parent 2ca22999da
commit 4bfad91501
3 changed files with 30 additions and 4 deletions

View file

@ -908,7 +908,8 @@ assert (std::equal(replace_invalid_result.begin(), replace_invalid_result.end(),
utf8::is_bom utf8::is_bom
</h4> </h4>
<p class="version"> <p class="version">
Available in version 1.0 and later. Available in version 1.0 and later (the deprecated one argument version) and in version 2.3
and later (the two argument version).
</p> </p>
<p> <p>
Checks whether a sequence of three octets is a UTF-8 byte order mark (BOM) Checks whether a sequence of three octets is a UTF-8 byte order mark (BOM)
@ -916,10 +917,14 @@ assert (std::equal(replace_invalid_result.begin(), replace_invalid_result.end(),
<pre> <pre>
<span class="keyword">template</span> &lt;<span class= <span class="keyword">template</span> &lt;<span class=
"keyword">typename</span> octet_iterator&gt; "keyword">typename</span> octet_iterator&gt;
<span class="keyword">bool</span> is_bom (octet_iterator it); <span class="keyword">bool</span> is_bom (octet_iterator it, octet_iterator end);
<span class="keyword">template</span> &lt;<span class=
"keyword">typename</span> octet_iterator&gt;
<span class="keyword">bool</span> is_bom (octet_iterator it); <span class="comment"> // Deprecated</span>
</pre> </pre>
<p> <p>
<code>it</code>: beginning of the 3-octet sequence to check<br> <code>it</code>: beginning of the 3-octet sequence to check<br>
<code>end</code>: past-the-end of the sequence to check<br>
<span class="return_value">Return value</span>: <code>true</code> if the sequence <span class="return_value">Return value</span>: <code>true</code> if the sequence
is UTF-8 byte order mark; <code>false</code> if not. is UTF-8 byte order mark; <code>false</code> if not.
</p> </p>
@ -930,7 +935,7 @@ assert (std::equal(replace_invalid_result.begin(), replace_invalid_result.end(),
<span class="keyword">unsigned char</span> byte_order_mark[] = {<span class= <span class="keyword">unsigned char</span> byte_order_mark[] = {<span class=
"literal">0xef</span>, <span class="literal">0xbb</span>, <span class= "literal">0xef</span>, <span class="literal">0xbb</span>, <span class=
"literal">0xbf</span>}; "literal">0xbf</span>};
<span class="keyword">bool</span> bbom = is_bom(byte_order_mark); <span class="keyword">bool</span> bbom = is_bom(byte_order_mark, byte_order_mark + <span class="keyword">sizeof</span>(byte_order_mark));
assert (bbom == <span class="literal">true</span>); assert (bbom == <span class="literal">true</span>);
</pre> </pre>
<p> <p>
@ -938,6 +943,11 @@ assert (bbom == <span class="literal">true</span>);
they form the UTF-8 BOM, we want to skip them before processing the actual UTF-8 they form the UTF-8 BOM, we want to skip them before processing the actual UTF-8
encoded text. encoded text.
</p> </p>
<p>
The older version of the function that takes only one argument is unsafe: if the sequence is
shorter than three bytes, an invalid iterator will be dereferenced. It is therefore deprecated
in favor of the safer version that takes the end of the sequence as an argument.
</p>
<h3 id="typesutf8"> <h3 id="typesutf8">
Types From utf8 Namespace Types From utf8 Namespace
</h3> </h3>

View file

@ -331,6 +331,17 @@ namespace internal
return (find_invalid(start, end) == end); return (find_invalid(start, end) == end);
} }
/// Bounds-checked test for a leading byte order mark.
///
/// Compares up to three octets starting at `it` against bom[0], bom[1] and
/// bom[2], in that order. `it` is compared with `end` before every
/// dereference, so a range shorter than three octets yields false instead of
/// reading past the end.
///
/// @param it   start of the octet sequence to inspect
/// @param end  end of that sequence; `it` is never dereferenced once it equals `end`
/// @return true if the first three octets match bom[0..2]; false otherwise
template <typename octet_iterator>
inline bool is_bom (octet_iterator it, octet_iterator end)
{
    // First octet: bail out on an empty range or a mismatch.
    if (it == end || internal::mask8(*it++) != bom[0])
        return false;

    // Second octet: same check after stepping past the first one.
    if (it == end || internal::mask8(*it++) != bom[1])
        return false;

    // Third octet: only read here, the iterator is not advanced past it.
    return (it != end) && (internal::mask8(*it) == bom[2]);
}
//Deprecated in release 2.3
template <typename octet_iterator> template <typename octet_iterator>
inline bool is_bom (octet_iterator it) inline bool is_bom (octet_iterator it)
{ {

View file

@ -141,8 +141,13 @@ int main()
//is_bom //is_bom
unsigned char byte_order_mark[] = {0xef, 0xbb, 0xbf}; unsigned char byte_order_mark[] = {0xef, 0xbb, 0xbf};
bool bbom = is_bom(byte_order_mark); bool bbom = is_bom(byte_order_mark, byte_order_mark + sizeof(byte_order_mark));
assert (bbom == true); assert (bbom == true);
bool no_bbom = is_bom(threechars, threechars + sizeof(threechars));
assert (no_bbom == false);
bool unsafe_bbom = is_bom(byte_order_mark);
assert (unsafe_bbom == true);
//replace_invalid //replace_invalid
char invalid_sequence[] = "a\x80\xe0\xa0\xc0\xaf\xed\xa0\x80z"; char invalid_sequence[] = "a\x80\xe0\xa0\xc0\xaf\xed\xa0\x80z";