Feature request 2857462: Proposed minor extension: safe version of is_bom

git-svn-id: http://svn.code.sf.net/p/utfcpp/code@111 a809a056-fc17-0410-9590-b4f493f8b08e
This commit is contained in:
ntrifunovic 2009-12-20 22:46:01 +00:00
parent ac756dc9d6
commit 656f3847e8
3 changed files with 30 additions and 4 deletions

View file

@ -908,7 +908,8 @@ assert (std::equal(replace_invalid_result.begin(), replace_invalid_result.end(),
utf8::is_bom
</h4>
<p class="version">
Available in version 1.0 and later.
Available in version 1.0 and later (the deprecated one-argument version) and in version 2.3
and later (the two-argument version).
</p>
<p>
Checks whether a sequence of three octets is a UTF-8 byte order mark (BOM)
@ -916,10 +917,14 @@ assert (std::equal(replace_invalid_result.begin(), replace_invalid_result.end(),
<pre>
<span class="keyword">template</span> &lt;<span class=
"keyword">typename</span> octet_iterator&gt;
<span class="keyword">bool</span> is_bom (octet_iterator it);
<span class="keyword">bool</span> is_bom (octet_iterator it, octet_iterator end);
<span class="keyword">template</span> &lt;<span class=
"keyword">typename</span> octet_iterator&gt;
<span class="keyword">bool</span> is_bom (octet_iterator it); <span class="comment"> // Deprecated</span>
</pre>
<p>
<code>it</code>: beginning of the 3-octet sequence to check<br>
<code>end</code>: past-the-end iterator of the sequence to check<br>
<span class="return_value">Return value</span>: <code>true</code> if the sequence
is a UTF-8 byte order mark; <code>false</code> if not.
</p>
@ -930,7 +935,7 @@ assert (std::equal(replace_invalid_result.begin(), replace_invalid_result.end(),
<span class="keyword">unsigned char</span> byte_order_mark[] = {<span class=
"literal">0xef</span>, <span class="literal">0xbb</span>, <span class=
"literal">0xbf</span>};
<span class="keyword">bool</span> bbom = is_bom(byte_order_mark);
<span class="keyword">bool</span> bbom = is_bom(byte_order_mark, byte_order_mark + <span class="keyword">sizeof</span>(byte_order_mark));
assert (bbom == <span class="literal">true</span>);
</pre>
<p>
@ -938,6 +943,11 @@ assert (bbom == <span class="literal">true</span>);
they form the UTF-8 BOM, we want to skip them before processing the actual UTF-8
encoded text.
</p>
<p>
The older version of the function that takes only one argument is unsafe: if a sequence is
shorter than three bytes, an invalid iterator will be dereferenced. Therefore it is deprecated
in favor of the safer version that takes the end of the sequence as an argument.
</p>
<h3 id="typesutf8">
Types From utf8 Namespace
</h3>

View file

@ -331,6 +331,17 @@ namespace internal
return (find_invalid(start, end) == end);
}
// Range-checked BOM test: returns true only when the range [it, end) starts
// with the three octets bom[0], bom[1], bom[2]. Before every dereference the
// iterator is compared against end, so a range holding fewer than three octets
// yields false instead of reading past its end.
template <typename octet_iterator>
inline bool is_bom (octet_iterator it, octet_iterator end)
{
    // First octet: read it and step forward in one go.
    if (it == end)
        return false;
    if (internal::mask8(*it++) != bom[0])
        return false;

    // Second octet, handled the same way.
    if (it == end)
        return false;
    if (internal::mask8(*it++) != bom[1])
        return false;

    // Third octet: nothing follows, so the iterator is not advanced.
    if (it == end)
        return false;
    return internal::mask8(*it) == bom[2];
}
//Deprecated in release 2.3
template <typename octet_iterator>
inline bool is_bom (octet_iterator it)
{

View file

@ -141,8 +141,13 @@ int main()
//is_bom
unsigned char byte_order_mark[] = {0xef, 0xbb, 0xbf};
bool bbom = is_bom(byte_order_mark);
bool bbom = is_bom(byte_order_mark, byte_order_mark + sizeof(byte_order_mark));
assert (bbom == true);
bool no_bbom = is_bom(threechars, threechars + sizeof(threechars));
assert (no_bbom == false);
bool unsafe_bbom = is_bom(byte_order_mark);
assert (unsafe_bbom == true);
//replace_invalid
char invalid_sequence[] = "a\x80\xe0\xa0\xc0\xaf\xed\xa0\x80z";