Feature request 2857462: Proposed minor extension: safe version of is_bom
git-svn-id: http://svn.code.sf.net/p/utfcpp/code@111 a809a056-fc17-0410-9590-b4f493f8b08e
This commit is contained in:
parent
2ca22999da
commit
4bfad91501
3 changed files with 30 additions and 4 deletions
|
@ -908,7 +908,8 @@ assert (std::equal(replace_invalid_result.begin(), replace_invalid_result.end(),
|
|||
utf8::is_bom
|
||||
</h4>
|
||||
<p class="version">
|
||||
Available in version 1.0 and later.
|
||||
Available in version 1.0 and later (the deprecated one argument version) and in version 2.3
|
||||
and later (the two argument version).
|
||||
</p>
|
||||
<p>
|
||||
Checks whether a sequence of three octets is a UTF-8 byte order mark (BOM)
|
||||
|
@ -916,10 +917,14 @@ assert (std::equal(replace_invalid_result.begin(), replace_invalid_result.end(),
|
|||
<pre>
|
||||
<span class="keyword">template</span> <<span class=
|
||||
"keyword">typename</span> octet_iterator>
|
||||
<span class="keyword">bool</span> is_bom (octet_iterator it);
|
||||
<span class="keyword">bool</span> is_bom (octet_iterator it, octet_iterator end);
|
||||
<span class="keyword">template</span> <<span class=
|
||||
"keyword">typename</span> octet_iterator>
|
||||
<span class="keyword">bool</span> is_bom (octet_iterator it); <span class="comment"> // Deprecated</span>
|
||||
</pre>
|
||||
<p>
|
||||
<code>it</code>: beginning of the 3-octet sequence to check<br>
|
||||
<code>end</code>: past-the-end iterator of the sequence to check<br>
|
||||
<span class="return_value">Return value</span>: <code>true</code> if the sequence
|
||||
is a UTF-8 byte order mark; <code>false</code> if not.
|
||||
</p>
|
||||
|
@ -930,7 +935,7 @@ assert (std::equal(replace_invalid_result.begin(), replace_invalid_result.end(),
|
|||
<span class="keyword">unsigned char</span> byte_order_mark[] = {<span class=
|
||||
"literal">0xef</span>, <span class="literal">0xbb</span>, <span class=
|
||||
"literal">0xbf</span>};
|
||||
<span class="keyword">bool</span> bbom = is_bom(byte_order_mark);
|
||||
<span class="keyword">bool</span> bbom = is_bom(byte_order_mark, byte_order_mark + <span class="keyword">sizeof</span>(byte_order_mark));
|
||||
assert (bbom == <span class="literal">true</span>);
|
||||
</pre>
|
||||
<p>
|
||||
|
@ -938,6 +943,11 @@ assert (bbom == <span class="literal">true</span>);
|
|||
they form the UTF-8 BOM, we want to skip them before processing the actual UTF-8
|
||||
encoded text.
|
||||
</p>
|
||||
<p>
|
||||
The older version of the function that takes only one argument is unsafe: if a sequence is
|
||||
shorter than three bytes, an invalid iterator will be dereferenced. Therefore it is deprecated
|
||||
in favor of the safer version that takes the end of sequence as an argument.
|
||||
</p>
|
||||
<h3 id="typesutf8">
|
||||
Types From utf8 Namespace
|
||||
</h3>
|
||||
|
|
|
@ -331,6 +331,17 @@ namespace internal
|
|||
return (find_invalid(start, end) == end);
|
||||
}
|
||||
|
||||
// Bounds-checked BOM test: returns true only when the first three octets
// starting at `it` compare equal, in order, to bom[0], bom[1] and bom[2]
// after each octet is passed through internal::mask8.
// `it`  - iterator to the first octet to examine.
// `end` - end of the sequence; each `it != end` guard is evaluated before
//         the dereference beside it, so && short-circuiting prevents reading
//         an octet at `end`. A range shorter than three octets yields false.
// NOTE(review): `bom` and internal::mask8 are defined outside this hunk;
// presumably bom holds the UTF-8 BOM octets -- confirm against the header.
template <typename octet_iterator>
|
||||
inline bool is_bom (octet_iterator it, octet_iterator end)
|
||||
{
|
||||
return (
|
||||
((it != end) && (internal::mask8(*it++)) == bom[0]) &&
|
||||
((it != end) && (internal::mask8(*it++)) == bom[1]) &&
|
||||
((it != end) && (internal::mask8(*it)) == bom[2])
|
||||
);
|
||||
|
||||
}
|
||||
|
||||
//Deprecated in release 2.3
|
||||
template <typename octet_iterator>
|
||||
inline bool is_bom (octet_iterator it)
|
||||
{
|
||||
|
|
|
@ -141,8 +141,13 @@ int main()
|
|||
|
||||
//is_bom
|
||||
unsigned char byte_order_mark[] = {0xef, 0xbb, 0xbf};
|
||||
bool bbom = is_bom(byte_order_mark);
|
||||
bool bbom = is_bom(byte_order_mark, byte_order_mark + sizeof(byte_order_mark));
|
||||
assert (bbom == true);
|
||||
bool no_bbom = is_bom(threechars, threechars + sizeof(threechars));
|
||||
assert (no_bbom == false);
|
||||
bool unsafe_bbom = is_bom(byte_order_mark);
|
||||
assert (unsafe_bbom == true);
|
||||
|
||||
|
||||
//replace_invalid
|
||||
char invalid_sequence[] = "a\x80\xe0\xa0\xc0\xaf\xed\xa0\x80z";
|
||||
|
|
Loading…
Reference in a new issue