Feature request 2857462: Proposed minor extension: safe version of is_bom
git-svn-id: http://svn.code.sf.net/p/utfcpp/code@111 a809a056-fc17-0410-9590-b4f493f8b08e
This commit is contained in:
parent
2ca22999da
commit
4bfad91501
3 changed files with 30 additions and 4 deletions
|
@ -908,7 +908,8 @@ assert (std::equal(replace_invalid_result.begin(), replace_invalid_result.end(),
|
||||||
utf8::is_bom
|
utf8::is_bom
|
||||||
</h4>
|
</h4>
|
||||||
<p class="version">
|
<p class="version">
|
||||||
Available in version 1.0 and later.
|
Available in version 1.0 and later (the deprecated one-argument version) and in version 2.3
|
||||||
|
and later (the two-argument version).
|
||||||
</p>
|
</p>
|
||||||
<p>
|
<p>
|
||||||
Checks whether a sequence of three octets is a UTF-8 byte order mark (BOM)
|
Checks whether a sequence of three octets is a UTF-8 byte order mark (BOM)
|
||||||
|
@ -916,10 +917,14 @@ assert (std::equal(replace_invalid_result.begin(), replace_invalid_result.end(),
|
||||||
<pre>
|
<pre>
|
||||||
<span class="keyword">template</span> <<span class=
|
<span class="keyword">template</span> <<span class=
|
||||||
"keyword">typename</span> octet_iterator>
|
"keyword">typename</span> octet_iterator>
|
||||||
<span class="keyword">bool</span> is_bom (octet_iterator it);
|
<span class="keyword">bool</span> is_bom (octet_iterator it, octet_iterator end);
|
||||||
|
<span class="keyword">template</span> <<span class=
|
||||||
|
"keyword">typename</span> octet_iterator>
|
||||||
|
<span class="keyword">bool</span> is_bom (octet_iterator it); <span class="comment"> // Deprecated</span>
|
||||||
</pre>
|
</pre>
|
||||||
<p>
|
<p>
|
||||||
<code>it</code>: beginning of the 3-octet sequence to check<br>
|
<code>it</code>: beginning of the 3-octet sequence to check<br>
|
||||||
|
<code>end</code>: past-the-end of the sequence to check<br>
|
||||||
<span class="return_value">Return value</span>: <code>true</code> if the sequence
|
<span class="return_value">Return value</span>: <code>true</code> if the sequence
|
||||||
is a UTF-8 byte order mark; <code>false</code> if not.
|
is a UTF-8 byte order mark; <code>false</code> if not.
|
||||||
</p>
|
</p>
|
||||||
|
@ -930,7 +935,7 @@ assert (std::equal(replace_invalid_result.begin(), replace_invalid_result.end(),
|
||||||
<span class="keyword">unsigned char</span> byte_order_mark[] = {<span class=
|
<span class="keyword">unsigned char</span> byte_order_mark[] = {<span class=
|
||||||
"literal">0xef</span>, <span class="literal">0xbb</span>, <span class=
|
"literal">0xef</span>, <span class="literal">0xbb</span>, <span class=
|
||||||
"literal">0xbf</span>};
|
"literal">0xbf</span>};
|
||||||
<span class="keyword">bool</span> bbom = is_bom(byte_order_mark);
|
<span class="keyword">bool</span> bbom = is_bom(byte_order_mark, byte_order_mark + <span class="keyword">sizeof</span>(byte_order_mark));
|
||||||
assert (bbom == <span class="literal">true</span>);
|
assert (bbom == <span class="literal">true</span>);
|
||||||
</pre>
|
</pre>
|
||||||
<p>
|
<p>
|
||||||
|
@ -938,6 +943,11 @@ assert (bbom == <span class="literal">true</span>);
|
||||||
they form the UTF-8 BOM, we want to skip them before processing the actual UTF-8
|
they form the UTF-8 BOM, we want to skip them before processing the actual UTF-8
|
||||||
encoded text.
|
encoded text.
|
||||||
</p>
|
</p>
|
||||||
|
<p>
|
||||||
|
The older version of the function that takes only one argument is unsafe: if a sequence is
|
||||||
|
shorter than three bytes, an invalid iterator will be dereferenced. Therefore it is deprecated
|
||||||
|
in favor of the safer version that takes the end of sequence as an argument.
|
||||||
|
</p>
|
||||||
<h3 id="typesutf8">
|
<h3 id="typesutf8">
|
||||||
Types From utf8 Namespace
|
Types From utf8 Namespace
|
||||||
</h3>
|
</h3>
|
||||||
|
|
|
@ -331,6 +331,17 @@ namespace internal
|
||||||
return (find_invalid(start, end) == end);
|
return (find_invalid(start, end) == end);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
template <typename octet_iterator>
|
||||||
|
inline bool is_bom (octet_iterator it, octet_iterator end)
|
||||||
|
{
|
||||||
|
return (
|
||||||
|
((it != end) && (internal::mask8(*it++)) == bom[0]) &&
|
||||||
|
((it != end) && (internal::mask8(*it++)) == bom[1]) &&
|
||||||
|
((it != end) && (internal::mask8(*it)) == bom[2])
|
||||||
|
);
|
||||||
|
}
|
||||||
|
|
||||||
|
//Deprecated in release 2.3
|
||||||
template <typename octet_iterator>
|
template <typename octet_iterator>
|
||||||
inline bool is_bom (octet_iterator it)
|
inline bool is_bom (octet_iterator it)
|
||||||
{
|
{
|
||||||
|
|
|
@ -141,8 +141,13 @@ int main()
|
||||||
|
|
||||||
//is_bom
|
//is_bom
|
||||||
unsigned char byte_order_mark[] = {0xef, 0xbb, 0xbf};
|
unsigned char byte_order_mark[] = {0xef, 0xbb, 0xbf};
|
||||||
bool bbom = is_bom(byte_order_mark);
|
bool bbom = is_bom(byte_order_mark, byte_order_mark + sizeof(byte_order_mark));
|
||||||
assert (bbom == true);
|
assert (bbom == true);
|
||||||
|
bool no_bbom = is_bom(threechars, threechars + sizeof(threechars));
|
||||||
|
assert (no_bbom == false);
|
||||||
|
bool unsafe_bbom = is_bom(byte_order_mark);
|
||||||
|
assert (unsafe_bbom == true);
|
||||||
|
|
||||||
|
|
||||||
//replace_invalid
|
//replace_invalid
|
||||||
char invalid_sequence[] = "a\x80\xe0\xa0\xc0\xaf\xed\xa0\x80z";
|
char invalid_sequence[] = "a\x80\xe0\xa0\xc0\xaf\xed\xa0\x80z";
|
||||||
|
|
Loading…
Add table
Reference in a new issue