diff --git a/doc/utf8cpp.html b/doc/utf8cpp.html index 069c2be..4bd700f 100644 --- a/doc/utf8cpp.html +++ b/doc/utf8cpp.html @@ -908,7 +908,8 @@ assert (std::equal(replace_invalid_result.begin(), replace_invalid_result.end(), utf8::is_bom

- Available in version 1.0 and later. + Available in version 1.0 and later (the deprecated one-argument version) and in version 2.3 + and later (the two-argument version).

Checks whether a sequence of three octets is a UTF-8 byte order mark (BOM) @@ -916,10 +917,14 @@ assert (std::equal(replace_invalid_result.begin(), replace_invalid_result.end(),

 template <typename octet_iterator> 
-bool is_bom (octet_iterator it);
+bool is_bom (octet_iterator it, octet_iterator end);
+template <typename octet_iterator> 
+bool is_bom (octet_iterator it);  // Deprecated
 

it: beginning of the 3-octet sequence to check
+ end: past-the-end of the sequence to check
Return value: true if the sequence is UTF-8 byte order mark; false if not.

@@ -930,7 +935,7 @@ assert (std::equal(replace_invalid_result.begin(), replace_invalid_result.end(), unsigned char byte_order_mark[] = {0xef, 0xbb, 0xbf}; -bool bbom = is_bom(byte_order_mark); +bool bbom = is_bom(byte_order_mark, byte_order_mark + sizeof(byte_order_mark)); assert (bbom == true);

@@ -938,6 +943,11 @@ assert (bbom == true); they form the UTF-8 BOM, we want to skip them before processing the actual UTF-8 encoded text.

+

+ The older version of the function that takes only one argument is unsafe: if a sequence is + shorter than three bytes, an invalid iterator will be dereferenced. Therefore, it is deprecated + in favor of the safer version that takes the end of the sequence as an argument. +

Types From utf8 Namespace

diff --git a/source/utf8/core.h b/source/utf8/core.h index b4d4a1c..de65fbf 100644 --- a/source/utf8/core.h +++ b/source/utf8/core.h @@ -331,6 +331,17 @@ namespace internal return (find_invalid(start, end) == end); } + template <typename octet_iterator> + inline bool is_bom (octet_iterator it, octet_iterator end) + { + return ( + ((it != end) && (internal::mask8(*it++)) == bom[0]) && + ((it != end) && (internal::mask8(*it++)) == bom[1]) && + ((it != end) && (internal::mask8(*it)) == bom[2]) + ); + } + + //Deprecated in release 2.3 template <typename octet_iterator> inline bool is_bom (octet_iterator it) { diff --git a/test_drivers/smoke_test/test.cpp b/test_drivers/smoke_test/test.cpp index 2f0c99c..c344a49 100644 --- a/test_drivers/smoke_test/test.cpp +++ b/test_drivers/smoke_test/test.cpp @@ -141,8 +141,13 @@ int main() //is_bom unsigned char byte_order_mark[] = {0xef, 0xbb, 0xbf}; - bool bbom = is_bom(byte_order_mark); + bool bbom = is_bom(byte_order_mark, byte_order_mark + sizeof(byte_order_mark)); assert (bbom == true); + bool no_bbom = is_bom(threechars, threechars + sizeof(threechars)); + assert (no_bbom == false); + bool unsafe_bbom = is_bom(byte_order_mark); + assert (unsafe_bbom == true); + //replace_invalid char invalid_sequence[] = "a\x80\xe0\xa0\xc0\xaf\xed\xa0\x80z";