diff --git a/doc/utf8cpp.html b/doc/utf8cpp.html
index 069c2be..4bd700f 100644
--- a/doc/utf8cpp.html
+++ b/doc/utf8cpp.html
@@ -908,7 +908,8 @@ assert (std::equal(replace_invalid_result.begin(), replace_invalid_result.end(),
utf8::is_bom
- Available in version 1.0 and later.
+ Available in version 1.0 and later (the deprecated one-argument version) and in version 2.3
+ and later (the two-argument version).
Checks whether a sequence of three octets is a UTF-8 byte order mark (BOM)
@@ -916,10 +917,14 @@ assert (std::equal(replace_invalid_result.begin(), replace_invalid_result.end(),
template <typename octet_iterator>
-bool is_bom (octet_iterator it);
+bool is_bom (octet_iterator it, octet_iterator end);
+template <typename octet_iterator>
+bool is_bom (octet_iterator it);
it
: beginning of the 3-octet sequence to check
+ end
: past-the-end position of the sequence to check
Return value: true
if the sequence
is UTF-8 byte order mark; false
if not.
@@ -930,7 +935,7 @@ assert (std::equal(replace_invalid_result.begin(), replace_invalid_result.end(),
unsigned char byte_order_mark[] = {0xef, 0xbb, 0xbf};
-bool bbom = is_bom(byte_order_mark);
+bool bbom = is_bom(byte_order_mark, byte_order_mark + sizeof(byte_order_mark));
assert (bbom == true);
@@ -938,6 +943,11 @@ assert (bbom == true);
they form the UTF-8 BOM, we want to skip them before processing the actual UTF-8
encoded text.
+
+ The older version of the function that takes only one argument is unsafe: if a sequence is
+ shorter than three bytes, an invalid iterator will be dereferenced. Therefore it is deprecated
+ in favor of the safer version that takes the end of the sequence as an argument.
+
Types From utf8 Namespace
diff --git a/source/utf8/core.h b/source/utf8/core.h
index b4d4a1c..de65fbf 100644
--- a/source/utf8/core.h
+++ b/source/utf8/core.h
@@ -331,6 +331,17 @@ namespace internal
return (find_invalid(start, end) == end);
}
+ // Checks whether the octet range [it, end) begins with the three-octet UTF-8 BOM.
+ // Every dereference is guarded by a comparison against end, so a range shorter
+ // than three octets yields false instead of reading past the end.
+ template <typename octet_iterator>
+ inline bool is_bom (octet_iterator it, octet_iterator end)
+ {
+ return (
+ ((it != end) && (internal::mask8(*it++)) == bom[0]) &&
+ ((it != end) && (internal::mask8(*it++)) == bom[1]) &&
+ ((it != end) && (internal::mask8(*it)) == bom[2])
+ );
+ }
+
+ //Deprecated in release 2.3
template
inline bool is_bom (octet_iterator it)
{
diff --git a/test_drivers/smoke_test/test.cpp b/test_drivers/smoke_test/test.cpp
index 2f0c99c..c344a49 100644
--- a/test_drivers/smoke_test/test.cpp
+++ b/test_drivers/smoke_test/test.cpp
@@ -141,8 +141,13 @@ int main()
//is_bom
unsigned char byte_order_mark[] = {0xef, 0xbb, 0xbf};
- bool bbom = is_bom(byte_order_mark);
+ bool bbom = is_bom(byte_order_mark, byte_order_mark + sizeof(byte_order_mark));
assert (bbom == true);
+ bool no_bbom = is_bom(threechars, threechars + sizeof(threechars));
+ assert (no_bbom == false);
+ bool unsafe_bbom = is_bom(byte_order_mark);
+ assert (unsafe_bbom == true);
+
//replace_invalid
char invalid_sequence[] = "a\x80\xe0\xa0\xc0\xaf\xed\xa0\x80z";