From e628e3722381a732163019e44b6e195bdb826c47 Mon Sep 17 00:00:00 2001
From: ntrifunovic
-
@@ -326,6 +327,7 @@ uint32_t next(octet_iterator& it, octet_iterator end);
+
+
- Given a reference to an iterator pointing to an octet in a UTF-8 seqence, it
+ Given a reference to an iterator pointing to an octet in a UTF-8 sequence, it
decreases the iterator until it hits the beginning of the previous UTF-8 encoded
code point and returns the 32 bits representation of the code point.
+
+
+
+
+
+
+
+
+
+
+
+ Available in version 2.3 and later. Replaces deprecated
+ Checks whether an octet sequence starts with a UTF-8 byte order mark (BOM)
+
+
+ Example of use:
+
+ The typical use of this function is to check the first three bytes of a file. If
+ they form the UTF-8 BOM, we want to skip them before processing the actual UTF-8
+ encoded text.
+
- Available in version 1.0 and later (the deprecated one argument version) and in version 2.3
- and later (the two argument version).
+ Available in version 1.0 and later. Deprecated in version 2.3.
Checks whether a sequence of three octets is a UTF-8 byte order mark (BOM)
@@ -917,14 +973,11 @@ assert (std::equal(replace_invalid_result.begin(), replace_invalid_result.end(),
+
@@ -275,11 +275,12 @@ octet_iterator append(uint32_t cp, octet_iterator result);
cp
: A 32 bit integer representing a code point to append to the
+ octet_iterator
: an output iterator.
+ cp
: a 32 bit integer representing a code point to append to the
sequence.
- result
: An output iterator to the place in the sequence where to
+ result
: an output iterator to the place in the sequence where to
append the code point.
- Return value: An iterator pointing to the place
+ Return value: an iterator pointing to the place
after the newly appended sequence.
octet_iterator
: an input iterator.
it
: a reference to an iterator pointing to the beginning of an UTF-8
encoded code point. After the function returns, it is incremented to point to the
beginning of the next code point.
@@ -370,6 +372,7 @@ uint32_t peek_next(octet_iterator it, octet_iterator end);
octet_iterator
: an input iterator.
it
: an iterator pointing to the beginning of an UTF-8
encoded code point.
end
: end of the UTF-8 sequence to be processed. If it
@@ -400,7 +403,7 @@ assert (w == twochars);
Available in version 1.02 and later.
octet_iterator
: a bidirectional iterator.
it
: a reference pointing to an octet within a UTF-8 encoded string.
After the function returns, it is decremented to point to the beginning of the
previous code point.
@@ -469,6 +473,7 @@ uint32_t previous(octet_iterator& it, octet_iterator pass_start);
octet_iterator
: a random access iterator.
it
: a reference pointing to an octet within a UTF-8 encoded string.
After the function returns, it is decremented to point to the beginning of the
previous code point.
@@ -529,6 +534,8 @@ assert (w == twochars);
octet_iterator
: an input iterator.
+ distance_type
: an integral type convertible to octet_iterator
's difference type.
it
: a reference to an iterator pointing to the beginning of an UTF-8
encoded code point. After the function returns, it is incremented to point to the
nth following code point.
@@ -574,8 +581,9 @@ assert (w == twochars + 5);
octet_iterator
: an input iterator.
first
: an iterator to a beginning of a UTF-8 encoded code point.
- last
: an iterator to a "post-end" of the last UTF-8 encoded code
+ last
: an iterator to a "post-end" of the last UTF-8 encoded code
point in the sequence we are trying to determine the length. It can be the
beginning of a new code point, or not.
Return value the distance between the iterators,
@@ -619,6 +627,8 @@ octet_iterator utf16to8 (u16bit_iterator start, u16bit_iterator end, octet_itera
u16bit_iterator
: an input iterator.
+ octet_iterator
: an output iterator.
start
: an iterator pointing to the beginning of the UTF-16 encoded
string to convert.
end
: an iterator pointing to pass-the-end of the UTF-16 encoded
@@ -661,6 +671,8 @@ u16bit_iterator utf8to16 (octet_iterator start, octet_iterator end, u16bit_itera
octet_iterator
: an input iterator.
+ u16bit_iterator
: an output iterator.
start
: an iterator pointing to the beginning of the UTF-8 encoded
string to convert. < br /> end
: an iterator pointing to
pass-the-end of the UTF-8 encoded string to convert.
@@ -705,6 +717,8 @@ octet_iterator utf32to8 (u32bit_iterator start, u32bit_iterator end, octet_itera
octet_iterator
: an output iterator.
+ u32bit_iterator
: an input iterator.
start
: an iterator pointing to the beginning of the UTF-32 encoded
string to convert.
end
: an iterator pointing to pass-the-end of the UTF-32 encoded
@@ -747,6 +761,8 @@ u32bit_iterator utf8to32 (octet_iterator start, octet_iterator end, u32bit_itera
octet_iterator
: an input iterator.
+ u32bit_iterator
: an output iterator.
start
: an iterator pointing to the beginning of the UTF-8 encoded
string to convert.
end
: an iterator pointing to pass-the-end of the UTF-8 encoded string
@@ -787,6 +803,7 @@ assert (utf32result.size() == 2);
octet_iterator find_invalid(octet_iterator start, octet_iterator end);
octet_iterator
: an input iterator.
start
: an iterator pointing to the beginning of the UTF-8 string to
test for validity.
end
: an iterator pointing to pass-the-end of the UTF-8 string to test
@@ -827,6 +844,7 @@ assert (invalid == utf_invalid + 5);
octet_iterator
: an input iterator.
start
: an iterator pointing to the beginning of the UTF-8 string to
test for validity.
end
: an iterator pointing to pass-the-end of the UTF-8 string to test
@@ -868,6 +886,8 @@ output_iterator replace_invalid(octet_iterator start, octet_iterator end, output
octet_iterator
: an input iterator.
+ output_iterator
: an output iterator.
start
: an iterator pointing to the beginning of the UTF-8 string to
look for invalid UTF-8 sequences.
end
: an iterator pointing to pass-the-end of the UTF-8 string to look
@@ -904,12 +924,48 @@ assert (std::equal(replace_invalid_result.begin(), replace_invalid_result.end(),
If end
does not point to the past-of-end of a UTF-8 sequence, a
utf8::not_enough_room
exception is thrown.
+ utf8::starts_with_bom
+
+ is_bom()
function.
+
+template <typename octet_iterator>
+bool starts_with_bom (octet_iterator it, octet_iterator end);
+
+ octet_iterator
: an input iterator.
+ it
: beginning of the octet sequence to check
+ end
: past-the-end of the sequence to check
+ Return value: true
if the sequence
+ starts with a UTF-8 byte order mark; false
if not.
+
+unsigned char byte_order_mark[] = {0xef, 0xbb, 0xbf};
+bool bbom = starts_with_bom(byte_order_mark, byte_order_mark + sizeof(byte_order_mark));
+assert (bbom == true);
+
+
utf8::is_bom
starts_with_bom()
should be used
+ instead.
template <typename octet_iterator>
-bool is_bom (octet_iterator it, octet_iterator end);
-template <typename octet_iterator>
bool is_bom (octet_iterator it); // Deprecated
octet_iterator
: an input iterator.
it
: beginning of the 3-octet sequence to check
- end
: pass-end of the sequence to check
Return value: true
if the sequence
is UTF-8 byte order mark; false
if not.
- The older version of the function that takes only one argument is unsafe: if a sequence is
- shorter than three bytes, an invalid iterator will be dereferenced. Therefore it is deprecated
- in favor of the safer version that takes the end of sequence as an argument.
+ If a sequence is
+ shorter than three bytes, an invalid iterator will be dereferenced. Therefore, this function is deprecated
+ in favor of starts_with_bom()
that takes the end of the sequence as an argument.
+ Available in version 2.3 and later. +
++ Base class for the exceptions thrown by UTF CPP library functions. +
++class exception : public std::exception {}; ++
+ Example of use: +
++try { + code_that_uses_utf_cpp_library(); +} +catch(const utf8::exception& utfcpp_ex) { + cerr << utfcpp_ex.what(); +} ++ +
+ Available in version 1.0 and later. +
+
+ Thrown by UTF8 CPP functions such as advance
and next
if a UTF-8 sequence represents an invalid code point.
+
+class invalid_code_point : public exception { +public: + uint32_t code_point() const; +}; + ++
+ Member function code_point()
can be used to determine the invalid code point that
+ caused the exception to be thrown.
+
+ Available in version 1.0 and later. +
+
+ Thrown by UTF8 CPP functions such as next
and prior
if an invalid UTF-8 sequence
+ is detected during decoding.
+
+class invalid_utf8 : public exception { +public: + uint8_t utf8_octet() const; +}; ++ +
+ Member function utf8_octet()
can be used to determine the beginning of the byte
+ sequence that caused the exception to be thrown.
+
+ Available in version 1.0 and later. +
+
+ Thrown by UTF8 CPP function utf16to8
if an invalid UTF-16 sequence
+ is detected during decoding.
+
+class invalid_utf16 : public exception { +public: + uint16_t utf16_word() const; +}; ++ +
+ Member function utf16_word()
can be used to determine the UTF-16 code unit
+ that caused the exception to be thrown.
+
+ Available in version 1.0 and later. +
+
+ Thrown by UTF8 CPP functions such as next
if the end of the decoded UTF-8 sequence
+ was reached before the code point was decoded.
+
+class not_enough_room : public exception {}; +