diff --git a/doc/utf8cpp.html b/doc/utf8cpp.html index e6ba254..c915572 100644 --- a/doc/utf8cpp.html +++ b/doc/utf8cpp.html @@ -1,97 +1,131 @@ - + -
- - - -Many C++ developers miss an easy and portable way of handling -Unicode encoded strings. C++ Standard is currently Unicode -agnostic, and while some work is being done to introduce Unicode to -the next incarnation called C++0x, for the moment nothing of the -sort is available. In the meantime, developers use 3rd party -libraries like ICU, OS specific capabilities, or simply roll out -their own solutions.
-In order to easily handle UTF-8 encoded Unicode strings, I have -come up with a set of template functions. For anybody used to work -with STL algorithms, they should be easy and natural to use. The -code is freely available for any purpose - check out the license at -the beginning of the utf8.h file. Be aware, though, that while I -did some testing, this library has not been used in production yet. -If you run into bugs or performance issues, please let me know and -I'll do my best to address them.
-The purpose of this article is not to offer an introduction to -Unicode in general, and UTF-8 in particular. If you are not -familiar with Unicode, be sure to check out Unicode Home Page or some other -source of information for Unicode. Also, it is not my aim to -advocate the use of UTF-8 encoded strings in C++ programs; if you -want to handle UTF-8 encoded strings from C++, I am sure you have -good reasons for it.
-To illustrate the use of this utf8 library, we shall open a file
-containing UTF-8 encoded text, check whether it starts with a byte order mark,
-read each line into a std::string
, check it for validity, convert the text to UTF-16,
-and back to UTF-8:
-#include <fstream> -#include <iostream> -#include <string> -#include <vector> -using namespace std; - -int main() -{ - if (argc != 2) { - cout << "\nUsage: docsample filename\n"; - return 0; + + + + ++ UTF8-CPP: UTF-8 with C++ in a Portable Way + + + + ++ The Sourceforge project page +
++ Table of Contents +
+
+ Many C++ developers miss an easy and portable way of handling Unicode encoded + strings. C++ Standard is currently Unicode agnostic, and while some work is being + done to introduce Unicode to the next incarnation called C++0x, for the moment + nothing of the sort is available. In the meantime, developers use 3rd party + libraries like ICU, OS specific capabilities, or simply roll out their own + solutions. +
++ In order to easily handle UTF-8 encoded Unicode strings, I have come up with a set + of template functions. For anybody used to working with STL algorithms, they should be + easy and natural to use. The code is freely available for any purpose - check out + the license at the beginning of the utf8.h file. Be aware, though, that while I did + some testing, this library has not been used in production yet. If you run into + bugs or performance issues, please let me know and I'll do my best to address them. +
++ The purpose of this article is not to offer an introduction to Unicode in general, + and UTF-8 in particular. If you are not familiar with Unicode, be sure to check out + Unicode Home Page or some other source of + information for Unicode. Also, it is not my aim to advocate the use of UTF-8 + encoded strings in C++ programs; if you want to handle UTF-8 encoded strings from + C++, I am sure you have good reasons for it. +
+
+ To illustrate the use of this utf8 library, we shall open a file containing UTF-8
+ encoded text, check whether it starts with a byte order mark, read each line into a
+ std::string
, check it for validity, convert the text to UTF-16, and
+ back to UTF-8:
+
+#include <fstream> +#include <iostream> +#include <string> +#include <vector> +using namespace std; +int main() +{ + if (argc != 2) { + cout << "\nUsage: docsample filename\n"; + return 0; + } + const char* test_file_path = argv[1]; // Open the test file (must be UTF-8 encoded) ifstream fs8(test_file_path); - if (!fs8.is_open()) { + if (!fs8.is_open()) { cout << "Could not open " << test_file_path << endl; - return 0; + return 0; } - // Read the first line of the file - unsigned line_count = 1; + unsigned line_count = 1; string line; - if (!getline(fs8, line)) - return 0; - + if (!getline(fs8, line)) + return 0; // Look for utf-8 byte-order mark at the beginning - if (line.size() > 2) { - if (utf8::is_bom(line.c_str())) - cout << "There is a byte order mark at the beginning of the file\n"; + if (line.size() > 2) { + if (utf8::is_bom(line.c_str())) + cout << "There is a byte order mark at the beginning of the file\n"; } - // Play with all the lines in the file - do { + do { // check for invalid utf-8 (for a simple yes/no check, there is also utf8::is_valid function) string::iterator end_it = utf8::find_invalid(line.begin(), line.end()); - if (end_it != line.end()) { + if (end_it != line.end()) { cout << "Invalid UTF-8 encoding detected at line " << line_count << "\n"; cout << "This part is fine: " << string(line.begin(), end_it) << "\n"; } // Get the line length (at least for the valid part) - int length = utf8::distance(line.begin(), end_it); + int length = utf8::distance(line.begin(), end_it); cout << "Length of line " << line_count << " is " << length << "\n"; - // Convert it to utf-16 vector<unsigned short> utf16line; utf8::utf8to16(line.begin(), end_it, back_inserter(utf16line)); @@ -99,663 +133,846 @@ int main() string utf8line; utf8::utf16to8(utf16line.begin(), utf16line.end(), back_inserter(utf8line)); // Confirm that the conversion went OK: - if (utf8line != string(line.begin(), end_it)) + if (utf8line != string(line.begin(), end_it)) cout << "Error in 
UTF-16 conversion at line: " << line_count << "\n"; - getline(fs8, line); line_count++; - } while (!fs8.eof()); - - return 0; + } while (!fs8.eof()); + return 0; }-
In the previous code sample, we have seen the use of the following functions
-from utf8
namespace: first we used is_bom
-function to detect UTF-8 byte order mark at the beginning of the
-file; then for each line we performed a detection of invalid UTF-8 sequences with find_invalid
;
-the number of characters (more precisely - the number of Unicode code points) in each line was determined
-with a use of utf8::distance
; finally, we have converted each line to UTF-16 encoding with
-utf8to16
and back to UTF-8 with utf16to8
.
-
Encodes a 32 bit code point as a UTF-8 sequence of octets and -appends the sequence to a UTF-8 string.
-template <typename octet_iterator> octet_iterator
-append(uint32_t cp, octet_iterator result);
-cp
: A 32 bit integer representing a code point to
-append to the sequence.
-result
: An output iterator to the place in the
-sequence where to append the code point.
-Return value: An iterator pointing to the place after the
-newly appended sequence.
Example of use:
+
+ In the previous code sample, we have seen the use of the following functions from
+ utf8
namespace: first we used is_bom
function to detect
+ UTF-8 byte order mark at the beginning of the file; then for each line we performed
+ a detection of invalid UTF-8 sequences with find_invalid
; the number
+ of characters (more precisely - the number of Unicode code points) in each line was
+ determined with a use of utf8::distance
; finally, we have converted
+ each line to UTF-16 encoding with utf8to16
and back to UTF-8 with
+ utf16to8
.
+
+ Encodes a 32 bit code point as a UTF-8 sequence of octets and appends the sequence + to a UTF-8 string. +
+
+template <typename octet_iterator>
+octet_iterator append(uint32_t cp, octet_iterator result);
+
+
+ cp
: A 32 bit integer representing a code point to append to the
+ sequence.
+ result
: An output iterator to the place in the sequence where to
+ append the code point.
+ Return value: An iterator pointing to the place after the newly appended
+ sequence.
+
+ Example of use: +
-unsigned char u[5] = {0,0,0,0,0}; - -unsigned char* end = append(0x0448, u); - +unsigned char u[5] = {0,0,0,0,0}; +unsigned char* end = append(0x0448, u); assert (u[0] == 0xd1 && u[1] == 0x88 && u[2] == 0 && u[3] == 0 && u[4] == 0);-
Note that append
does not allocate any memory - it
-is the burden of the caller to make sure there is enough memory
-allocated for the operation. To make things more interesting,
-append
can add anywhere between 1 and 4 octets to the
-sequence. In practice, you would most often want to use
-std::back_inserter
to ensure that the necessary memory
-is allocated.
In case of an invalid code point, a
-utf8::invalid_code_point
exception is thrown.
Given the iterator to the beginning of the UTF-8 sequence, it -returns the code point and moves the iterator to the next -position.
-template <typename octet_iterator> uint32_t
-next(octet_iterator& it, octet_iterator end);
-it
: a reference to an iterator pointing to the
-beginning of an UTF-8 encoded code point. After the function
-returns, it is incremented to point to the beginning of the next
-code point.
-end
: end of the UTF-8 sequence to be processed. If
-it
gets equal to end
during the
-extraction of a code point, an utf8::not_enough_room
-exception is thrown.
-Return value: the 32 bit representation of the processed
-UTF-8 code point.
Example of use:
+
+ Note that append
does not allocate any memory - it is the burden of
+ the caller to make sure there is enough memory allocated for the operation. To make
+ things more interesting, append
can add anywhere between 1 and 4
+ octets to the sequence. In practice, you would most often want to use
+ std::back_inserter
to ensure that the necessary memory is allocated.
+
+ In case of an invalid code point, a utf8::invalid_code_point
exception
+ is thrown.
+
+ Given the iterator to the beginning of the UTF-8 sequence, it returns the code + point and moves the iterator to the next position. +
+template <typename octet_iterator> uint32_t next(octet_iterator& it,
+ octet_iterator end);
+
+ it
: a reference to an iterator pointing to the beginning of an UTF-8
+ encoded code point. After the function returns, it is incremented to point to the
+ beginning of the next code point.
+ end
: end of the UTF-8 sequence to be processed. If it
+ gets equal to end
during the extraction of a code point, an
+ utf8::not_enough_room
exception is thrown.
+ Return value: the 32 bit representation of the processed UTF-8 code point.
+
+ Example of use: +
-char* twochars = "\xe6\x97\xa5\xd1\x88"; -char* w = twochars; - -int cp = next(w, twochars + 6); - +char* twochars = "\xe6\x97\xa5\xd1\x88"; +char* w = twochars; +int cp = next(w, twochars + 6); assert (cp == 0x65e5); assert (w == twochars + 3);-
This function is typically used to iterate through a UTF-8 -encoded string.
-In case of an invalid UTF-8 seqence, a
-utf8::invalid_utf8
exception is thrown.
Given a reference to an iterator pointing to an octet in a UTF-8 -seqence, it decreases the iterator until it hits the beginning of -the previous UTF-8 encoded code point and returns the 32 bits -representation of the code point.
-template <typename octet_iterator> uint32_t
-previous(octet_iterator& it, octet_iterator pass_start);
-it
: a reference pointing to an octet within a UTF-8
-encoded string. After the function returns, it is decremented to
-point to the beginning of the previous code point.
-pass_start
: an iterator to the point in the sequence
-where the search for the beginning of a code point is aborted if no
-result was reached. It is a safety measure to prevent passing the
-beginning of the string in the search for a UTF-8 lead octet.
-Return value: the 32 bit representation of the previous code
-point.
Example of use:
++ This function is typically used to iterate through a UTF-8 encoded string. +
+
+ In case of an invalid UTF-8 sequence, a utf8::invalid_utf8
exception is
+ thrown.
+
+ Given a reference to an iterator pointing to an octet in a UTF-8 sequence, it + decreases the iterator until it hits the beginning of the previous UTF-8 encoded + code point and returns the 32 bit representation of the code point. +
+template <typename octet_iterator> uint32_t previous(octet_iterator&
+ it, octet_iterator pass_start);
+
+ it
: a reference pointing to an octet within a UTF-8 encoded string.
+ After the function returns, it is decremented to point to the beginning of the
+ previous code point.
+ pass_start
: an iterator to the point in the sequence where the search
+ for the beginning of a code point is aborted if no result was reached. It is a
+ safety measure to prevent passing the beginning of the string in the search for a
+ UTF-8 lead octet.
+ Return value: the 32 bit representation of the previous code point.
+
+ Example of use: +
-char* twochars = "\xe6\x97\xa5\xd1\x88"; -unsigned char* w = twochars + 3; - -int cp = previous (w, twochars - 1); - +char* twochars = "\xe6\x97\xa5\xd1\x88"; +unsigned char* w = twochars + 3; +int cp = previous (w, twochars - 1); assert (cp == 0x65e5); assert (w == twochars);-
The primary purpose of this function is to iterate backwards
-through a UTF-8 encoded string. Therefore, it
will
-typically point to the beginning of a code point, and
-pass_start
will point to the octet just before the
-beginning of the string to ensure we don't go backwards too far.
-it
is decreased until it points to a lead UTF-8 octet,
-and then the UTF-8 sequence beginning with that octet is decoded to
-a 32 bit representation and returned.
In case pass_end
is reached before a UTF-8 lead
-octet is hit, or if an invalid UTF-8 sequence is started by the
-lead octet, an invalid_utf8
exception is thrown
Advances an iterator by the specified number of code points -within an UTF-8 sequence.
-template <typename octet_iterator, typename
-distance_type> void advance (octet_iterator& it,
-distance_type n, octet_iterator end);
-it
: a reference to an iterator pointing to the
-beginning of an UTF-8 encoded code point. After the function
-returns, it is incremented to point to the nth following code
-point.
-n
: a positive integer that shows how many code points
-we want to advance.
-end
: end of the UTF-8 sequence to be processed. If
-it
gets equal to end
during the
-extraction of a code point, an utf8::not_enough_room
-exception is thrown.
Example of use:
+
+ The primary purpose of this function is to iterate backwards through a UTF-8
+ encoded string. Therefore, it
will typically point to the beginning of
+ a code point, and pass_start
will point to the octet just before the
+ beginning of the string to ensure we don't go backwards too far. it
is
+ decreased until it points to a lead UTF-8 octet, and then the UTF-8 sequence
+ beginning with that octet is decoded to a 32 bit representation and returned.
+
+ In case pass_start
is reached before a UTF-8 lead octet is hit, or if an
+ invalid UTF-8 sequence is started by the lead octet, an invalid_utf8
+ exception is thrown.
+
+ Advances an iterator by the specified number of code points within an UTF-8 + sequence. +
+template <typename octet_iterator, typename distance_type> void advance
+ (octet_iterator& it, distance_type n, octet_iterator end);
+
+ it
: a reference to an iterator pointing to the beginning of an UTF-8
+ encoded code point. After the function returns, it is incremented to point to the
+ nth following code point.
+ n
: a positive integer that shows how many code points we want to
+ advance.
+ end
: end of the UTF-8 sequence to be processed. If it
+ gets equal to end
during the extraction of a code point, an
+ utf8::not_enough_room
exception is thrown.
+
+ Example of use: +
-char* twochars = "\xe6\x97\xa5\xd1\x88"; -unsigned char* w = twochars; - +char* twochars = "\xe6\x97\xa5\xd1\x88"; +unsigned char* w = twochars; advance (w, 2, twochars + 6); - assert (w == twochars + 5);-
This function works only "forward". In case of a negative
-n
, there is no effect.
In case of an invalid code point, a
-utf8::invalid_code_point
exception is thrown.
Given the iterators to two UTF-8 encoded code points in a -seqence, returns the number of code points between them.
-template <typename octet_iterator> typename
-std::iterator_traits<octet_iterator>::difference_type
-distance (octet_iterator first, octet_iterator last);
-first
: an iterator to a beginning of a UTF-8
-encoded code point.
-last
: an iterator to a "post-end" of the last UTF-8
-encoded code point in the sequence we are trying to determine the
-length. It can be the beginning of a new code point, or not.
-Return value the distance between the iterators, in code
-points.
Example of use:
+
+ This function works only "forward". In case of a negative n
, there is
+ no effect.
+
+ In case of an invalid code point, a utf8::invalid_code_point
exception
+ is thrown.
+
+ Given the iterators to two UTF-8 encoded code points in a sequence, returns the + number of code points between them. +
+template <typename octet_iterator> typename
+ std::iterator_traits<octet_iterator>::difference_type distance (octet_iterator
+ first, octet_iterator last);
+
+ first
: an iterator to a beginning of a UTF-8 encoded code point.
+ last
: an iterator to a "post-end" of the last UTF-8 encoded code point
+ in the sequence we are trying to determine the length. It can be the beginning of a
+ new code point, or not.
+ Return value: the distance between the iterators, in code points.
+
+ Example of use: +
-char* twochars = "\xe6\x97\xa5\xd1\x88";
-
+char* twochars = "\xe6\x97\xa5\xd1\x88";
size_t dist = utf8::distance(twochars, twochars + 5);
-
assert (dist == 2);
-This function is used to find the length (in code points) of a
-UTF-8 encoded string. The reason it is called distance,
-rather than, say, length is mainly because developers are
-used that length is an O(1) function. Computing the length
-of an UTF-8 string is a linear operation, and it looked better to
-model it after std::distance
algorithm.
In case of an invalid UTF-8 seqence, a
-utf8::invalid_utf8
exception is thrown. If
-last
does not point to the past-of-end of a UTF-8
-seqence, a utf8::not_enough_room
exception is
-thrown.
Converts a UTF-16 encoded string to UTF-8.
-template <typename u16bit_iterator, typename
-octet_iterator> octet_iterator utf16to8 (u16bit_iterator start,
-u16bit_iterator end, octet_iterator result);
-start
: an iterator pointing to the beginning of the
-UTF-16 encoded string to convert.
-end
: an iterator pointing to pass-the-end of the
-UTF-16 encoded string to convert.
-result
: an output iterator to the place in the UTF-8
-string where to append the result of conversion.
-Return value: An iterator pointing to the place after the appended UTF-8 string.
Example of use:
+
+ This function is used to find the length (in code points) of a UTF-8 encoded
+ string. The reason it is called distance, rather than, say,
+ length is mainly because developers are used to length being an
+ O(1) function. Computing the length of an UTF-8 string is a linear operation, and
+ it looked better to model it after std::distance
algorithm.
+
+ In case of an invalid UTF-8 sequence, a utf8::invalid_utf8
exception is
+ thrown. If last
does not point to the past-of-end of a UTF-8 sequence,
+ a utf8::not_enough_room
exception is thrown.
+
+ Converts a UTF-16 encoded string to UTF-8. +
+template <typename u16bit_iterator, typename octet_iterator>
+ octet_iterator utf16to8 (u16bit_iterator start, u16bit_iterator end, octet_iterator
+ result);
+
+ start
: an iterator pointing to the beginning of the UTF-16 encoded
+ string to convert.
+ end
: an iterator pointing to past-the-end of the UTF-16 encoded string
+ to convert.
+ result
: an output iterator to the place in the UTF-8 string where to
+ append the result of conversion.
+ Return value: An iterator pointing to the place after the appended UTF-8
+ string.
+
+ Example of use: +
-unsigned short utf16string[] = {0x41, 0x0448, 0x65e5, 0xd834, 0xdd1e}; -vector<unsigned char> utf8result; - +unsigned short utf16string[] = {0x41, 0x0448, 0x65e5, 0xd834, 0xdd1e}; +vector<unsigned char> utf8result; utf16to8(utf16string, utf16string + 5, back_inserter(utf8result)); - assert (utf8result.size() == 10);-
In case of invalid UTF-16 sequence, a
-utf8::invalid_utf16
exception is thrown.
Converts an UTF-8 encoded string to UTF-16
-template <typename u16bit_iterator, typename
-octet_iterator> u16bit_iterator utf8to16 (octet_iterator start,
-octet_iterator end, u16bit_iterator result);
-start
: an iterator pointing to the beginning of the
-UTF-8 encoded string to convert. < br /> end
: an
-iterator pointing to pass-the-end of the UTF-8 encoded string to
-convert.
-result
: an output iterator to the place in the UTF-16
-string where to append the result of conversion.
-Return value: An iterator pointing to the place after the appended UTF-16 string.
Example of use:
+
+ In case of invalid UTF-16 sequence, a utf8::invalid_utf16
exception is
+ thrown.
+
+ Converts an UTF-8 encoded string to UTF-16 +
+template <typename u16bit_iterator, typename octet_iterator>
+ u16bit_iterator utf8to16 (octet_iterator start, octet_iterator end, u16bit_iterator
+ result);
+
+ start
: an iterator pointing to the beginning of the UTF-8 encoded
+ string to convert. <br /> end
: an iterator pointing to
+ past-the-end of the UTF-8 encoded string to convert.
+ result
: an output iterator to the place in the UTF-16 string where to
+ append the result of conversion.
+ Return value: An iterator pointing to the place after the appended UTF-16
+ string.
+
+ Example of use: +
-char utf8_with_surrogates[] = "\xe6\x97\xa5\xd1\x88\xf0\x9d\x84\x9e"; -vector <unsigned short> utf16result; - +char utf8_with_surrogates[] = "\xe6\x97\xa5\xd1\x88\xf0\x9d\x84\x9e"; +vector <unsigned short> utf16result; utf8to16(utf8_with_surrogates, utf8_with_surrogates + 9, back_inserter(utf16result)); - assert (utf16result.size() == 4); assert (utf16result[2] == 0xd834); assert (utf16result[3] == 0xdd1e);-
In case of an invalid UTF-8 seqence, a
-utf8::invalid_utf8
exception is thrown. If
-end
does not point to the past-of-end of a UTF-8
-seqence, a utf8::not_enough_room
exception is
-thrown.
Converts a UTF-32 encoded string to UTF-8.
-template <typename octet_iterator, typename
-u32bit_iterator> octet_iterator utf32to8 (u32bit_iterator start,
-u32bit_iterator end, octet_iterator result);
-start
: an iterator pointing to the beginning of the
-UTF-32 encoded string to convert.
-end
: an iterator pointing to pass-the-end of the
-UTF-32 encoded string to convert.
-result
: an output iterator to the place in the UTF-8
-string where to append the result of conversion.
-Return value: An iterator pointing to the place after the appended UTF-8 string.
Example of use:
+
+ In case of an invalid UTF-8 sequence, a utf8::invalid_utf8
exception is
+ thrown. If end
does not point to the past-of-end of a UTF-8 sequence, a
+ utf8::not_enough_room
exception is thrown.
+
+ Converts a UTF-32 encoded string to UTF-8. +
+template <typename octet_iterator, typename u32bit_iterator>
+ octet_iterator utf32to8 (u32bit_iterator start, u32bit_iterator end, octet_iterator
+ result);
+
+ start
: an iterator pointing to the beginning of the UTF-32 encoded
+ string to convert.
+ end
: an iterator pointing to past-the-end of the UTF-32 encoded string
+ to convert.
+ result
: an output iterator to the place in the UTF-8 string where to
+ append the result of conversion.
+ Return value: An iterator pointing to the place after the appended UTF-8
+ string.
+
+ Example of use: +
-int utf32string[] = {0x448, 0x65E5, 0x10346, 0}; -vector<unsigned char> utf8result; - +int utf32string[] = {0x448, 0x65E5, 0x10346, 0}; +vector<unsigned char> utf8result; utf32to8(utf32string, utf32string + 3, back_inserter(utf8result)); - assert (utf8result.size() == 9);-
In case of invalid UTF-32 string, a
-utf8::invalid_code_point
exception is thrown.
Converts a UTF-8 encoded string to UTF-32.
-template <typename octet_iterator, typename
-u32bit_iterator> u32bit_iterator utf8to32 (octet_iterator start,
-octet_iterator end, u32bit_iterator result);
-start
: an iterator pointing to the beginning of the
-UTF-8 encoded string to convert.
-end
: an iterator pointing to pass-the-end of the UTF-8
-encoded string to convert.
-result
: an output iterator to the place in the UTF-32
-string where to append the result of conversion.
-Return value: An iterator pointing to the place after the appended UTF-32 string.
Example of use:
+
+ In case of invalid UTF-32 string, a utf8::invalid_code_point
exception
+ is thrown.
+
+ Converts a UTF-8 encoded string to UTF-32. +
+template <typename octet_iterator, typename u32bit_iterator>
+ u32bit_iterator utf8to32 (octet_iterator start, octet_iterator end, u32bit_iterator
+ result);
+
+ start
: an iterator pointing to the beginning of the UTF-8 encoded
+ string to convert.
+ end
: an iterator pointing to past-the-end of the UTF-8 encoded string
+ to convert.
+ result
: an output iterator to the place in the UTF-32 string where to
+ append the result of conversion.
+ Return value: An iterator pointing to the place after the appended UTF-32
+ string.
+
+ Example of use: +
-char* twochars = "\xe6\x97\xa5\xd1\x88"; -vector<int> utf32result; - +char* twochars = "\xe6\x97\xa5\xd1\x88"; +vector<int> utf32result; utf8to32(twochars, twochars + 5, back_inserter(utf32result)); - assert (utf32result.size() == 2);-
In case of an invalid UTF-8 seqence, a
-utf8::invalid_utf8
exception is thrown. If
-end
does not point to the past-of-end of a UTF-8
-seqence, a utf8::not_enough_room
exception is
-thrown.
Detects an invalid sequence within a UTF-8 string.
-template <typename octet_iterator> octet_iterator
-find_invalid(octet_iterator start, octet_iterator end);
-start
: an iterator pointing to the beginning of the
-UTF-8 string to test for validity.
-end
: an iterator pointing to pass-the-end of the UTF-8
-string to test for validity.
-Return value: an iterator pointing to the first invalid
-octet in the UTF-8 string. In case none were found, equals
-end
.
Example of use:
+
+ In case of an invalid UTF-8 sequence, a utf8::invalid_utf8
exception is
+ thrown. If end
does not point to the past-of-end of a UTF-8 sequence, a
+ utf8::not_enough_room
exception is thrown.
+
+ Detects an invalid sequence within a UTF-8 string. +
+template <typename octet_iterator> octet_iterator
+ find_invalid(octet_iterator start, octet_iterator end);
+
+ start
: an iterator pointing to the beginning of the UTF-8 string to
+ test for validity.
+ end
: an iterator pointing to past-the-end of the UTF-8 string to test
+ for validity.
+ Return value: an iterator pointing to the first invalid octet in the UTF-8
+ string. In case none were found, equals end
.
+
+ Example of use: +
-char utf_invalid[] = "\xe6\x97\xa5\xd1\x88\xfa"; - -char* invalid = find_invalid(utf_invalid, utf_invalid + 6); - +char utf_invalid[] = "\xe6\x97\xa5\xd1\x88\xfa"; +char* invalid = find_invalid(utf_invalid, utf_invalid + 6); assert (invalid == utf_invalid + 5);-
This function is typically used to make sure a UTF-8 string is -valid before processing it with other functions. It is especially -important to call it if before doing any of the unchecked -operations on it.
-Checks whether a sequence of octets is a valid UTF-8 string.
-template <typename octet_iterator> bool
-is_valid(octet_iterator start, octet_iterator end);
-start
: an iterator pointing to the beginning of the
-UTF-8 string to test for validity.
-end
: an iterator pointing to pass-the-end of the UTF-8
-string to test for validity.
-Return value: true
if the sequence is a valid
-UTF-8 string; false
if not.
+ This function is typically used to make sure a UTF-8 string is valid before + processing it with other functions. It is especially important to call it before + doing any of the unchecked operations on it. +
++ Checks whether a sequence of octets is a valid UTF-8 string. +
+template <typename octet_iterator> bool is_valid(octet_iterator start,
+ octet_iterator end);
+
+ start
: an iterator pointing to the beginning of the UTF-8 string to
+ test for validity.
+ end
: an iterator pointing to past-the-end of the UTF-8 string to test
+ for validity.
+ Return value: true
if the sequence is a valid UTF-8 string;
+ false
if not.
+
-char utf_invalid[] = "\xe6\x97\xa5\xd1\x88\xfa"; - -bool bvalid = is_valid(utf_invalid, utf_invalid + 6); - +char utf_invalid[] = "\xe6\x97\xa5\xd1\x88\xfa"; +bool bvalid = is_valid(utf_invalid, utf_invalid + 6); assert (bvalid == false);-
is_valid
is a shorthand for
-find_invalid(start, end) == end;
. You may want to use
-it to make sure that a byte seqence is a valid UTF-8 string without
-the need to know where it fails if it is not valid.
Replaces all invalid UTF-8 sequences within a string with a replacement marker.
-template <typename octet_iterator, typename output_iterator>
-output_iterator replace_invalid(octet_iterator start, octet_iterator end, output_iterator out, uint32_t replacement);
-
template <typename octet_iterator, typename output_iterator>
-output_iterator replace_invalid(octet_iterator start, octet_iterator end, output_iterator out);
-
-start
: an iterator pointing to the beginning of the
-UTF-8 string to look for invalid UTF-8 sequences.
-end
: an iterator pointing to pass-the-end of the UTF-8
-string to look for invalid UTF-8 sequences.
-out
: An output iterator to the range where the result of replacement is stored.
-replacement
: A Unicode code point for the replacement marker. The version without this
-parameter assumes the value 0xfffd
-Return value: An iterator pointing to the place after the UTF-8 string with
-replaced invalid sequences.
Example of use:
+
+ is_valid
is a shorthand for find_invalid(start, end) ==
+ end;
. You may want to use it to make sure that a byte sequence is a valid
+ UTF-8 string without the need to know where it fails if it is not valid.
+
+ Replaces all invalid UTF-8 sequences within a string with a replacement marker. +
+
+ template <typename octet_iterator, typename output_iterator>
+ output_iterator replace_invalid(octet_iterator start, octet_iterator end,
+ output_iterator out, uint32_t replacement);
+
+ template <typename octet_iterator, typename output_iterator>
+ output_iterator replace_invalid(octet_iterator start, octet_iterator end,
+ output_iterator out);
+
+ start
: an iterator pointing to the beginning of the UTF-8 string to
+ look for invalid UTF-8 sequences.
+ end
: an iterator pointing to past-the-end of the UTF-8 string to look
+ for invalid UTF-8 sequences.
+ out
: An output iterator to the range where the result of replacement
+ is stored.
+ replacement
: A Unicode code point for the replacement marker. The
+ version without this parameter assumes the value 0xfffd
+ Return value: An iterator pointing to the place after the UTF-8 string with
+ replaced invalid sequences.
+
+ Example of use: +
-char invalid_sequence[] = "a\x80\xe0\xa0\xc0\xaf\xed\xa0\x80z"; -vector<char> replace_invalid_result; - +char invalid_sequence[] = "a\x80\xe0\xa0\xc0\xaf\xed\xa0\x80z"; +vector<char> replace_invalid_result; replace_invalid (invalid_sequence, invalid_sequence + sizeof(invalid_sequence), back_inserter(replace_invalid_result), '?'); - bvalid = is_valid(replace_invalid_result.begin(), replace_invalid_result.end()); assert (bvalid); -char* fixed_invalid_sequence = "a????z"; +char* fixed_invalid_sequence = "a????z"; assert (std::equal(replace_invalid_result.begin(), replace_invalid_result.end(), fixed_invalid_sequence));-
-replace_invalid
does not perform in-place replacement of invalid sequences. Rather, it produces a copy
-of the original string with the invalid sequences replaced with a replacement marker. Therefore, out
must
-not be in the [start, end]
range.
-
If end
does not point to the past-of-end of a UTF-8 sequence, a utf8::not_enough_room
-exception is thrown.
Checks whether a sequence of three octets is a UTF-8 byte order -mark (BOM)
-template <typename octet_iterator> bool is_bom
-(octet_iterator it);
-it
: beginning of the 3-octet sequence to check
-Return value: true
if the sequence is UTF-8
-byte order mark; false
if not.
Example of use:
+
+ replace_invalid
does not perform in-place replacement of invalid
+ sequences. Rather, it produces a copy of the original string with the invalid
+ sequences replaced with a replacement marker. Therefore, out
must not
+ be in the [start, end]
range.
+
+ If end
does not point to the past-the-end of a UTF-8 sequence, a
+ utf8::not_enough_room
exception is thrown.
+
+ Checks whether a sequence of three octets is a UTF-8 byte order mark (BOM) +
+template <typename octet_iterator> bool is_bom (octet_iterator
+ it);
+
+ it
: beginning of the 3-octet sequence to check
+ Return value: true
if the sequence is UTF-8 byte order mark;
+ false
if not.
+
+ Example of use: +
-unsigned char byte_order_mark[] = {0xef, 0xbb, 0xbf}; - -bool bbom = is_bom(byte_order_mark); - +unsigned char byte_order_mark[] = {0xef, 0xbb, 0xbf}; +bool bbom = is_bom(byte_order_mark); assert (bbom == true);-
The typical use of this function is to check the first three -bytes of a file. If they form the UTF-8 BOM, we want to skip them -before processing the actual UTF-8 encoded text.
-Encodes a 32 bit code point as a UTF-8 sequence of octets and -appends the sequence to a UTF-8 string.
-template <typename octet_iterator> octet_iterator
-append(uint32_t cp, octet_iterator result);
-cp
: A 32 bit integer representing a code point to
-append to the sequence.
-result
: An output iterator to the place in the
-sequence where to append the code point.
-Return value: An iterator pointing to the place after the
-newly appended sequence.
Example of use:
++ The typical use of this function is to check the first three bytes of a file. If + they form the UTF-8 BOM, we want to skip them before processing the actual UTF-8 + encoded text. +
++ Encodes a 32 bit code point as a UTF-8 sequence of octets and appends the sequence + to a UTF-8 string. +
+template <typename octet_iterator> octet_iterator append(uint32_t cp,
+ octet_iterator result);
+
+ cp
: A 32 bit integer representing a code point to append to the
+ sequence.
+ result
: An output iterator to the place in the sequence where to
+ append the code point.
+ Return value: An iterator pointing to the place after the newly appended
+ sequence.
+
+ Example of use: +
-unsigned char u[5] = {0,0,0,0,0}; - -unsigned char* end = unchecked::append(0x0448, u); - +unsigned char u[5] = {0,0,0,0,0}; +unsigned char* end = unchecked::append(0x0448, u); assert (u[0] == 0xd1 && u[1] == 0x88 && u[2] == 0 && u[3] == 0 && u[4] == 0);-
This is a quicker but less safe version of
-utf8::append
. It does not check for validity of the
-supplied code point, and may produce an invalid UTF-8 sequence.
Given the iterator to the beginning of a UTF-8 sequence, it -returns the code point and moves the iterator to the next -position.
-template <typename octet_iterator> uint32_t
-next(octet_iterator& it);
-it
: a reference to an iterator pointing to the
-beginning of an UTF-8 encoded code point. After the function
-returns, it is incremented to point to the beginning of the next
-code point.
-Return value: the 32 bit representation of the processed
-UTF-8 code point.
Example of use:
+
+ This is a quicker but less safe version of utf8::append
. It does not
+ check for validity of the supplied code point, and may produce an invalid UTF-8
+ sequence.
+
+ Given the iterator to the beginning of a UTF-8 sequence, it returns the code point + and moves the iterator to the next position. +
+template <typename octet_iterator> uint32_t next(octet_iterator&
+ it);
+
+ it
: a reference to an iterator pointing to the beginning of a UTF-8
+ encoded code point. After the function returns, it is incremented to point to the
+ beginning of the next code point.
+ Return value: the 32 bit representation of the processed UTF-8 code point.
+
+ Example of use: +
-char* twochars = "\xe6\x97\xa5\xd1\x88"; -char* w = twochars; - -int cp = unchecked::next(w); - +char* twochars = "\xe6\x97\xa5\xd1\x88"; +char* w = twochars; +int cp = unchecked::next(w); assert (cp == 0x65e5); assert (w == twochars + 3);-
This is a quicker but less safe version of
-utf8::next
. It does not check for validity of the
-supplied UTF-8 sequence.
Given a reference to an iterator pointing to an octet in a UTF-8 -seqence, it decreases the iterator until it hits the beginning of -the previous UTF-8 encoded code point and returns the 32 bits -representation of the code point.
-template <typename octet_iterator> uint32_t
-previous(octet_iterator& it);
-it
: a reference pointing to an octet within a UTF-8
-encoded string. After the function returns, it is decremented to
-point to the beginning of the previous code point.
-Return value: the 32 bit representation of the previous code
-point.
Example of use:
+
+ This is a quicker but less safe version of utf8::next
. It does not
+ check for validity of the supplied UTF-8 sequence.
+
+ Given a reference to an iterator pointing to an octet in a UTF-8 sequence, it + decreases the iterator until it hits the beginning of the previous UTF-8 encoded + code point and returns the 32 bit representation of the code point. +

+template <typename octet_iterator> uint32_t previous(octet_iterator&
+ it);
+
+ it
: a reference pointing to an octet within a UTF-8 encoded string.
+ After the function returns, it is decremented to point to the beginning of the
+ previous code point.
+ Return value: the 32 bit representation of the previous code point.
+
+ Example of use: +
-char* twochars = "\xe6\x97\xa5\xd1\x88"; -char* w = twochars + 3; - -int cp = unchecked::previous (w); - +char* twochars = "\xe6\x97\xa5\xd1\x88"; +char* w = twochars + 3; +int cp = unchecked::previous (w); assert (cp == 0x65e5); assert (w == twochars);-
This is a quicker but less safe version of
-utf8::previous
. It does not check for validity of the
-supplied UTF-8 sequence and offers no boundary checking.
Advances an iterator by the specified number of code points -within an UTF-8 sequence.
-template <typename octet_iterator, typename
-distance_type> void advance (octet_iterator& it,
-distance_type n);
-it
: a reference to an iterator pointing to the
-beginning of an UTF-8 encoded code point. After the function
-returns, it is incremented to point to the nth following code
-point.
-n
: a positive integer that shows how many code points
-we want to advance.
Example of use:
+
+ This is a quicker but less safe version of utf8::previous
. It does not
+ check for validity of the supplied UTF-8 sequence and offers no boundary checking.
+
+ Advances an iterator by the specified number of code points within a UTF-8 + sequence. +

+template <typename octet_iterator, typename distance_type> void advance
+ (octet_iterator& it, distance_type n);
+
+ it
: a reference to an iterator pointing to the beginning of a UTF-8
+ encoded code point. After the function returns, it is incremented to point to the
+ nth following code point.
+ n
: a positive integer that shows how many code points we want to
+ advance.
+
+ Example of use: +
-char* twochars = "\xe6\x97\xa5\xd1\x88"; -char* w = twochars; - +char* twochars = "\xe6\x97\xa5\xd1\x88"; +char* w = twochars; unchecked::advance (w, 2); - assert (w == twochars + 5);-
This function works only "forward". In case of a negative
-n
, there is no effect.
This is a quicker but less safe version of
-utf8::advance
. It does not check for validity of the
-supplied UTF-8 sequence and offers no boundary checking.
Given the iterators to two UTF-8 encoded code points in a -seqence, returns the number of code points between them.
-template <typename octet_iterator> typename
-std::iterator_traits<octet_iterator>::difference_type
-distance (octet_iterator first, octet_iterator last);
-first
: an iterator to a beginning of a UTF-8
-encoded code point.
-last
: an iterator to a "post-end" of the last UTF-8
-encoded code point in the sequence we are trying to determine the
-length. It can be the beginning of a new code point, or not.
-Return value the distance between the iterators, in code
-points.
Example of use:
+
+ This function works only "forward". In case of a negative n
, there is
+ no effect.
+
+ This is a quicker but less safe version of utf8::advance
. It does not
+ check for validity of the supplied UTF-8 sequence and offers no boundary checking.
+
+ Given the iterators to two UTF-8 encoded code points in a sequence, returns the + number of code points between them. +

+template <typename octet_iterator> typename
+ std::iterator_traits<octet_iterator>::difference_type distance (octet_iterator
+ first, octet_iterator last);
+
+ first
: an iterator to a beginning of a UTF-8 encoded code point.
+ last
: an iterator to a "post-end" of the last UTF-8 encoded code point
+ in the sequence whose length we are trying to determine. It can be the beginning of a
+ new code point, or not.
+ Return value: the distance between the iterators, in code points.
+
+ Example of use: +
-char* twochars = "\xe6\x97\xa5\xd1\x88";
-
+char* twochars = "\xe6\x97\xa5\xd1\x88";
size_t dist = utf8::unchecked::distance(twochars, twochars + 5);
-
assert (dist == 2);
-This is a quicker but less safe version of
-utf8::distance
. It does not check for validity of the
-supplied UTF-8 sequence.
Converts a UTF-16 encoded string to UTF-8.
-template <typename u16bit_iterator, typename
-octet_iterator> octet_iterator utf16to8 (u16bit_iterator start,
-u16bit_iterator end, octet_iterator result);
-start
: an iterator pointing to the beginning of the
-UTF-16 encoded string to convert.
-end
: an iterator pointing to pass-the-end of the
-UTF-16 encoded string to convert.
-result
: an output iterator to the place in the UTF-8
-string where to append the result of conversion.
-Return value: An iterator pointing to the place after the appended UTF-8 string.
Example of use:
+
+ This is a quicker but less safe version of utf8::distance
. It does not
+ check for validity of the supplied UTF-8 sequence.
+
+ Converts a UTF-16 encoded string to UTF-8. +
+template <typename u16bit_iterator, typename octet_iterator>
+ octet_iterator utf16to8 (u16bit_iterator start, u16bit_iterator end, octet_iterator
+ result);
+
+ start
: an iterator pointing to the beginning of the UTF-16 encoded
+ string to convert.
+ end
: an iterator pointing to past-the-end of the UTF-16 encoded string
+ to convert.
+ result
: an output iterator to the place in the UTF-8 string where to
+ append the result of conversion.
+ Return value: An iterator pointing to the
+ place after the appended UTF-8 string.
+
+ Example of use: +
-unsigned short utf16string[] = {0x41, 0x0448, 0x65e5, 0xd834, 0xdd1e}; -vector<unsigned char> utf8result; - +unsigned short utf16string[] = {0x41, 0x0448, 0x65e5, 0xd834, 0xdd1e}; +vector<unsigned char> utf8result; unchecked::utf16to8(utf16string, utf16string + 5, back_inserter(utf8result)); - assert (utf8result.size() == 10);-
This is a quicker but less safe version of
-utf8::utf16to8
. It does not check for validity of the
-supplied UTF-16 sequence.
Converts an UTF-8 encoded string to UTF-16
-template <typename u16bit_iterator, typename
-octet_iterator> u16bit_iterator utf8to16 (octet_iterator start,
-octet_iterator end, u16bit_iterator result);
-start
: an iterator pointing to the beginning of the
-UTF-8 encoded string to convert. < br /> end
: an
-iterator pointing to pass-the-end of the UTF-8 encoded string to
-convert.
-result
: an output iterator to the place in the UTF-16
-string where to append the result of conversion.
-Return value: An iterator pointing to the place after the appended UTF-16 string.
-
Example of use:
+
+ This is a quicker but less safe version of utf8::utf16to8
. It does not
+ check for validity of the supplied UTF-16 sequence.
+
+ Converts a UTF-8 encoded string to UTF-16. +

+template <typename u16bit_iterator, typename octet_iterator>
+ u16bit_iterator utf8to16 (octet_iterator start, octet_iterator end, u16bit_iterator
+ result);
+
+ start
: an iterator pointing to the beginning of the UTF-8 encoded
+ string to convert.
+ end
: an iterator pointing to
+ past-the-end of the UTF-8 encoded string to convert.
+ result
: an output iterator to the place in the UTF-16 string where to
+ append the result of conversion.
+ Return value: An iterator pointing to the place after the appended UTF-16
+ string.
+
+ Example of use: +
-char utf8_with_surrogates[] = "\xe6\x97\xa5\xd1\x88\xf0\x9d\x84\x9e"; -vector <unsigned short> utf16result; - +char utf8_with_surrogates[] = "\xe6\x97\xa5\xd1\x88\xf0\x9d\x84\x9e"; +vector <unsigned short> utf16result; unchecked::utf8to16(utf8_with_surrogates, utf8_with_surrogates + 9, back_inserter(utf16result)); - assert (utf16result.size() == 4); assert (utf16result[2] == 0xd834); assert (utf16result[3] == 0xdd1e);-
This is a quicker but less safe version of
-utf8::utf8to16
. It does not check for validity of the
-supplied UTF-8 sequence.
Converts a UTF-32 encoded string to UTF-8.
-template <typename octet_iterator, typename
-u32bit_iterator> octet_iterator utf32to8 (u32bit_iterator start,
-u32bit_iterator end, octet_iterator result);
-start
: an iterator pointing to the beginning of the
-UTF-32 encoded string to convert.
-end
: an iterator pointing to pass-the-end of the
-UTF-32 encoded string to convert.
-result
: an output iterator to the place in the UTF-8
-string where to append the result of conversion.
-Return value: An iterator pointing to the place after the appended UTF-8 string.
-
Example of use:
+
+ This is a quicker but less safe version of utf8::utf8to16
. It does not
+ check for validity of the supplied UTF-8 sequence.
+
+ Converts a UTF-32 encoded string to UTF-8. +
+template <typename octet_iterator, typename u32bit_iterator>
+ octet_iterator utf32to8 (u32bit_iterator start, u32bit_iterator end, octet_iterator
+ result);
+
+ start
: an iterator pointing to the beginning of the UTF-32 encoded
+ string to convert.
+ end
: an iterator pointing to past-the-end of the UTF-32 encoded string
+ to convert.
+ result
: an output iterator to the place in the UTF-8 string where to
+ append the result of conversion.
+ Return value: An iterator pointing to the place after the appended UTF-8
+ string.
+
+ Example of use: +
-int utf32string[] = {0x448, 0x65E5, 0x10346, 0}; -vector<unsigned char> utf8result; - +int utf32string[] = {0x448, 0x65E5, 0x10346, 0}; +vector<unsigned char> utf8result; utf32to8(utf32string, utf32string + 3, back_inserter(utf8result)); - assert (utf8result.size() == 9);-
This is a quicker but less safe version of
-utf8::utf32to8
. It does not check for validity of the
-supplied UTF-32 sequence.
Converts a UTF-8 encoded string to UTF-32.
-template <typename octet_iterator, typename
-u32bit_iterator> u32bit_iterator utf8to32 (octet_iterator start,
-octet_iterator end, u32bit_iterator result);
-start
: an iterator pointing to the beginning of the
-UTF-8 encoded string to convert.
-end
: an iterator pointing to pass-the-end of the UTF-8
-encoded string to convert.
-result
: an output iterator to the place in the UTF-32
-string where to append the result of conversion.
-Return value: An iterator pointing to the place after the appended UTF-32 string.
-
Example of use:
+
+ This is a quicker but less safe version of utf8::utf32to8
. It does not
+ check for validity of the supplied UTF-32 sequence.
+
+ Converts a UTF-8 encoded string to UTF-32. +
+template <typename octet_iterator, typename u32bit_iterator>
+ u32bit_iterator utf8to32 (octet_iterator start, octet_iterator end, u32bit_iterator
+ result);
+
+ start
: an iterator pointing to the beginning of the UTF-8 encoded
+ string to convert.
+ end
: an iterator pointing to past-the-end of the UTF-8 encoded string
+ to convert.
+ result
: an output iterator to the place in the UTF-32 string where to
+ append the result of conversion.
+ Return value: An iterator pointing to the place after the appended UTF-32
+ string.
+
+ Example of use: +
-char* twochars = "\xe6\x97\xa5\xd1\x88"; -vector<int> utf32result; - +char* twochars = "\xe6\x97\xa5\xd1\x88"; +vector<int> utf32result; unchecked::utf8to32(twochars, twochars + 5, back_inserter(utf32result)); - assert (utf32result.size() == 2);-
This is a quicker but less safe version of
-utf8::utf8to32
. It does not check for validity of the
-supplied UTF-8 sequence.
The library was designed to be:
-In case you want to look into other means of working with UTF-8 -strings from C++, here is the list of solutions I am aware of:
-std::string
. If you prefer to have yet another
-string class in your code, it may be worth a look. Be aware of the
-licensing issues, though.Until Unicode becomes officially recognized by the C++ Standard -Library, we need to use other means to work with UTF-8 strings. -Template functions I describe in this article may be a good step in -this direction.
-
+ This is a quicker but less safe version of utf8::utf8to32
. It does not
+ check for validity of the supplied UTF-8 sequence.
+
+ The library was designed to be: +
++ In case you want to look into other means of working with UTF-8 strings from C++, + here is the list of solutions I am aware of: +
+std::string
. If you prefer to have yet another string class in your
+ code, it may be worth a look. Be aware of the licensing issues, though.
+ + Until Unicode becomes officially recognized by the C++ Standard Library, we need to + use other means to work with UTF-8 strings. Template functions I describe in this + article may be a good step in this direction. +
+