diff --git a/v1_0/doc/utf8cpp.html b/v1_0/doc/utf8cpp.html index d97edf4..b18a8ae 100644 --- a/v1_0/doc/utf8cpp.html +++ b/v1_0/doc/utf8cpp.html @@ -1,724 +1,1097 @@ - + -
- - - -Many C++ developers miss an easy and portable way of handling -Unicode encoded strings. C++ Standard is currently Unicode -agnostic, and while some work is being done to introduce Unicode to -the next incarnation called C++0x, for the moment nothing of the -sort is available. In the meantime, developers use 3rd party -libraries like ICU, OS specific capabilities, or simply roll out -their own solutions.
-In order to easily handle UTF-8 encoded Unicode strings, I have -come up with a set of template functions. For anybody used to work -with STL algorithms, they should be easy and natural to use. The -code is freely available for any purpose - check out the license at -the beginning of the utf8.h file. Be aware, though, that while I -did some testing, this library has not been used in production yet. -If you run into bugs or performance issues, please let me know and -I'll do my best to address them.
-The purpose of this article is not to offer an introduction to -Unicode in general, and UTF-8 in particular. If you are not -familiar with Unicode, be sure to check out Unicode Home Page or some other -source of information for Unicode. Also, it is not my aim to -advocate the use of UTF-8 encoded strings in C++ programs; if you -want to handle UTF-8 encoded strings from C++, I am sure you have -good reasons for it.
-To illustrate the use of this utf8 library, we shall open a file
-containing UTF-8 encoded text, check whether it starts with a byte order mark,
-read each line into a std::string
, check it for validity, convert the text to UTF-16,
-and back to UTF-8:
+ The Sourceforge project page +
++ Many C++ developers miss an easy and portable way of handling Unicode encoded + strings. C++ Standard is currently Unicode agnostic, and while some work is being + done to introduce Unicode to the next incarnation called C++0x, for the moment + nothing of the sort is available. In the meantime, developers use 3rd party + libraries like ICU, OS specific capabilities, or simply roll out their own + solutions. +
++ In order to easily handle UTF-8 encoded Unicode strings, I have come up with a set + of template functions. For anybody used to working with STL algorithms, they should be + easy and natural to use. The code is freely available for any purpose - check out + the license at the beginning of the utf8.h file. Be aware, though, that while I did + some testing, this library has not been used in production yet. If you run into + bugs or performance issues, please let me know and I'll do my best to address them. +
++ The purpose of this article is not to offer an introduction to Unicode in general, + and UTF-8 in particular. If you are not familiar with Unicode, be sure to check out + Unicode Home Page or some other source of + information for Unicode. Also, it is not my aim to advocate the use of UTF-8 + encoded strings in C++ programs; if you want to handle UTF-8 encoded strings from + C++, I am sure you have good reasons for it. +
+
+ To illustrate the use of this utf8 library, we shall open a file containing UTF-8
+ encoded text, check whether it starts with a byte order mark, read each line into a
+ std::string
, check it for validity, convert the text to UTF-16, and
+ back to UTF-8:
+
-#include <fstream> -#include <iostream> -#include <string> -#include <vector> -using namespace std; - -int main() +#include <fstream> +#include <iostream> +#include <string> +#include <vector> +using namespace std; +int main() { - if (argc != 2) { - cout << "\nUsage: docsample filename\n"; - return 0; + if (argc != 2) { + cout << "\nUsage: docsample filename\n"; + return 0; } - const char* test_file_path = argv[1]; - // Open the test file (must be UTF-8 encoded) + const char* test_file_path = argv[1]; + // Open the test file (must be UTF-8 encoded) ifstream fs8(test_file_path); - if (!fs8.is_open()) { - cout << "Could not open " << test_file_path << endl; - return 0; + if (!fs8.is_open()) { + cout << "Could not open " << test_file_path << endl; + return 0; } - - // Read the first line of the file - unsigned line_count = 1; + // Read the first line of the file + unsigned line_count = 1; string line; - if (!getline(fs8, line)) - return 0; - - // Look for utf-8 byte-order mark at the beginning - if (line.size() > 2) { - if (utf8::is_bom(line.c_str())) - cout << "There is a byte order mark at the beginning of the file\n"; + if (!getline(fs8, line)) + return 0; + // Look for utf-8 byte-order mark at the beginning + if (line.size() > 2) { + if (utf8::is_bom(line.c_str())) + cout << "There is a byte order mark at the beginning of the file\n"; } - - // Play with all the lines in the file - do { + // Play with all the lines in the file + do { // check for invalid utf-8 (for a simple yes/no check, there is also utf8::is_valid function) string::iterator end_it = utf8::find_invalid(line.begin(), line.end()); - if (end_it != line.end()) { - cout << "Invalid UTF-8 encoding detected at line " << line_count << "\n"; - cout << "This part is fine: " << string(line.begin(), end_it) << "\n"; + if (end_it != line.end()) { + cout << "Invalid UTF-8 encoding detected at line " << line_count << "\n"; + cout << "This part is fine: " << string(line.begin(), end_it) << "\n"; } - // Get the 
line length (at least for the valid part) - int length = utf8::distance(line.begin(), end_it); - cout << "Length of line " << line_count << " is " << length << "\n"; - - // Convert it to utf-16 + // Get the line length (at least for the valid part) + int length = utf8::distance(line.begin(), end_it); + cout << "Length of line " << line_count << " is " << length << "\n"; + // Convert it to utf-16 vector<unsigned short> utf16line; utf8::utf8to16(line.begin(), end_it, back_inserter(utf16line)); - // And back to utf-8; + // And back to utf-8 string utf8line; utf8::utf16to8(utf16line.begin(), utf16line.end(), back_inserter(utf8line)); - // Confirm that the conversion went OK: - if (utf8line != string(line.begin(), end_it)) - cout << "Error in UTF-16 conversion at line: " << line_count << "\n"; - + // Confirm that the conversion went OK: + if (utf8line != string(line.begin(), end_it)) + cout << "Error in UTF-16 conversion at line: " << line_count << "\n"; getline(fs8, line); line_count++; - } while (!fs8.eof()); - - return 0; + } while (!fs8.eof()); + return 0; }-
In the previous code sample, we have seen the use of the following functions
-from utf8
namespace: first we used is_bom
-function to detect UTF-8 byte order mark at the beginning of the
-file; then for each line we performed a detection of invalid UTF-8 sequences with find_invalid
;
-the number of characters (more precisely - the number of Unicode code points) in each line was determined
-with a use of utf8::distance
; finally, we have converted each line to UTF-16 encoding with
-utf8to16
and back to UTF-8 with utf16to8
.
-
Encodes a 32 bit code point as a UTF-8 sequence of octets and -appends the sequence to a UTF-8 string.
-template <typename octet_iterator> octet_iterator
-append(uint32_t cp, octet_iterator result);
-cp
: A 32 bit integer representing a code point to
-append to the sequence.
-result
: An output iterator to the place in the
-sequence where to append the code point.
-Return value: An iterator pointing to the place after the
-newly appended sequence.
Example of use:
+
+ In the previous code sample, we have seen the use of the following functions from
+ utf8
namespace: first we used is_bom
function to detect
+ UTF-8 byte order mark at the beginning of the file; then for each line we performed
+ a detection of invalid UTF-8 sequences with find_invalid
; the number
+ of characters (more precisely - the number of Unicode code points) in each line was
+ determined with a use of utf8::distance
; finally, we have converted
+ each line to UTF-16 encoding with utf8to16
and back to UTF-8 with
+ utf16to8
.
+
+ Encodes a 32 bit code point as a UTF-8 sequence of octets and appends the sequence + to a UTF-8 string. +
-unsigned char u[5] = {0,0,0,0,0}; - -unsigned char* end = append(0x0448, u); - -assert (u[0] == 0xd1 && u[1] == 0x88 && u[2] == 0 && u[3] == 0 && u[4] == 0); +template <typename octet_iterator> +octet_iterator append(uint32_t cp, octet_iterator result); +-
Note that append
does not allocate any memory - it
-is the burden of the caller to make sure there is enough memory
-allocated for the operation. To make things more interesting,
-append
can add anywhere between 1 and 4 octets to the
-sequence. In practice, you would most often want to use
-std::back_inserter
to ensure that the necessary memory
-is allocated.
In case of an invalid code point, a
-utf8::invalid_code_point
exception is thrown.
Given the iterator to the beginning of the UTF-8 sequence, it -returns the code point and moves the iterator to the next -position.
-template <typename octet_iterator> uint32_t
-next(octet_iterator& it, octet_iterator end);
-it
: a reference to an iterator pointing to the
-beginning of an UTF-8 encoded code point. After the function
-returns, it is incremented to point to the beginning of the next
-code point.
-end
: end of the UTF-8 sequence to be processed. If
-it
gets equal to end
during the
-extraction of a code point, an utf8::not_enough_room
-exception is thrown.
-Return value: the 32 bit representation of the processed
-UTF-8 code point.
Example of use:
+
+ cp
: A 32 bit integer representing a code point to append to the
+ sequence.
+ result
: An output iterator to the place in the sequence where to
+ append the code point.
+ Return value: An iterator pointing to the place
+ after the newly appended sequence.
+
+ Example of use: +
-char* twochars = "\xe6\x97\xa5\xd1\x88"; -char* w = twochars; - -int cp = next(w, twochars + 6); - -assert (cp == 0x65e5); -assert (w == twochars + 3); +unsigned char u[5] = {0,0,0,0,0}; +unsigned char* end = append(0x0448, u); +assert (u[0] == 0xd1 && u[1] == 0x88 && u[2] == 0 && u[3] == 0 && u[4] == 0);-
This function is typically used to iterate through a UTF-8 -encoded string.
-In case of an invalid UTF-8 seqence, a
-utf8::invalid_utf8
exception is thrown.
Given a reference to an iterator pointing to an octet in a UTF-8 -seqence, it decreases the iterator until it hits the beginning of -the previous UTF-8 encoded code point and returns the 32 bits -representation of the code point.
-template <typename octet_iterator> uint32_t
-previous(octet_iterator& it, octet_iterator pass_start);
-it
: a reference pointing to an octet within a UTF-8
-encoded string. After the function returns, it is decremented to
-point to the beginning of the previous code point.
-pass_start
: an iterator to the point in the sequence
-where the search for the beginning of a code point is aborted if no
-result was reached. It is a safety measure to prevent passing the
-beginning of the string in the search for a UTF-8 lead octet.
-Return value: the 32 bit representation of the previous code
-point.
Example of use:
+
+ Note that append
does not allocate any memory - it is the burden of
+ the caller to make sure there is enough memory allocated for the operation. To make
+ things more interesting, append
can add anywhere between 1 and 4
+ octets to the sequence. In practice, you would most often want to use
+ std::back_inserter
to ensure that the necessary memory is allocated.
+
+ In case of an invalid code point, a utf8::invalid_code_point
exception
+ is thrown.
+
+ Given the iterator to the beginning of the UTF-8 sequence, it returns the code + point and moves the iterator to the next position. +
-char* twochars = "\xe6\x97\xa5\xd1\x88"; -unsigned char* w = twochars + 3; - -int cp = previous (w, twochars - 1); - -assert (cp == 0x65e5); +template <typename octet_iterator> +uint32_t next(octet_iterator& it, octet_iterator end); + ++
+ it
: a reference to an iterator pointing to the beginning of an UTF-8
+ encoded code point. After the function returns, it is incremented to point to the
+ beginning of the next code point.
+ end
: end of the UTF-8 sequence to be processed. If it
+ gets equal to end
during the extraction of a code point, an
+ utf8::not_enough_room
exception is thrown.
+ Return value: the 32 bit representation of the
+ processed UTF-8 code point.
+
+ Example of use: +
++char* twochars = "\xe6\x97\xa5\xd1\x88"; +char* w = twochars; +int cp = next(w, twochars + 6); +assert (cp == 0x65e5); +assert (w == twochars + 3); ++
+ This function is typically used to iterate through a UTF-8 encoded string. +
+
+ In case of an invalid UTF-8 sequence, a utf8::invalid_utf8
exception is
+ thrown.
+
+ Given a reference to an iterator pointing to an octet in a UTF-8 sequence, it + decreases the iterator until it hits the beginning of the previous UTF-8 encoded + code point and returns the 32 bit representation of the code point. +
++template <typename octet_iterator> +uint32_t previous(octet_iterator& it, octet_iterator pass_start); + ++
+ it
: a reference pointing to an octet within a UTF-8 encoded string.
+ After the function returns, it is decremented to point to the beginning of the
+ previous code point.
+ pass_start
: an iterator to the point in the sequence where the search
+ for the beginning of a code point is aborted if no result was reached. It is a
+ safety measure to prevent passing the beginning of the string in the search for a
+ UTF-8 lead octet.
+ Return value: the 32 bit representation of the
+ previous code point.
+
+ Example of use: +
++char* twochars = "\xe6\x97\xa5\xd1\x88"; +unsigned char* w = twochars + 3; +int cp = previous (w, twochars - 1); +assert (cp == 0x65e5); assert (w == twochars);-
The primary purpose of this function is to iterate backwards
-through a UTF-8 encoded string. Therefore, it
will
-typically point to the beginning of a code point, and
-pass_start
will point to the octet just before the
-beginning of the string to ensure we don't go backwards too far.
-it
is decreased until it points to a lead UTF-8 octet,
-and then the UTF-8 sequence beginning with that octet is decoded to
-a 32 bit representation and returned.
In case pass_end
is reached before a UTF-8 lead
-octet is hit, or if an invalid UTF-8 sequence is started by the
-lead octet, an invalid_utf8
exception is thrown
Advances an iterator by the specified number of code points -within an UTF-8 sequence.
-template <typename octet_iterator, typename
-distance_type> void advance (octet_iterator& it,
-distance_type n, octet_iterator end);
-it
: a reference to an iterator pointing to the
-beginning of an UTF-8 encoded code point. After the function
-returns, it is incremented to point to the nth following code
-point.
-n
: a positive integer that shows how many code points
-we want to advance.
-end
: end of the UTF-8 sequence to be processed. If
-it
gets equal to end
during the
-extraction of a code point, an utf8::not_enough_room
-exception is thrown.
Example of use:
+
+ The primary purpose of this function is to iterate backwards through a UTF-8
+ encoded string. Therefore, it
will typically point to the beginning of
+ a code point, and pass_start
will point to the octet just before the
+ beginning of the string to ensure we don't go backwards too far. it
is
+ decreased until it points to a lead UTF-8 octet, and then the UTF-8 sequence
+ beginning with that octet is decoded to a 32 bit representation and returned.
+
+ In case pass_start
is reached before a UTF-8 lead octet is hit, or if an
+ invalid UTF-8 sequence is started by the lead octet, an invalid_utf8
+ exception is thrown.
+
+ Advances an iterator by the specified number of code points within an UTF-8 + sequence. +
-char* twochars = "\xe6\x97\xa5\xd1\x88"; -unsigned char* w = twochars; - -advance (w, 2, twochars + 6); - -assert (w == twochars + 5); +template <typename octet_iterator, typename distance_type> +void advance (octet_iterator& it, distance_type n, octet_iterator end); +-
This function works only "forward". In case of a negative
-n
, there is no effect.
In case of an invalid code point, a
-utf8::invalid_code_point
exception is thrown.
Given the iterators to two UTF-8 encoded code points in a -seqence, returns the number of code points between them.
-template <typename octet_iterator> typename
-std::iterator_traits<octet_iterator>::difference_type
-distance (octet_iterator first, octet_iterator last);
-first
: an iterator to a beginning of a UTF-8
-encoded code point.
-last
: an iterator to a "post-end" of the last UTF-8
-encoded code point in the sequence we are trying to determine the
-length. It can be the beginning of a new code point, or not.
-Return value the distance between the iterators, in code
-points.
Example of use:
+
+ it
: a reference to an iterator pointing to the beginning of an UTF-8
+ encoded code point. After the function returns, it is incremented to point to the
+ nth following code point.
+ n
: a positive integer that shows how many code points we want to
+ advance.
+ end
: end of the UTF-8 sequence to be processed. If it
+ gets equal to end
during the extraction of a code point, an
+ utf8::not_enough_room
exception is thrown.
+
+ Example of use: +
-char* twochars = "\xe6\x97\xa5\xd1\x88"; - -size_t dist = utf8::distance(twochars, twochars + 5); - -assert (dist == 2); +char* twochars = "\xe6\x97\xa5\xd1\x88"; +unsigned char* w = twochars; +advance (w, 2, twochars + 6); +assert (w == twochars + 5);-
This function is used to find the length (in code points) of a
-UTF-8 encoded string. The reason it is called distance,
-rather than, say, length is mainly because developers are
-used that length is an O(1) function. Computing the length
-of an UTF-8 string is a linear operation, and it looked better to
-model it after std::distance
algorithm.
In case of an invalid UTF-8 seqence, a
-utf8::invalid_utf8
exception is thrown. If
-last
does not point to the past-of-end of a UTF-8
-seqence, a utf8::not_enough_room
exception is
-thrown.
Converts a UTF-16 encoded string to UTF-8.
-template <typename u16bit_iterator, typename
-octet_iterator> octet_iterator utf16to8 (u16bit_iterator start,
-u16bit_iterator end, octet_iterator result);
-start
: an iterator pointing to the beginning of the
-UTF-16 encoded string to convert.
-end
: an iterator pointing to pass-the-end of the
-UTF-16 encoded string to convert.
-result
: an output iterator to the place in the UTF-8
-string where to append the result of conversion.
-Return value: An iterator pointing to the place after the appended UTF-8 string.
Example of use:
+
+ This function works only "forward". In case of a negative n
, there is
+ no effect.
+
+ In case of an invalid code point, a utf8::invalid_code_point
exception
+ is thrown.
+
+ Given the iterators to two UTF-8 encoded code points in a sequence, returns the + number of code points between them. +
-unsigned short utf16string[] = {0x41, 0x0448, 0x65e5, 0xd834, 0xdd1e}; -vector<unsigned char> utf8result; - -utf16to8(utf16string, utf16string + 5, back_inserter(utf8result)); - -assert (utf8result.size() == 10); +template <typename octet_iterator> +typename std::iterator_traits<octet_iterator>::difference_type distance (octet_iterator first, octet_iterator last); +-
In case of invalid UTF-16 sequence, a
-utf8::invalid_utf16
exception is thrown.
Converts an UTF-8 encoded string to UTF-16
-template <typename u16bit_iterator, typename
-octet_iterator> u16bit_iterator utf8to16 (octet_iterator start,
-octet_iterator end, u16bit_iterator result);
-start
: an iterator pointing to the beginning of the
-UTF-8 encoded string to convert. < br /> end
: an
-iterator pointing to pass-the-end of the UTF-8 encoded string to
-convert.
-result
: an output iterator to the place in the UTF-16
-string where to append the result of conversion.
-Return value: An iterator pointing to the place after the appended UTF-16 string.
Example of use:
+
+ first
: an iterator to a beginning of a UTF-8 encoded code point.
+ last
: an iterator to a "post-end" of the last UTF-8 encoded code
+ point in the sequence whose length we are trying to determine. It can be the
+ beginning of a new code point, or not.
+ Return value: the distance between the iterators,
+ in code points.
+
+ Example of use: +
-char utf8_with_surrogates[] = "\xe6\x97\xa5\xd1\x88\xf0\x9d\x84\x9e"; -vector <unsigned short> utf16result; - -utf8to16(utf8_with_surrogates, utf8_with_surrogates + 9, back_inserter(utf16result)); - -assert (utf16result.size() == 4); -assert (utf16result[2] == 0xd834); -assert (utf16result[3] == 0xdd1e); +char* twochars = "\xe6\x97\xa5\xd1\x88"; +size_t dist = utf8::distance(twochars, twochars + 5); +assert (dist == 2);-
In case of an invalid UTF-8 seqence, a
-utf8::invalid_utf8
exception is thrown. If
-last
does not point to the past-of-end of a UTF-8
-seqence, a utf8::not_enough_room
exception is
-thrown.
Converts a UTF-32 encoded string to UTF-8.
-template <typename octet_iterator, typename
-u32bit_iterator> octet_iterator utf32to8 (u32bit_iterator start,
-u32bit_iterator end, octet_iterator result);
-start
: an iterator pointing to the beginning of the
-UTF-32 encoded string to convert.
-end
: an iterator pointing to pass-the-end of the
-UTF-32 encoded string to convert.
-result
: an output iterator to the place in the UTF-8
-string where to append the result of conversion.
-Return value: An iterator pointing to the place after the appended UTF-8 string.
Example of use:
+
+ This function is used to find the length (in code points) of a UTF-8 encoded
+ string. The reason it is called distance, rather than, say,
+ length is mainly because developers are used to length being an
+ O(1) function. Computing the length of an UTF-8 string is a linear operation, and
+ it looked better to model it after std::distance
algorithm.
+
+ In case of an invalid UTF-8 sequence, a utf8::invalid_utf8
exception is
+ thrown. If last
does not point to the past-the-end of a UTF-8 sequence,
+ a utf8::not_enough_room
exception is thrown.
+
+ Converts a UTF-16 encoded string to UTF-8. +
-int utf32string[] = {0x448, 0x65E5, 0x10346, 0}; -vector<unsigned char> utf8result; - -utf32to8(utf32string, utf32string + 3, back_inserter(utf8result)); - -assert (utf8result.size() == 9); +template <typename u16bit_iterator, typename octet_iterator> +octet_iterator utf16to8 (u16bit_iterator start, u16bit_iterator end, octet_iterator result); +-
In case of invalid UTF-32 string, a
-utf8::invalid_code_point
exception is thrown.
Converts a UTF-8 encoded string to UTF-32.
-template <typename octet_iterator, typename
-u32bit_iterator> u32bit_iterator utf8to32 (octet_iterator start,
-octet_iterator end, u32bit_iterator result);
-start
: an iterator pointing to the beginning of the
-UTF-8 encoded string to convert.
-end
: an iterator pointing to pass-the-end of the UTF-8
-encoded string to convert.
-result
: an output iterator to the place in the UTF-32
-string where to append the result of conversion.
-Return value: An iterator pointing to the place after the appended UTF-32 string.
Example of use:
+
+ start
: an iterator pointing to the beginning of the UTF-16 encoded
+ string to convert.
+ end
: an iterator pointing to past-the-end of the UTF-16 encoded
+ string to convert.
+ result
: an output iterator to the place in the UTF-8 string where to
+ append the result of conversion.
+ Return value: An iterator pointing to the place
+ after the appended UTF-8 string.
+
+ Example of use: +
-char* twochars = "\xe6\x97\xa5\xd1\x88"; -vector<int> utf32result; - -utf8to32(twochars, twochars + 5, back_inserter(utf32result)); - -assert (utf32result.size() == 2); +unsigned short utf16string[] = {0x41, 0x0448, 0x65e5, 0xd834, 0xdd1e}; +vector<unsigned char> utf8result; +utf16to8(utf16string, utf16string + 5, back_inserter(utf8result)); +assert (utf8result.size() == 10);-
In case of an invalid UTF-8 seqence, a
-utf8::invalid_utf8
exception is thrown. If
-last
does not point to the past-of-end of a UTF-8
-seqence, a utf8::not_enough_room
exception is
-thrown.
Detects an invalid sequence within a UTF-8 string.
-template <typename octet_iterator> octet_iterator
-find_invalid(octet_iterator start, octet_iterator end);
-start
: an iterator pointing to the beginning of the
-UTF-8 string to test for validity.
-end
: an iterator pointing to pass-the-end of the UTF-8
-string to test for validity.
-Return value: an iterator pointing to the first invalid
-octet in the UTF-8 string. In case none were found, equals
-end
.
Example of use:
+
+ In case of invalid UTF-16 sequence, a utf8::invalid_utf16
exception is
+ thrown.
+
+ Converts a UTF-8 encoded string to UTF-16. +
-char utf_invalid[] = "\xe6\x97\xa5\xd1\x88\xfa"; - -char* invalid = find_invalid(utf_invalid, utf_invalid + 6); - -assert (invalid == utf_invalid + 5); +template <typename u16bit_iterator, typename octet_iterator> +u16bit_iterator utf8to16 (octet_iterator start, octet_iterator end, u16bit_iterator result); +-
This function is typically used to make sure a UTF-8 string is -valid before processing it with other functions. It is especially -important to call it if before doing any of the unchecked -operations on it.
-Checks whether a sequence of octets is a valid UTF-8 string.
-template <typename octet_iterator> bool
-is_valid(octet_iterator start, octet_iterator end);
-start
: an iterator pointing to the beginning of the
-UTF-8 string to test for validity.
-end
: an iterator pointing to pass-the-end of the UTF-8
-string to test for validity.
-Return value: true
if the sequence is a valid
-UTF-8 string; false
if not.
+ start
: an iterator pointing to the beginning of the UTF-8 encoded
+ string to convert. <br /> end
: an iterator pointing to
+ past-the-end of the UTF-8 encoded string to convert.
+ result
: an output iterator to the place in the UTF-16 string where to
+ append the result of conversion.
+ Return value: An iterator pointing to the place
+ after the appended UTF-16 string.
+
+ Example of use: +
-char utf_invalid[] = "\xe6\x97\xa5\xd1\x88\xfa"; - -bool bvalid = is_valid(utf_invalid, utf_invalid + 6); - +char utf8_with_surrogates[] = "\xe6\x97\xa5\xd1\x88\xf0\x9d\x84\x9e"; +vector <unsigned short> utf16result; +utf8to16(utf8_with_surrogates, utf8_with_surrogates + 9, back_inserter(utf16result)); +assert (utf16result.size() == 4); +assert (utf16result[2] == 0xd834); +assert (utf16result[3] == 0xdd1e); ++
+ In case of an invalid UTF-8 sequence, a utf8::invalid_utf8
exception is
+ thrown. If end
does not point to the past-the-end of a UTF-8 sequence, a
+ utf8::not_enough_room
exception is thrown.
+
+ Converts a UTF-32 encoded string to UTF-8. +
++template <typename octet_iterator, typename u32bit_iterator> +octet_iterator utf32to8 (u32bit_iterator start, u32bit_iterator end, octet_iterator result); + ++
+ start
: an iterator pointing to the beginning of the UTF-32 encoded
+ string to convert.
+ end
: an iterator pointing to past-the-end of the UTF-32 encoded
+ string to convert.
+ result
: an output iterator to the place in the UTF-8 string where to
+ append the result of conversion.
+ Return value: An iterator pointing to the place
+ after the appended UTF-8 string.
+
+ Example of use: +
++int utf32string[] = {0x448, 0x65E5, 0x10346, 0}; +vector<unsigned char> utf8result; +utf32to8(utf32string, utf32string + 3, back_inserter(utf8result)); +assert (utf8result.size() == 9); ++
+ In case of invalid UTF-32 string, a utf8::invalid_code_point
exception
+ is thrown.
+
+ Converts a UTF-8 encoded string to UTF-32. +
++template <typename octet_iterator, typename u32bit_iterator> +u32bit_iterator utf8to32 (octet_iterator start, octet_iterator end, u32bit_iterator result); + ++
+ start
: an iterator pointing to the beginning of the UTF-8 encoded
+ string to convert.
+ end
: an iterator pointing to past-the-end of the UTF-8 encoded string
+ to convert.
+ result
: an output iterator to the place in the UTF-32 string where to
+ append the result of conversion.
+ Return value: An iterator pointing to the place
+ after the appended UTF-32 string.
+
+ Example of use: +
++char* twochars = "\xe6\x97\xa5\xd1\x88"; +vector<int> utf32result; +utf8to32(twochars, twochars + 5, back_inserter(utf32result)); +assert (utf32result.size() == 2); ++
+ In case of an invalid UTF-8 sequence, a utf8::invalid_utf8
exception is
+ thrown. If end
does not point to the past-the-end of a UTF-8 sequence, a
+ utf8::not_enough_room
exception is thrown.
+
+ Detects an invalid sequence within a UTF-8 string. +
++template <typename octet_iterator> +octet_iterator find_invalid(octet_iterator start, octet_iterator end); ++
+ start
: an iterator pointing to the beginning of the UTF-8 string to
+ test for validity.
+ end
: an iterator pointing to past-the-end of the UTF-8 string to test
+ for validity.
+ Return value: an iterator pointing to the first
+ invalid octet in the UTF-8 string. In case none were found, equals
+ end
.
+
+ Example of use: +
++char utf_invalid[] = "\xe6\x97\xa5\xd1\x88\xfa"; +char* invalid = find_invalid(utf_invalid, utf_invalid + 6); +assert (invalid == utf_invalid + 5); ++
+ This function is typically used to make sure a UTF-8 string is valid before + processing it with other functions. It is especially important to call it before + doing any of the unchecked operations on it. +
++ Checks whether a sequence of octets is a valid UTF-8 string. +
++template <typename octet_iterator> +bool is_valid(octet_iterator start, octet_iterator end); + ++
+ start
: an iterator pointing to the beginning of the UTF-8 string to
+ test for validity.
+ end
: an iterator pointing to past-the-end of the UTF-8 string to test
+ for validity.
+ Return value: true
if the sequence
+ is a valid UTF-8 string; false
if not.
+
+char utf_invalid[] = "\xe6\x97\xa5\xd1\x88\xfa"; +bool bvalid = is_valid(utf_invalid, utf_invalid + 6); assert (bvalid == false);-
is_valid
is a shorthand for
-find_invalid(start, end) == end;
. You may want to use
-it to make sure that a byte seqence is a valid UTF-8 string without
-the need to know where it fails if it is not valid.
Checks whether a sequence of three octets is a UTF-8 byte order -mark (BOM)
-template <typename octet_iterator> bool is_bom
-(octet_iterator it);
-it
Beginning of the 3-octet sequence to check
-Return value: true
if the sequence is UTF-8
-byte order mark; false
if not.
Example of use:
+
+ is_valid
is a shorthand for find_invalid(start, end) ==
+ end;
. You may want to use it to make sure that a byte sequence is a valid
+ UTF-8 string without the need to know where it fails if it is not valid.
+
+ Checks whether a sequence of three octets is a UTF-8 byte order mark (BOM). +
-unsigned char byte_order_mark[] = {0xef, 0xbb, 0xbf}; - -bool bbom = is_bom(byte_order_mark); - -assert (bbom == true); +template <typename octet_iterator> +bool is_bom (octet_iterator it);-
The typical use of this function is to check the first three -bytes of a file. If they form the UTF-8 BOM, we want to skip them -before processing the actual UTF-8 encoded text.
-Encodes a 32 bit code point as a UTF-8 sequence of octets and -appends the sequence to a UTF-8 string.
-template <typename octet_iterator> octet_iterator
-append(uint32_t cp, octet_iterator result);
-cp
: A 32 bit integer representing a code point to
-append to the sequence.
-result
: An output iterator to the place in the
-sequence where to append the code point.
-Return value: An iterator pointing to the place after the
-newly appended sequence.
Example of use:
+
+ it
: beginning of the 3-octet sequence to check
+ Return value: true
if the sequence
+ is UTF-8 byte order mark; false
if not.
+
+ Example of use: +
-unsigned char u[5] = {0,0,0,0,0}; - -unsigned char* end = unchecked::append(0x0448, u); - -assert (u[0] == 0xd1 && u[1] == 0x88 && u[2] == 0 && u[3] == 0 && u[4] == 0); +unsigned char byte_order_mark[] = {0xef, 0xbb, 0xbf}; +bool bbom = is_bom(byte_order_mark); +assert (bbom == true);-
This is a quicker but less safe version of
-utf8::append
. It does not check for validity of the
-supplied code point, and may produce an invalid UTF-8 sequence.
Given the iterator to the beginning of a UTF-8 sequence, it -returns the code point and moves the iterator to the next -position.
-template <typename octet_iterator> uint32_t
-next(octet_iterator& it);
-it
: a reference to an iterator pointing to the
-beginning of an UTF-8 encoded code point. After the function
-returns, it is incremented to point to the beginning of the next
-code point.
-Return value: the 32 bit representation of the processed
-UTF-8 code point.
Example of use:
++ The typical use of this function is to check the first three bytes of a file. If + they form the UTF-8 BOM, we want to skip them before processing the actual UTF-8 + encoded text. +
++ Encodes a 32 bit code point as a UTF-8 sequence of octets and appends the sequence + to a UTF-8 string. +
-char* twochars = "\xe6\x97\xa5\xd1\x88"; -char* w = twochars; - -int cp = unchecked::next(w); - -assert (cp == 0x65e5); -assert (w == twochars + 3); +template <typename octet_iterator> +octet_iterator append(uint32_t cp, octet_iterator result); +-
This is a quicker but less safe version of
-utf8::next
. It does not check for validity of the
-supplied UTF-8 sequence.
Given a reference to an iterator pointing to an octet in a UTF-8 -seqence, it decreases the iterator until it hits the beginning of -the previous UTF-8 encoded code point and returns the 32 bits -representation of the code point.
-template <typename octet_iterator> uint32_t
-previous(octet_iterator& it);
-it
: a reference pointing to an octet within a UTF-8
-encoded string. After the function returns, it is decremented to
-point to the beginning of the previous code point.
-Return value: the 32 bit representation of the previous code
-point.
Example of use:
+
+ cp
: A 32 bit integer representing a code point to append to the
+ sequence.
+ result
: An output iterator to the place in the sequence where to
+ append the code point.
+ Return value: An iterator pointing to the place
+ after the newly appended sequence.
+
+ Example of use: +
-char* twochars = "\xe6\x97\xa5\xd1\x88"; -char* w = twochars + 3; - -int cp = unchecked::previous (w); - -assert (cp == 0x65e5); +unsigned char u[5] = {0,0,0,0,0}; +unsigned char* end = unchecked::append(0x0448, u); +assert (u[0] == 0xd1 && u[1] == 0x88 && u[2] == 0 && u[3] == 0 && u[4] == 0); ++
+ This is a quicker but less safe version of utf8::append
. It does not
+ check for validity of the supplied code point, and may produce an invalid UTF-8
+ sequence.
+
+ Given the iterator to the beginning of a UTF-8 sequence, it returns the code point + and moves the iterator to the next position. +
++template <typename octet_iterator> +uint32_t next(octet_iterator& it); + ++
+ it
: a reference to an iterator pointing to the beginning of an UTF-8
+ encoded code point. After the function returns, it is incremented to point to the
+ beginning of the next code point.
+ Return value: the 32 bit representation of the
+ processed UTF-8 code point.
+
+ Example of use: +
++char* twochars = "\xe6\x97\xa5\xd1\x88"; +char* w = twochars; +int cp = unchecked::next(w); +assert (cp == 0x65e5); +assert (w == twochars + 3); ++
+ This is a quicker but less safe version of utf8::next
. It does not
+ check for validity of the supplied UTF-8 sequence.
+
+ Given a reference to an iterator pointing to an octet in a UTF-8 seqence, it + decreases the iterator until it hits the beginning of the previous UTF-8 encoded + code point and returns the 32 bits representation of the code point. +
++template <typename octet_iterator> +uint32_t previous(octet_iterator& it); + ++
+ it
: a reference pointing to an octet within a UTF-8 encoded string.
+ After the function returns, it is decremented to point to the beginning of the
+ previous code point.
+ Return value: the 32 bit representation of the
+ previous code point.
+
+ Example of use: +
++char* twochars = "\xe6\x97\xa5\xd1\x88"; +char* w = twochars + 3; +int cp = unchecked::previous (w); +assert (cp == 0x65e5); assert (w == twochars);-
This is a quicker but less safe version of
-utf8::previous
. It does not check for validity of the
-supplied UTF-8 sequence and offers no boundary checking.
Advances an iterator by the specified number of code points -within an UTF-8 sequence.
-template <typename octet_iterator, typename
-distance_type> void advance (octet_iterator& it,
-distance_type n);
-it
: a reference to an iterator pointing to the
-beginning of an UTF-8 encoded code point. After the function
-returns, it is incremented to point to the nth following code
-point.
-n
: a positive integer that shows how many code points
-we want to advance.
Example of use:
+
+ This is a quicker but less safe version of utf8::previous
. It does not
+ check for validity of the supplied UTF-8 sequence and offers no boundary checking.
+
+ Advances an iterator by the specified number of code points within an UTF-8 + sequence. +
-char* twochars = "\xe6\x97\xa5\xd1\x88"; -char* w = twochars; - -unchecked::advance (w, 2); - -assert (w == twochars + 5); +template <typename octet_iterator, typename distance_type> +void advance (octet_iterator& it, distance_type n); +-
This function works only "forward". In case of a negative
-n
, there is no effect.
This is a quicker but less safe version of
-utf8::advance
. It does not check for validity of the
-supplied UTF-8 sequence and offers no boundary checking.
Given the iterators to two UTF-8 encoded code points in a -seqence, returns the number of code points between them.
-template <typename octet_iterator> typename
-std::iterator_traits<octet_iterator>::difference_type
-distance (octet_iterator first, octet_iterator last);
-first
: an iterator to a beginning of a UTF-8
-encoded code point.
-last
: an iterator to a "post-end" of the last UTF-8
-encoded code point in the sequence we are trying to determine the
-length. It can be the beginning of a new code point, or not.
-Return value the distance between the iterators, in code
-points.
Example of use:
+
+ it
: a reference to an iterator pointing to the beginning of an UTF-8
+ encoded code point. After the function returns, it is incremented to point to the
+ nth following code point.
+ n
: a positive integer that shows how many code points we want to
+ advance.
+
+ Example of use: +
-char* twochars = "\xe6\x97\xa5\xd1\x88"; - -size_t dist = utf8::unchecked::distance(twochars, twochars + 5); - -assert (dist == 2); +char* twochars = "\xe6\x97\xa5\xd1\x88"; +char* w = twochars; +unchecked::advance (w, 2); +assert (w == twochars + 5);-
This is a quicker but less safe version of
-utf8::distance
. It does not check for validity of the
-supplied UTF-8 sequence.
Converts a UTF-16 encoded string to UTF-8.
-template <typename u16bit_iterator, typename
-octet_iterator> octet_iterator utf16to8 (u16bit_iterator start,
-u16bit_iterator end, octet_iterator result);
-start
: an iterator pointing to the beginning of the
-UTF-16 encoded string to convert.
-end
: an iterator pointing to pass-the-end of the
-UTF-16 encoded string to convert.
-result
: an output iterator to the place in the UTF-8
-string where to append the result of conversion.
-Return value: An iterator pointing to the place after the appended UTF-8 string.
Example of use:
+
+ This function works only "forward". In case of a negative n
, there is
+ no effect.
+
+ This is a quicker but less safe version of utf8::advance
. It does not
+ check for validity of the supplied UTF-8 sequence and offers no boundary checking.
+
+ Given the iterators to two UTF-8 encoded code points in a seqence, returns the + number of code points between them. +
-unsigned short utf16string[] = {0x41, 0x0448, 0x65e5, 0xd834, 0xdd1e}; -vector<unsigned char> utf8result; - -unchecked::utf16to8(utf16string, utf16string + 5, back_inserter(utf8result)); - -assert (utf8result.size() == 10); +template <typename octet_iterator> +typename std::iterator_traits<octet_iterator>::difference_type distance (octet_iterator first, octet_iterator last);-
This is a quicker but less safe version of
-utf8::utf16to8
. It does not check for validity of the
-supplied UTF-16 sequence.
Converts an UTF-8 encoded string to UTF-16
-template <typename u16bit_iterator, typename
-octet_iterator> u16bit_iterator utf8to16 (octet_iterator start,
-octet_iterator end, u16bit_iterator result);
-start
: an iterator pointing to the beginning of the
-UTF-8 encoded string to convert. < br /> end
: an
-iterator pointing to pass-the-end of the UTF-8 encoded string to
-convert.
-result
: an output iterator to the place in the UTF-16
-string where to append the result of conversion.
-Return value: An iterator pointing to the place after the appended UTF-16 string.
-
Example of use:
+
+ first
: an iterator to a beginning of a UTF-8 encoded code point.
+ last
: an iterator to a "post-end" of the last UTF-8 encoded code
+ point in the sequence whose length we are trying to determine. It can be the
+ beginning of a new code point, or not.
+ Return value the distance between the iterators,
+ in code points.
+
+ Example of use: +
-char utf8_with_surrogates[] = "\xe6\x97\xa5\xd1\x88\xf0\x9d\x84\x9e"; -vector <unsigned short> utf16result; - -unchecked::utf8to16(utf8_with_surrogates, utf8_with_surrogates + 9, back_inserter(utf16result)); - -assert (utf16result.size() == 4); -assert (utf16result[2] == 0xd834); -assert (utf16result[3] == 0xdd1e); +char* twochars = "\xe6\x97\xa5\xd1\x88"; +size_t dist = utf8::unchecked::distance(twochars, twochars + 5); +assert (dist == 2);-
This is a quicker but less safe version of
-utf8::utf8to16
. It does not check for validity of the
-supplied UTF-8 sequence.
Converts a UTF-32 encoded string to UTF-8.
-template <typename octet_iterator, typename
-u32bit_iterator> octet_iterator utf32to8 (u32bit_iterator start,
-u32bit_iterator end, octet_iterator result);
-start
: an iterator pointing to the beginning of the
-UTF-32 encoded string to convert.
-end
: an iterator pointing to pass-the-end of the
-UTF-32 encoded string to convert.
-result
: an output iterator to the place in the UTF-8
-string where to append the result of conversion.
-Return value: An iterator pointing to the place after the appended UTF-8 string.
-
Example of use:
+
+ This is a quicker but less safe version of utf8::distance
. It does not
+ check for validity of the supplied UTF-8 sequence.
+
+ Converts a UTF-16 encoded string to UTF-8. +
-int utf32string[] = {0x448, 0x65E5, 0x10346, 0}; -vector<unsigned char> utf8result; - -utf32to8(utf32string, utf32string + 3, back_inserter(utf8result)); - -assert (utf8result.size() == 9); +template <typename u16bit_iterator, typename octet_iterator> +octet_iterator utf16to8 (u16bit_iterator start, u16bit_iterator end, octet_iterator result); +-
This is a quicker but less safe version of
-utf8::utf32to8
. It does not check for validity of the
-supplied UTF-32 sequence.
Converts a UTF-8 encoded string to UTF-32.
-template <typename octet_iterator, typename
-u32bit_iterator> u32bit_iterator utf8to32 (octet_iterator start,
-octet_iterator end, u32bit_iterator result);
-start
: an iterator pointing to the beginning of the
-UTF-8 encoded string to convert.
-end
: an iterator pointing to pass-the-end of the UTF-8
-encoded string to convert.
-result
: an output iterator to the place in the UTF-32
-string where to append the result of conversion.
-Return value: An iterator pointing to the place after the appended UTF-32 string.
-
Example of use:
+
+ start
: an iterator pointing to the beginning of the UTF-16 encoded
+ string to convert.
+ end
: an iterator pointing to pass-the-end of the UTF-16 encoded
+ string to convert.
+ result
: an output iterator to the place in the UTF-8 string where to
+ append the result of conversion.
+ Return value: An iterator pointing to the place
+ after the appended UTF-8 string.
+
+ Example of use: +
-char* twochars = "\xe6\x97\xa5\xd1\x88"; -vector<int> utf32result; - -unchecked::utf8to32(twochars, twochars + 5, back_inserter(utf32result)); - -assert (utf32result.size() == 2); +unsigned short utf16string[] = {0x41, 0x0448, 0x65e5, 0xd834, 0xdd1e}; +vector<unsigned char> utf8result; +unchecked::utf16to8(utf16string, utf16string + 5, back_inserter(utf8result)); +assert (utf8result.size() == 10);-
This is a quicker but less safe version of
-utf8::utf8to32
. It does not check for validity of the
-supplied UTF-8 sequence.
The library was designed to be:
-In case you want to look into other means of working with UTF-8 -strings from C++, here is the list of solutions I am aware of:
-std::string
. If you prefer to have yet another
-string class in your code, it may be worth a look. Be aware of the
-licensing issues, though.Until Unicode becomes officially recognized by the C++ Standard -Library, we need to use other means to work with UTF-8 strings. -Template functions I describe in this article may be a good step in -this direction.
-
+ This is a quicker but less safe version of utf8::utf16to8
. It does not
+ check for validity of the supplied UTF-16 sequence.
+
+ Converts an UTF-8 encoded string to UTF-16 +
++template <typename u16bit_iterator, typename octet_iterator> +u16bit_iterator utf8to16 (octet_iterator start, octet_iterator end, u16bit_iterator result); + ++
+ start
: an iterator pointing to the beginning of the UTF-8 encoded
+ string to convert. <br /> <code>end
: an iterator pointing to
+ pass-the-end of the UTF-8 encoded string to convert.
+ result
: an output iterator to the place in the UTF-16 string where to
+ append the result of conversion.
+ Return value: An iterator pointing to the place
+ after the appended UTF-16 string.
+
+ Example of use: +
++char utf8_with_surrogates[] = "\xe6\x97\xa5\xd1\x88\xf0\x9d\x84\x9e"; +vector <unsigned short> utf16result; +unchecked::utf8to16(utf8_with_surrogates, utf8_with_surrogates + 9, back_inserter(utf16result)); +assert (utf16result.size() == 4); +assert (utf16result[2] == 0xd834); +assert (utf16result[3] == 0xdd1e); ++
+ This is a quicker but less safe version of utf8::utf8to16
. It does not
+ check for validity of the supplied UTF-8 sequence.
+
+ Converts a UTF-32 encoded string to UTF-8. +
++template <typename octet_iterator, typename u32bit_iterator> +octet_iterator utf32to8 (u32bit_iterator start, u32bit_iterator end, octet_iterator result); + ++
+ start
: an iterator pointing to the beginning of the UTF-32 encoded
+ string to convert.
+ end
: an iterator pointing to pass-the-end of the UTF-32 encoded
+ string to convert.
+ result
: an output iterator to the place in the UTF-8 string where to
+ append the result of conversion.
+ Return value: An iterator pointing to the place
+ after the appended UTF-8 string.
+
+ Example of use: +
++int utf32string[] = {0x448, 0x65e5, 0x10346, 0}; +vector<unsigned char> utf8result; +utf32to8(utf32string, utf32string + 3, back_inserter(utf8result)); +assert (utf8result.size() == 9); ++
+ This is a quicker but less safe version of utf8::utf32to8
. It does not
+ check for validity of the supplied UTF-32 sequence.
+
+ Converts a UTF-8 encoded string to UTF-32. +
++template <typename octet_iterator, typename u32bit_iterator> +u32bit_iterator utf8to32 (octet_iterator start, octet_iterator end, u32bit_iterator result); + ++
+ start
: an iterator pointing to the beginning of the UTF-8 encoded
+ string to convert.
+ end
: an iterator pointing to pass-the-end of the UTF-8 encoded string
+ to convert.
+ result
: an output iterator to the place in the UTF-32 string where to
+ append the result of conversion.
+ Return value: An iterator pointing to the place
+ after the appended UTF-32 string.
+
+ Example of use: +
++char* twochars = "\xe6\x97\xa5\xd1\x88"; +vector<int> utf32result; +unchecked::utf8to32(twochars, twochars + 5, back_inserter(utf32result)); +assert (utf32result.size() == 2); ++
+ This is a quicker but less safe version of utf8::utf8to32
. It does not
+ check for validity of the supplied UTF-8 sequence.
+
+ The library was designed to be: +
++ In case you want to look into other means of working with UTF-8 strings from C++, + here is the list of solutions I am aware of: +
+std::string
. If you prefer to have yet another string class in your
+ code, it may be worth a look. Be aware of the licensing issues, though.
+ + Until Unicode becomes officially recognized by the C++ Standard Library, we need to + use other means to work with UTF-8 strings. Template functions I describe in this + article may be a good step in this direction. +
+@@ -164,25 +189,35 @@ Encodes a 32 bit code point as a UTF-8 sequence of octets and appends the sequence to a UTF-8 string.
-
-template <typename octet_iterator>
+
+template <typename octet_iterator>
octet_iterator append(uint32_t cp, octet_iterator result);
-
+
+
cp
: A 32 bit integer representing a code point to append to the
sequence.
- result
: An output iterator to the place in the sequence where to
+ result
: An output iterator to the place in the sequence where to
append the code point.
- Return value: An iterator pointing to the place after the newly appended
- sequence.
+ Return value: An iterator pointing to the place
+ after the newly appended sequence.
Example of use:
-unsigned char u[5] = {0,0,0,0,0}; -unsigned char* end = append(0x0448, u); -assert (u[0] == 0xd1 && u[1] == 0x88 && u[2] == 0 && u[3] == 0 && u[4] == 0); +unsigned char u[5] = {0,0,0,0,0}; +unsigned char* end = append(0x0448, u); +assert (u[0] == 0xd1 && u[1] == 0x88 && u[2] == 0 && u[3] == 0 && u[4] == 0);
Note that append
does not allocate any memory - it is the burden of
@@ -202,26 +237,32 @@ assert (u[0] == 0xd1 && u[1] == 0x88 && u[2] == 0 && u[3
Given the iterator to the beginning of the UTF-8 sequence, it returns the code
point and moves the iterator to the next position.
template <typename octet_iterator> uint32_t next(octet_iterator& it,
- octet_iterator end);
++template <typename octet_iterator> +uint32_t next(octet_iterator& it, octet_iterator end); + +
it
: a reference to an iterator pointing to the beginning of an UTF-8
encoded code point. After the function returns, it is incremented to point to the
beginning of the next code point.
- end
: end of the UTF-8 sequence to be processed. If it
+ end
: end of the UTF-8 sequence to be processed. If it
gets equal to end
during the extraction of a code point, an
utf8::not_enough_room
exception is thrown.
- Return value: the 32 bit representation of the processed UTF-8 code point.
+ Return value: the 32 bit representation of the
+ processed UTF-8 code point.
Example of use:
-char* twochars = "\xe6\x97\xa5\xd1\x88"; -char* w = twochars; -int cp = next(w, twochars + 6); -assert (cp == 0x65e5); -assert (w == twochars + 3); +char* twochars = "\xe6\x97\xa5\xd1\x88"; +char* w = twochars; +int cp = next(w, twochars + 6); +assert (cp == 0x65e5); +assert (w == twochars + 3);
This function is typically used to iterate through a UTF-8 encoded string. @@ -238,26 +279,34 @@ assert (w == twochars + 3); decreases the iterator until it hits the beginning of the previous UTF-8 encoded code point and returns the 32 bits representation of the code point.
-template <typename octet_iterator> uint32_t previous(octet_iterator&
- it, octet_iterator pass_start);
++template <typename octet_iterator> +uint32_t previous(octet_iterator& it, octet_iterator pass_start); + +
it
: a reference pointing to an octet within a UTF-8 encoded string.
After the function returns, it is decremented to point to the beginning of the
previous code point.
- pass_start
: an iterator to the point in the sequence where the search
+ pass_start
: an iterator to the point in the sequence where the search
for the beginning of a code point is aborted if no result was reached. It is a
safety measure to prevent passing the beginning of the string in the search for a
UTF-8 lead octet.
- Return value: the 32 bit representation of the previous code point.
+ Return value: the 32 bit representation of the
+ previous code point.
Example of use:
-char* twochars = "\xe6\x97\xa5\xd1\x88"; -unsigned char* w = twochars + 3; -int cp = previous (w, twochars - 1); -assert (cp == 0x65e5); +char* twochars = "\xe6\x97\xa5\xd1\x88"; +unsigned char* w = twochars + 3; +int cp = previous (w, twochars - 1); +assert (cp == 0x65e5); assert (w == twochars);
@@ -280,15 +329,20 @@ assert (w == twochars); Advances an iterator by the specified number of code points within an UTF-8 sequence.
-template <typename octet_iterator, typename distance_type> void advance
- (octet_iterator& it, distance_type n, octet_iterator end);
++template <typename octet_iterator, typename distance_type> +void advance (octet_iterator& it, distance_type n, octet_iterator end); + +
it
: a reference to an iterator pointing to the beginning of an UTF-8
encoded code point. After the function returns, it is incremented to point to the
nth following code point.
- n
: a positive integer that shows how many code points we want to
+ n
: a positive integer that shows how many code points we want to
advance.
- end
: end of the UTF-8 sequence to be processed. If it
+ end
: end of the UTF-8 sequence to be processed. If it
gets equal to end
during the extraction of a code point, an
utf8::not_enough_room
exception is thrown.
-char* twochars = "\xe6\x97\xa5\xd1\x88"; -unsigned char* w = twochars; -advance (w, 2, twochars + 6); -assert (w == twochars + 5); +char* twochars = "\xe6\x97\xa5\xd1\x88"; +unsigned char* w = twochars; +advance (w, 2, twochars + 6); +assert (w == twochars + 5);
This function works only "forward". In case of a negative n
, there is
@@ -316,23 +371,29 @@ assert (w == twochars + 5);
Given the iterators to two UTF-8 encoded code points in a seqence, returns the
number of code points between them.
template <typename octet_iterator> typename
- std::iterator_traits<octet_iterator>::difference_type distance (octet_iterator
- first, octet_iterator last);
++template <typename octet_iterator> +typename std::iterator_traits<octet_iterator>::difference_type distance (octet_iterator first, octet_iterator last); + +
first
: an iterator to a beginning of a UTF-8 encoded code point.
- last
: an iterator to a "post-end" of the last UTF-8 encoded code point
- in the sequence we are trying to determine the length. It can be the beginning of a
- new code point, or not.
- Return value the distance between the iterators, in code points.
+ last
: an iterator to a "post-end" of the last UTF-8 encoded code
+ point in the sequence whose length we are trying to determine. It can be the
+ beginning of a new code point, or not.
+ Return value the distance between the iterators,
+ in code points.
Example of use:
-char* twochars = "\xe6\x97\xa5\xd1\x88"; -size_t dist = utf8::distance(twochars, twochars + 5); -assert (dist == 2); +char* twochars = "\xe6\x97\xa5\xd1\x88"; +size_t dist = utf8::distance(twochars, twochars + 5); +assert (dist == 2);
This function is used to find the length (in code points) of a UTF-8 encoded @@ -352,27 +413,35 @@ assert (dist == 2);
Converts a UTF-16 encoded string to UTF-8.
-template <typename u16bit_iterator, typename octet_iterator>
- octet_iterator utf16to8 (u16bit_iterator start, u16bit_iterator end, octet_iterator
- result);
++template <typename u16bit_iterator, typename octet_iterator> +octet_iterator utf16to8 (u16bit_iterator start, u16bit_iterator end, octet_iterator result); + +
start
: an iterator pointing to the beginning of the UTF-16 encoded
string to convert.
- end
: an iterator pointing to pass-the-end of the UTF-16 encoded string
- to convert.
- result
: an output iterator to the place in the UTF-8 string where to
+ end
: an iterator pointing to pass-the-end of the UTF-16 encoded
+ string to convert.
+ result
: an output iterator to the place in the UTF-8 string where to
append the result of conversion.
- Return value: An iterator pointing to the place after the appended UTF-8
- string.
+ Return value: An iterator pointing to the place
+ after the appended UTF-8 string.
Example of use:
-unsigned short utf16string[] = {0x41, 0x0448, 0x65e5, 0xd834, 0xdd1e}; -vector<unsigned char> utf8result; -utf16to8(utf16string, utf16string + 5, back_inserter(utf8result)); -assert (utf8result.size() == 10); +unsigned short utf16string[] = {0x41, 0x0448, 0x65e5, 0xd834, 0xdd1e}; +vector<unsigned char> utf8result; +utf16to8(utf16string, utf16string + 5, back_inserter(utf8result)); +assert (utf8result.size() == 10);
In case of invalid UTF-16 sequence, a utf8::invalid_utf16
exception is
@@ -384,28 +453,35 @@ assert (utf8result.size() == 10);
Converts an UTF-8 encoded string to UTF-16
-template <typename u16bit_iterator, typename octet_iterator>
- u16bit_iterator utf8to16 (octet_iterator start, octet_iterator end, u16bit_iterator
- result);
++template <typename u16bit_iterator, typename octet_iterator> +u16bit_iterator utf8to16 (octet_iterator start, octet_iterator end, u16bit_iterator result); + +
start
: an iterator pointing to the beginning of the UTF-8 encoded
string to convert. < br /> end
: an iterator pointing to
pass-the-end of the UTF-8 encoded string to convert.
- result
: an output iterator to the place in the UTF-16 string where to
+ result
: an output iterator to the place in the UTF-16 string where to
append the result of conversion.
- Return value: An iterator pointing to the place after the appended UTF-16
- string.
+ Return value: An iterator pointing to the place
+ after the appended UTF-16 string.
Example of use:
-char utf8_with_surrogates[] = "\xe6\x97\xa5\xd1\x88\xf0\x9d\x84\x9e"; -vector <unsigned short> utf16result; -utf8to16(utf8_with_surrogates, utf8_with_surrogates + 9, back_inserter(utf16result)); -assert (utf16result.size() == 4); -assert (utf16result[2] == 0xd834); -assert (utf16result[3] == 0xdd1e); +char utf8_with_surrogates[] = "\xe6\x97\xa5\xd1\x88\xf0\x9d\x84\x9e"; +vector <unsigned short> utf16result; +utf8to16(utf8_with_surrogates, utf8_with_surrogates + 9, back_inserter(utf16result)); +assert (utf16result.size() == 4); +assert (utf16result[2] == 0xd834); +assert (utf16result[3] == 0xdd1e);
In case of an invalid UTF-8 seqence, a utf8::invalid_utf8
exception is
@@ -418,27 +494,33 @@ assert (utf16result[3] == 0xdd1e);
Converts a UTF-32 encoded string to UTF-8.
-template <typename octet_iterator, typename u32bit_iterator>
- octet_iterator utf32to8 (u32bit_iterator start, u32bit_iterator end, octet_iterator
- result);
++template <typename octet_iterator, typename u32bit_iterator> +octet_iterator utf32to8 (u32bit_iterator start, u32bit_iterator end, octet_iterator result); + +
start
: an iterator pointing to the beginning of the UTF-32 encoded
string to convert.
- end
: an iterator pointing to pass-the-end of the UTF-32 encoded string
- to convert.
- result
: an output iterator to the place in the UTF-8 string where to
+ end
: an iterator pointing to pass-the-end of the UTF-32 encoded
+ string to convert.
+ result
: an output iterator to the place in the UTF-8 string where to
append the result of conversion.
- Return value: An iterator pointing to the place after the appended UTF-8
- string.
+ Return value: An iterator pointing to the place
+ after the appended UTF-8 string.
Example of use:
-int utf32string[] = {0x448, 0x65E5, 0x10346, 0}; -vector<unsigned char> utf8result; -utf32to8(utf32string, utf32string + 3, back_inserter(utf8result)); -assert (utf8result.size() == 9); +int utf32string[] = {0x448, 0x65E5, 0x10346, 0}; +vector<unsigned char> utf8result; +utf32to8(utf32string, utf32string + 3, back_inserter(utf8result)); +assert (utf8result.size() == 9);
In case of invalid UTF-32 string, a utf8::invalid_code_point
exception
@@ -450,27 +532,33 @@ assert (utf8result.size() == 9);
Converts a UTF-8 encoded string to UTF-32.
-template <typename octet_iterator, typename u32bit_iterator>
- u32bit_iterator utf8to32 (octet_iterator start, octet_iterator end, u32bit_iterator
- result);
++template <typename octet_iterator, typename u32bit_iterator> +u32bit_iterator utf8to32 (octet_iterator start, octet_iterator end, u32bit_iterator result); + +
start
: an iterator pointing to the beginning of the UTF-8 encoded
string to convert.
- end
: an iterator pointing to pass-the-end of the UTF-8 encoded string
+ end
: an iterator pointing to pass-the-end of the UTF-8 encoded string
to convert.
- result
: an output iterator to the place in the UTF-32 string where to
+ result
: an output iterator to the place in the UTF-32 string where to
append the result of conversion.
- Return value: An iterator pointing to the place after the appended UTF-32
- string.
+ Return value: An iterator pointing to the place
+ after the appended UTF-32 string.
Example of use:
-char* twochars = "\xe6\x97\xa5\xd1\x88"; -vector<int> utf32result; -utf8to32(twochars, twochars + 5, back_inserter(utf32result)); -assert (utf32result.size() == 2); +char* twochars = "\xe6\x97\xa5\xd1\x88"; +vector<int> utf32result; +utf8to32(twochars, twochars + 5, back_inserter(utf32result)); +assert (utf32result.size() == 2);
In case of an invalid UTF-8 seqence, a utf8::invalid_utf8
exception is
@@ -483,23 +571,30 @@ assert (utf32result.size() == 2);
Detects an invalid sequence within a UTF-8 string.
-template <typename octet_iterator> octet_iterator
- find_invalid(octet_iterator start, octet_iterator end);
++template <typename octet_iterator> +octet_iterator find_invalid(octet_iterator start, octet_iterator end); +
start
: an iterator pointing to the beginning of the UTF-8 string to
test for validity.
- end
: an iterator pointing to pass-the-end of the UTF-8 string to test
+ end
: an iterator pointing to pass-the-end of the UTF-8 string to test
for validity.
- Return value: an iterator pointing to the first invalid octet in the UTF-8
- string. In case none were found, equals end
.
+ Return value: an iterator pointing to the first
+ invalid octet in the UTF-8 string. In case none were found, equals
+ end
.
Example of use:
-char utf_invalid[] = "\xe6\x97\xa5\xd1\x88\xfa"; -char* invalid = find_invalid(utf_invalid, utf_invalid + 6); -assert (invalid == utf_invalid + 5); +char utf_invalid[] = "\xe6\x97\xa5\xd1\x88\xfa"; +char* invalid = find_invalid(utf_invalid, utf_invalid + 6); +assert (invalid == utf_invalid + 5);
This function is typically used to make sure a UTF-8 string is valid before @@ -512,20 +607,26 @@ assert (invalid == utf_invalid + 5);
Checks whether a sequence of octets is a valid UTF-8 string.
-template <typename octet_iterator> bool is_valid(octet_iterator start,
- octet_iterator end);
++template <typename octet_iterator> +bool is_valid(octet_iterator start, octet_iterator end); + +
start
: an iterator pointing to the beginning of the UTF-8 string to
test for validity.
- end
: an iterator pointing to pass-the-end of the UTF-8 string to test
+ end
: an iterator pointing to pass-the-end of the UTF-8 string to test
for validity.
- Return value: true
if the sequence is a valid UTF-8 string;
- false
if not.
+ Return value: true
if the sequence
+ is a valid UTF-8 string; false
if not.
-char utf_invalid[] = "\xe6\x97\xa5\xd1\x88\xfa"; -bool bvalid = is_valid(utf_invalid, utf_invalid + 6); +char utf_invalid[] = "\xe6\x97\xa5\xd1\x88\xfa"; +bool bvalid = is_valid(utf_invalid, utf_invalid + 6); assert (bvalid == false);
@@ -539,38 +640,42 @@ assert (bvalid == false);
Replaces all invalid UTF-8 sequences within a string with a replacement marker.
-
- template <typename octet_iterator, typename output_iterator>
- output_iterator replace_invalid(octet_iterator start, octet_iterator end,
- output_iterator out, uint32_t replacement);
-
- template <typename octet_iterator, typename output_iterator>
- output_iterator replace_invalid(octet_iterator start, octet_iterator end,
- output_iterator out);
-
+template <typename octet_iterator, typename output_iterator> +output_iterator replace_invalid(octet_iterator start, octet_iterator end, output_iterator out, uint32_t replacement); +template <typename octet_iterator, typename output_iterator> +output_iterator replace_invalid(octet_iterator start, octet_iterator end, output_iterator out); + +
start
: an iterator pointing to the beginning of the UTF-8 string to
look for invalid UTF-8 sequences.
- end
: an iterator pointing to pass-the-end of the UTF-8 string to look
+ end
: an iterator pointing to past-the-end of the UTF-8 string to look
for invalid UTF-8 sequences.
- out
: An output iterator to the range where the result of replacement
+ out
: An output iterator to the range where the result of replacement
is stored.
- replacement
: A Unicode code point for the replacement marker. The
+ replacement
: A Unicode code point for the replacement marker. The
version without this parameter assumes the value 0xfffd
- Return value: An iterator pointing to the place after the UTF-8 string with
- replaced invalid sequences.
+ Return value: An iterator pointing to the place
+ after the UTF-8 string with replaced invalid sequences.
Example of use:
-char invalid_sequence[] = "a\x80\xe0\xa0\xc0\xaf\xed\xa0\x80z"; -vector<char> replace_invalid_result; -replace_invalid (invalid_sequence, invalid_sequence + sizeof(invalid_sequence), back_inserter(replace_invalid_result), '?'); +char invalid_sequence[] = "a\x80\xe0\xa0\xc0\xaf\xed\xa0\x80z"; +vector<char> replace_invalid_result; +replace_invalid (invalid_sequence, invalid_sequence + sizeof(invalid_sequence), back_inserter(replace_invalid_result), '?'); bvalid = is_valid(replace_invalid_result.begin(), replace_invalid_result.end()); assert (bvalid); -char* fixed_invalid_sequence = "a????z"; +char* fixed_invalid_sequence = "a????z"; assert (std::equal(replace_invalid_result.begin(), replace_invalid_result.end(), fixed_invalid_sequence));
@@ -589,20 +694,25 @@ assert (std::equal(replace_invalid_result.begin(), replace_invalid_result.end(),
Checks whether a sequence of three octets is a UTF-8 byte order mark (BOM)
-template <typename octet_iterator> bool is_bom (octet_iterator
- it);
++template <typename octet_iterator> +bool is_bom (octet_iterator it); +
it
: beginning of the 3-octet sequence to check
- Return value: true
if the sequence is UTF-8 byte order mark;
- false
if not.
+ Return value: true
if the sequence
+ is UTF-8 byte order mark; false
if not.
Example of use:
-unsigned char byte_order_mark[] = {0xef, 0xbb, 0xbf}; -bool bbom = is_bom(byte_order_mark); -assert (bbom == true); +unsigned char byte_order_mark[] = {0xef, 0xbb, 0xbf}; +bool bbom = is_bom(byte_order_mark); +assert (bbom == true);
The typical use of this function is to check the first three bytes of a file. If @@ -619,23 +729,35 @@ assert (bbom == true); Encodes a 32 bit code point as a UTF-8 sequence of octets and appends the sequence to a UTF-8 string.
-template <typename octet_iterator> octet_iterator append(uint32_t cp,
- octet_iterator result);
++template <typename octet_iterator> +octet_iterator append(uint32_t cp, octet_iterator result); + +
cp
: A 32 bit integer representing a code point to append to the
sequence.
- result
: An output iterator to the place in the sequence where to
+ result
: An output iterator to the place in the sequence where to
append the code point.
- Return value: An iterator pointing to the place after the newly appended
- sequence.
+ Return value: An iterator pointing to the place
+ after the newly appended sequence.
Example of use:
-unsigned char u[5] = {0,0,0,0,0}; -unsigned char* end = unchecked::append(0x0448, u); -assert (u[0] == 0xd1 && u[1] == 0x88 && u[2] == 0 && u[3] == 0 && u[4] == 0); +unsigned char u[5] = {0,0,0,0,0}; +unsigned char* end = unchecked::append(0x0448, u); +assert (u[0] == 0xd1 && u[1] == 0x88 && u[2] == 0 && u[3] == 0 && u[4] == 0);
This is a quicker but less safe version of utf8::append
. It does not
@@ -649,23 +771,29 @@ assert (u[0] == 0xd1 && u[1] == 0x88 && u[2] == 0 && u[3
Given the iterator to the beginning of a UTF-8 sequence, it returns the code point
and moves the iterator to the next position.
template <typename octet_iterator> uint32_t next(octet_iterator&
- it);
++template <typename octet_iterator> +uint32_t next(octet_iterator& it); + +
it
: a reference to an iterator pointing to the beginning of an UTF-8
encoded code point. After the function returns, it is incremented to point to the
beginning of the next code point.
- Return value: the 32 bit representation of the processed UTF-8 code point.
+ Return value: the 32 bit representation of the
+ processed UTF-8 code point.
Example of use:
-char* twochars = "\xe6\x97\xa5\xd1\x88"; -char* w = twochars; -int cp = unchecked::next(w); -assert (cp == 0x65e5); -assert (w == twochars + 3); +char* twochars = "\xe6\x97\xa5\xd1\x88"; +char* w = twochars; +int cp = unchecked::next(w); +assert (cp == 0x65e5); +assert (w == twochars + 3);
This is a quicker but less safe version of utf8::next
. It does not
@@ -679,22 +807,28 @@ assert (w == twochars + 3);
decreases the iterator until it hits the beginning of the previous UTF-8 encoded
code point and returns the 32 bits representation of the code point.
template <typename octet_iterator> uint32_t previous(octet_iterator&
- it);
++template <typename octet_iterator> +uint32_t previous(octet_iterator& it); + +
it
: a reference pointing to an octet within a UTF-8 encoded string.
After the function returns, it is decremented to point to the beginning of the
previous code point.
- Return value: the 32 bit representation of the previous code point.
+ Return value: the 32 bit representation of the
+ previous code point.
Example of use:
-char* twochars = "\xe6\x97\xa5\xd1\x88"; -char* w = twochars + 3; -int cp = unchecked::previous (w); -assert (cp == 0x65e5); +char* twochars = "\xe6\x97\xa5\xd1\x88"; +char* w = twochars + 3; +int cp = unchecked::previous (w); +assert (cp == 0x65e5); assert (w == twochars);
@@ -708,23 +842,28 @@ assert (w == twochars); Advances an iterator by the specified number of code points within an UTF-8 sequence.
-template <typename octet_iterator, typename distance_type> void advance
- (octet_iterator& it, distance_type n);
++template <typename octet_iterator, typename distance_type> +void advance (octet_iterator& it, distance_type n); + +
it
: a reference to an iterator pointing to the beginning of an UTF-8
encoded code point. After the function returns, it is incremented to point to the
nth following code point.
- n
: a positive integer that shows how many code points we want to
+ n
: a positive integer that shows how many code points we want to
advance.
Example of use:
-char* twochars = "\xe6\x97\xa5\xd1\x88"; -char* w = twochars; -unchecked::advance (w, 2); -assert (w == twochars + 5); +char* twochars = "\xe6\x97\xa5\xd1\x88"; +char* w = twochars; +unchecked::advance (w, 2); +assert (w == twochars + 5);
This function works only "forward". In case of a negative n
, there is
@@ -741,23 +880,29 @@ assert (w == twochars + 5);
Given the iterators to two UTF-8 encoded code points in a seqence, returns the
number of code points between them.
template <typename octet_iterator> typename
- std::iterator_traits<octet_iterator>::difference_type distance (octet_iterator
- first, octet_iterator last);
++template <typename octet_iterator> +typename std::iterator_traits<octet_iterator>::difference_type distance (octet_iterator first, octet_iterator last); +
first
: an iterator to a beginning of a UTF-8 encoded code point.
- last
: an iterator to a "post-end" of the last UTF-8 encoded code point
- in the sequence we are trying to determine the length. It can be the beginning of a
- new code point, or not.
- Return value the distance between the iterators, in code points.
+ last
: an iterator to a "post-end" of the last UTF-8 encoded code
+ point in the sequence we are trying to determine the length. It can be the
+ beginning of a new code point, or not.
+ Return value: the distance between the iterators,
+ in code points.
Example of use:
-char* twochars = "\xe6\x97\xa5\xd1\x88"; -size_t dist = utf8::unchecked::distance(twochars, twochars + 5); -assert (dist == 2); +char* twochars = "\xe6\x97\xa5\xd1\x88"; +size_t dist = utf8::unchecked::distance(twochars, twochars + 5); +assert (dist == 2);
This is a quicker but less safe version of utf8::distance
. It does not
@@ -769,26 +914,35 @@ assert (dist == 2);
Converts a UTF-16 encoded string to UTF-8.
-template <typename u16bit_iterator, typename octet_iterator>
- octet_iterator utf16to8 (u16bit_iterator start, u16bit_iterator end, octet_iterator
- result);
++template <typename u16bit_iterator, typename octet_iterator> +octet_iterator utf16to8 (u16bit_iterator start, u16bit_iterator end, octet_iterator result); + +
start
: an iterator pointing to the beginning of the UTF-16 encoded
string to convert.
- end
: an iterator pointing to pass-the-end of the UTF-16 encoded string
- to convert.
- result
: an output iterator to the place in the UTF-8 string where to
- append the result of conversion. Return value: An iterator pointing to the
- place after the appended UTF-8 string.
+ end
: an iterator pointing to past-the-end of the UTF-16 encoded
+ string to convert.
+ result
: an output iterator to the place in the UTF-8 string where to
+ append the result of conversion.
+ Return value: An iterator pointing to the place
+ after the appended UTF-8 string.
Example of use:
-unsigned short utf16string[] = {0x41, 0x0448, 0x65e5, 0xd834, 0xdd1e}; -vector<unsigned char> utf8result; -unchecked::utf16to8(utf16string, utf16string + 5, back_inserter(utf8result)); -assert (utf8result.size() == 10); +unsigned short utf16string[] = {0x41, 0x0448, 0x65e5, 0xd834, 0xdd1e}; +vector<unsigned char> utf8result; +unchecked::utf16to8(utf16string, utf16string + 5, back_inserter(utf8result)); +assert (utf8result.size() == 10);
This is a quicker but less safe version of utf8::utf16to8
. It does not
@@ -800,28 +954,35 @@ assert (utf8result.size() == 10);
Converts an UTF-8 encoded string to UTF-16
-template <typename u16bit_iterator, typename octet_iterator>
- u16bit_iterator utf8to16 (octet_iterator start, octet_iterator end, u16bit_iterator
- result);
++template <typename u16bit_iterator, typename octet_iterator> +u16bit_iterator utf8to16 (octet_iterator start, octet_iterator end, u16bit_iterator result); + +
start
: an iterator pointing to the beginning of the UTF-8 encoded
string to convert. < br /> end
: an iterator pointing to
pass-the-end of the UTF-8 encoded string to convert.
- result
: an output iterator to the place in the UTF-16 string where to
+ result
: an output iterator to the place in the UTF-16 string where to
append the result of conversion.
- Return value: An iterator pointing to the place after the appended UTF-16
- string.
+ Return value: An iterator pointing to the place
+ after the appended UTF-16 string.
Example of use:
-char utf8_with_surrogates[] = "\xe6\x97\xa5\xd1\x88\xf0\x9d\x84\x9e"; -vector <unsigned short> utf16result; -unchecked::utf8to16(utf8_with_surrogates, utf8_with_surrogates + 9, back_inserter(utf16result)); -assert (utf16result.size() == 4); -assert (utf16result[2] == 0xd834); -assert (utf16result[3] == 0xdd1e); +char utf8_with_surrogates[] = "\xe6\x97\xa5\xd1\x88\xf0\x9d\x84\x9e"; +vector <unsigned short> utf16result; +unchecked::utf8to16(utf8_with_surrogates, utf8_with_surrogates + 9, back_inserter(utf16result)); +assert (utf16result.size() == 4); +assert (utf16result[2] == 0xd834); +assert (utf16result[3] == 0xdd1e);
This is a quicker but less safe version of utf8::utf8to16
. It does not
@@ -833,27 +994,34 @@ assert (utf16result[3] == 0xdd1e);
Converts a UTF-32 encoded string to UTF-8.
-template <typename octet_iterator, typename u32bit_iterator>
- octet_iterator utf32to8 (u32bit_iterator start, u32bit_iterator end, octet_iterator
- result);
++template <typename octet_iterator, typename u32bit_iterator> +octet_iterator utf32to8 (u32bit_iterator start, u32bit_iterator end, octet_iterator result); + +
start
: an iterator pointing to the beginning of the UTF-32 encoded
string to convert.
- end
: an iterator pointing to pass-the-end of the UTF-32 encoded string
- to convert.
- result
: an output iterator to the place in the UTF-8 string where to
+ end
: an iterator pointing to past-the-end of the UTF-32 encoded
+ string to convert.
+ result
: an output iterator to the place in the UTF-8 string where to
append the result of conversion.
- Return value: An iterator pointing to the place after the appended UTF-8
- string.
+ Return value: An iterator pointing to the place
+ after the appended UTF-8 string.
Example of use:
-int utf32string[] = {0x448, 0x65E5, 0x10346, 0}; -vector<unsigned char> utf8result; -utf32to8(utf32string, utf32string + 3, back_inserter(utf8result)); -assert (utf8result.size() == 9); +int utf32string[] = {0x448, 0x65e5, 0x10346, 0}; +vector<unsigned char> utf8result; +utf32to8(utf32string, utf32string + 3, back_inserter(utf8result)); +assert (utf8result.size() == 9);
This is a quicker but less safe version of utf8::utf32to8
. It does not
@@ -865,27 +1033,32 @@ assert (utf8result.size() == 9);
Converts a UTF-8 encoded string to UTF-32.
-template <typename octet_iterator, typename u32bit_iterator>
- u32bit_iterator utf8to32 (octet_iterator start, octet_iterator end, u32bit_iterator
- result);
++template <typename octet_iterator, typename u32bit_iterator> +u32bit_iterator utf8to32 (octet_iterator start, octet_iterator end, u32bit_iterator result); + +
start
: an iterator pointing to the beginning of the UTF-8 encoded
string to convert.
- end
: an iterator pointing to pass-the-end of the UTF-8 encoded string
+ end
: an iterator pointing to past-the-end of the UTF-8 encoded string
to convert.
- result
: an output iterator to the place in the UTF-32 string where to
+ result
: an output iterator to the place in the UTF-32 string where to
append the result of conversion.
- Return value: An iterator pointing to the place after the appended UTF-32
- string.
+ Return value: An iterator pointing to the place
+ after the appended UTF-32 string.
Example of use:
-char* twochars = "\xe6\x97\xa5\xd1\x88"; -vector<int> utf32result; -unchecked::utf8to32(twochars, twochars + 5, back_inserter(utf32result)); -assert (utf32result.size() == 2); +char* twochars = "\xe6\x97\xa5\xd1\x88"; +vector<int> utf32result; +unchecked::utf8to32(twochars, twochars + 5, back_inserter(utf32result)); +assert (utf32result.size() == 2);
This is a quicker but less safe version of utf8::utf8to32
. It does not
diff --git a/v2_0/source/utf8/checked.h b/v2_0/source/utf8/checked.h
index 980be27..4647016 100644
--- a/v2_0/source/utf8/checked.h
+++ b/v2_0/source/utf8/checked.h
@@ -29,7 +29,7 @@ DEALINGS IN THE SOFTWARE.
#define UTF8_FOR_CPP_CHECKED_H_2675DCD0_9480_4c0c_B92A_CC14C027B731
#include "core.h"
-#include