diff --git a/v1_0/doc/utf8cpp.html b/v1_0/doc/utf8cpp.html index d97edf4..b18a8ae 100644 --- a/v1_0/doc/utf8cpp.html +++ b/v1_0/doc/utf8cpp.html @@ -1,724 +1,1097 @@ - + - - - - -UTF8-CPP: UTF-8 with C++ in a Portable Way - - -

The Sourceforge project page

-

Table of Contents

- -

Introduction

-

Many C++ developers miss an easy and portable way of handling -Unicode encoded strings. C++ Standard is currently Unicode -agnostic, and while some work is being done to introduce Unicode to -the next incarnation called C++0x, for the moment nothing of the -sort is available. In the meantime, developers use 3rd party -libraries like ICU, OS specific capabilities, or simply roll out -their own solutions.

-

In order to easily handle UTF-8 encoded Unicode strings, I have -come up with a set of template functions. For anybody used to work -with STL algorithms, they should be easy and natural to use. The -code is freely available for any purpose - check out the license at -the beginning of the utf8.h file. Be aware, though, that while I -did some testing, this library has not been used in production yet. -If you run into bugs or performance issues, please let me know and -I'll do my best to address them.

-

The purpose of this article is not to offer an introduction to -Unicode in general, and UTF-8 in particular. If you are not -familiar with Unicode, be sure to check out Unicode Home Page or some other -source of information for Unicode. Also, it is not my aim to -advocate the use of UTF-8 encoded strings in C++ programs; if you -want to handle UTF-8 encoded strings from C++, I am sure you have -good reasons for it.

-

Examples of use

-

To illustrate the use of this utf8 library, we shall open a file -containing UTF-8 encoded text, check whether it starts with a byte order mark, -read each line into a std::string, check it for validity, convert the text to UTF-16, -and back to UTF-8:

+ + + + + + + UTF8-CPP: UTF-8 with C++ in a Portable Way + + + + +

+ The Sourceforge project page +

+
+

+ Table of Contents +

+ +
+

+ Introduction +

+

+ Many C++ developers miss an easy and portable way of handling Unicode encoded + strings. The C++ Standard is currently Unicode agnostic, and while some work is being + done to introduce Unicode to the next incarnation called C++0x, for the moment + nothing of the sort is available. In the meantime, developers use 3rd party + libraries like ICU, OS specific capabilities, or simply roll their own + solutions. +

+

+ In order to easily handle UTF-8 encoded Unicode strings, I have come up with a set + of template functions. For anybody used to working with STL algorithms, they should be + easy and natural to use. The code is freely available for any purpose - check out + the license at the beginning of the utf8.h file. Be aware, though, that while I did + some testing, this library has not been used in production yet. If you run into + bugs or performance issues, please let me know and I'll do my best to address them. +

+

+ The purpose of this article is not to offer an introduction to Unicode in general, + and UTF-8 in particular. If you are not familiar with Unicode, be sure to check out + Unicode Home Page or some other source of + information for Unicode. Also, it is not my aim to advocate the use of UTF-8 + encoded strings in C++ programs; if you want to handle UTF-8 encoded strings from + C++, I am sure you have good reasons for it. +

+

+ Examples of use +

+

+ To illustrate the use of this utf8 library, we shall open a file containing UTF-8 + encoded text, check whether it starts with a byte order mark, read each line into a + std::string, check it for validity, convert the text to UTF-16, and + back to UTF-8: +

-#include <fstream>
-#include <iostream>
-#include <string>
-#include <vector>
-using namespace std;
-
-int main()
+#include <fstream>
+#include <iostream>
+#include <string>
+#include <vector>
+using namespace std;
+int main()
 {
-    if (argc != 2) {
-        cout << "\nUsage: docsample filename\n";
-        return 0;
+    if (argc != 2) {
+        cout << "\nUsage: docsample filename\n";
+        return 0;
     }
-    const char* test_file_path = argv[1];
-    // Open the test file (must be UTF-8 encoded)
+    const char* test_file_path = argv[1];
+    // Open the test file (must be UTF-8 encoded)
     ifstream fs8(test_file_path);
-    if (!fs8.is_open()) {
-    cout << "Could not open " << test_file_path << endl;
-    return 0;
+    if (!fs8.is_open()) {
+    cout << "Could not open " << test_file_path << endl;
+    return 0;
     }
-
-    // Read the first line of the file
-    unsigned line_count = 1;
+    // Read the first line of the file
+    unsigned line_count = 1;
     string line;
-    if (!getline(fs8, line)) 
-        return 0;
-
-    // Look for utf-8 byte-order mark at the beginning
-    if (line.size() > 2) {
-        if (utf8::is_bom(line.c_str()))
-          cout << "There is a byte order mark at the beginning of the file\n";
+    if (!getline(fs8, line)) 
+        return 0;
+    // Look for utf-8 byte-order mark at the beginning
+    if (line.size() > 2) {
+        if (utf8::is_bom(line.c_str()))
+            cout << "There is a byte order mark at the beginning of the file\n";
     }
-
-    // Play with all the lines in the file
-    do {
+    // Play with all the lines in the file
+    do {
         // check for invalid utf-8 (for a simple yes/no check, there is also utf8::is_valid function)
         string::iterator end_it = utf8::find_invalid(line.begin(), line.end());
-        if (end_it != line.end()) {
-            cout << "Invalid UTF-8 encoding detected at line " << line_count << "\n";
-            cout << "This part is fine: " << string(line.begin(), end_it) << "\n";
+        if (end_it != line.end()) {
+            cout << "Invalid UTF-8 encoding detected at line " << line_count << "\n";
+            cout << "This part is fine: " << string(line.begin(), end_it) << "\n";
         }
-        // Get the line length (at least for the valid part)
-        int length = utf8::distance(line.begin(), end_it);
-        cout << "Length of line " << line_count << " is " << length <<  "\n";
-
-        // Convert it to utf-16
+        // Get the line length (at least for the valid part)
+        int length = utf8::distance(line.begin(), end_it);
+        cout << "Length of line " << line_count << " is " << length <<  "\n";
+        // Convert it to utf-16
         vector<unsigned short> utf16line;
         utf8::utf8to16(line.begin(), end_it, back_inserter(utf16line));
-        // And back to utf-8;
+        // And back to utf-8
         string utf8line; 
         utf8::utf16to8(utf16line.begin(), utf16line.end(), back_inserter(utf8line));
-        // Confirm that the conversion went OK:
-        if (utf8line != string(line.begin(), end_it))
-            cout << "Error in UTF-16 conversion at line: " << line_count << "\n";        
-
+        // Confirm that the conversion went OK:
+        if (utf8line != string(line.begin(), end_it))
+            cout << "Error in UTF-16 conversion at line: " << line_count << "\n";        
         getline(fs8, line);
         line_count++;
-    } while (!fs8.eof());
-
-    return 0;
+    } while (!fs8.eof());
+    return 0;
 }
 
-

In the previous code sample, we have seen the use of the following functions -from utf8 namespace: first we used is_bom -function to detect UTF-8 byte order mark at the beginning of the -file; then for each line we performed a detection of invalid UTF-8 sequences with find_invalid; -the number of characters (more precisely - the number of Unicode code points) in each line was determined -with a use of utf8::distance; finally, we have converted each line to UTF-16 encoding with -utf8to16 and back to UTF-8 with utf16to8. -

-

Reference

- -

Functions From utf8 Namespace

-

utf8::append

-

Encodes a 32 bit code point as a UTF-8 sequence of octets and -appends the sequence to a UTF-8 string.

-template <typename octet_iterator> octet_iterator -append(uint32_t cp, octet_iterator result); -

cp: A 32 bit integer representing a code point to -append to the sequence.
-result: An output iterator to the place in the -sequence where to append the code point.
-Return value: An iterator pointing to the place after the -newly appended sequence.

-

Example of use:

+

+ In the previous code sample, we have seen the use of the following functions from + the utf8 namespace: first we used the is_bom function to detect + a UTF-8 byte order mark at the beginning of the file; then for each line we performed + a detection of invalid UTF-8 sequences with find_invalid; the number + of characters (more precisely - the number of Unicode code points) in each line was + determined with utf8::distance; finally, we converted + each line to UTF-16 encoding with utf8to16 and back to UTF-8 with + utf16to8. +

+

+ Reference +

+

+ Functions From utf8 Namespace +

+

+ utf8::append +

+

+ Encodes a 32 bit code point as a UTF-8 sequence of octets and appends the sequence + to a UTF-8 string. +

-unsigned char u[5] = {0,0,0,0,0};
-
-unsigned char* end = append(0x0448, u);
-
-assert (u[0] == 0xd1 && u[1] == 0x88 && u[2] == 0 && u[3] == 0 && u[4] == 0);
+template <typename octet_iterator>
+octet_iterator append(uint32_t cp, octet_iterator result);
+   
 
-

Note that append does not allocate any memory - it -is the burden of the caller to make sure there is enough memory -allocated for the operation. To make things more interesting, -append can add anywhere between 1 and 4 octets to the -sequence. In practice, you would most often want to use -std::back_inserter to ensure that the necessary memory -is allocated.

-

In case of an invalid code point, a -utf8::invalid_code_point exception is thrown.

-

utf8::next

-

Given the iterator to the beginning of the UTF-8 sequence, it -returns the code point and moves the iterator to the next -position.

-template <typename octet_iterator> uint32_t -next(octet_iterator& it, octet_iterator end); -

it: a reference to an iterator pointing to the -beginning of an UTF-8 encoded code point. After the function -returns, it is incremented to point to the beginning of the next -code point.
-end: end of the UTF-8 sequence to be processed. If -it gets equal to end during the -extraction of a code point, an utf8::not_enough_room -exception is thrown.
-Return value: the 32 bit representation of the processed -UTF-8 code point.

-

Example of use:

+

+ cp: A 32 bit integer representing a code point to append to the + sequence.
+ result: An output iterator to the place in the sequence where to + append the code point.
+ Return value: An iterator pointing to the place + after the newly appended sequence. +

+

+ Example of use: +

-char* twochars = "\xe6\x97\xa5\xd1\x88";
-char* w = twochars;
-
-int cp = next(w, twochars + 6);
-
-assert (cp == 0x65e5);
-assert (w == twochars + 3);
+unsigned char u[5] = {0,0,0,0,0};
+unsigned char* end = append(0x0448, u);
+assert (u[0] == 0xd1 && u[1] == 0x88 && u[2] == 0 && u[3] == 0 && u[4] == 0);
 
-

This function is typically used to iterate through a UTF-8 -encoded string.

-

In case of an invalid UTF-8 seqence, a -utf8::invalid_utf8 exception is thrown.

-

utf8::previous

-

Given a reference to an iterator pointing to an octet in a UTF-8 -seqence, it decreases the iterator until it hits the beginning of -the previous UTF-8 encoded code point and returns the 32 bits -representation of the code point.

-template <typename octet_iterator> uint32_t -previous(octet_iterator& it, octet_iterator pass_start); -

it: a reference pointing to an octet within a UTF-8 -encoded string. After the function returns, it is decremented to -point to the beginning of the previous code point.
-pass_start: an iterator to the point in the sequence -where the search for the beginning of a code point is aborted if no -result was reached. It is a safety measure to prevent passing the -beginning of the string in the search for a UTF-8 lead octet.
-Return value: the 32 bit representation of the previous code -point.

-

Example of use:

+

+ Note that append does not allocate any memory - it is the burden of + the caller to make sure there is enough memory allocated for the operation. To make + things more interesting, append can add anywhere between 1 and 4 + octets to the sequence. In practice, you would most often want to use + std::back_inserter to ensure that the necessary memory is allocated. +

+

+ In case of an invalid code point, a utf8::invalid_code_point exception + is thrown. +

+

+ utf8::next +

+

+ Given the iterator to the beginning of the UTF-8 sequence, it returns the code + point and moves the iterator to the next position. +

-char* twochars = "\xe6\x97\xa5\xd1\x88";
-unsigned char* w = twochars + 3;
-
-int cp = previous (w, twochars - 1);
-
-assert (cp == 0x65e5);
+template <typename octet_iterator> 
+uint32_t next(octet_iterator& it, octet_iterator end);
+   
+
+

+ it: a reference to an iterator pointing to the beginning of an UTF-8 + encoded code point. After the function returns, it is incremented to point to the + beginning of the next code point.
+ end: end of the UTF-8 sequence to be processed. If it + gets equal to end during the extraction of a code point, an + utf8::not_enough_room exception is thrown.
+ Return value: the 32 bit representation of the + processed UTF-8 code point. +

+

+ Example of use: +

+
+char* twochars = "\xe6\x97\xa5\xd1\x88";
+char* w = twochars;
+int cp = next(w, twochars + 6);
+assert (cp == 0x65e5);
+assert (w == twochars + 3);
+
+

+ This function is typically used to iterate through a UTF-8 encoded string. +

+

+ In case of an invalid UTF-8 sequence, a utf8::invalid_utf8 exception is + thrown. +

+

+ utf8::previous +

+

+ Given a reference to an iterator pointing to an octet in a UTF-8 sequence, it + decreases the iterator until it hits the beginning of the previous UTF-8 encoded + code point and returns the 32 bit representation of the code point. +

+
+template <typename octet_iterator> 
+uint32_t previous(octet_iterator& it, octet_iterator pass_start);
+   
+
+

+ it: a reference pointing to an octet within a UTF-8 encoded string. + After the function returns, it is decremented to point to the beginning of the + previous code point.
+ pass_start: an iterator to the point in the sequence where the search + for the beginning of a code point is aborted if no result was reached. It is a + safety measure to prevent passing the beginning of the string in the search for a + UTF-8 lead octet.
+ Return value: the 32 bit representation of the + previous code point. +

+

+ Example of use: +

+
+char* twochars = "\xe6\x97\xa5\xd1\x88";
+unsigned char* w = twochars + 3;
+int cp = previous (w, twochars - 1);
+assert (cp == 0x65e5);
 assert (w == twochars);
 
-

The primary purpose of this function is to iterate backwards -through a UTF-8 encoded string. Therefore, it will -typically point to the beginning of a code point, and -pass_start will point to the octet just before the -beginning of the string to ensure we don't go backwards too far. -it is decreased until it points to a lead UTF-8 octet, -and then the UTF-8 sequence beginning with that octet is decoded to -a 32 bit representation and returned.

-

In case pass_end is reached before a UTF-8 lead -octet is hit, or if an invalid UTF-8 sequence is started by the -lead octet, an invalid_utf8 exception is thrown

-

utf8::advance

-

Advances an iterator by the specified number of code points -within an UTF-8 sequence.

-template <typename octet_iterator, typename -distance_type> void advance (octet_iterator& it, -distance_type n, octet_iterator end); -

it: a reference to an iterator pointing to the -beginning of an UTF-8 encoded code point. After the function -returns, it is incremented to point to the nth following code -point.
-n: a positive integer that shows how many code points -we want to advance.
-end: end of the UTF-8 sequence to be processed. If -it gets equal to end during the -extraction of a code point, an utf8::not_enough_room -exception is thrown.

-

Example of use:

+

+ The primary purpose of this function is to iterate backwards through a UTF-8 + encoded string. Therefore, it will typically point to the beginning of + a code point, and pass_start will point to the octet just before the + beginning of the string to ensure we don't go backwards too far. it is + decreased until it points to a lead UTF-8 octet, and then the UTF-8 sequence + beginning with that octet is decoded to a 32 bit representation and returned. +

+

+ In case pass_start is reached before a UTF-8 lead octet is hit, or if an + invalid UTF-8 sequence is started by the lead octet, an invalid_utf8 + exception is thrown. +

+

+ utf8::advance +

+

+ Advances an iterator by the specified number of code points within an UTF-8 + sequence. +

-char* twochars = "\xe6\x97\xa5\xd1\x88";
-unsigned char* w = twochars;
-
-advance (w, 2, twochars + 6);
-
-assert (w == twochars + 5);
+template <typename octet_iterator, typename distance_type> 
+void advance (octet_iterator& it, distance_type n, octet_iterator end);
+   
 
-

This function works only "forward". In case of a negative -n, there is no effect.

-

In case of an invalid code point, a -utf8::invalid_code_point exception is thrown.

-

utf8::distance

-

Given the iterators to two UTF-8 encoded code points in a -seqence, returns the number of code points between them.

-template <typename octet_iterator> typename -std::iterator_traits<octet_iterator>::difference_type -distance (octet_iterator first, octet_iterator last); -

first: an iterator to a beginning of a UTF-8 -encoded code point.
-last: an iterator to a "post-end" of the last UTF-8 -encoded code point in the sequence we are trying to determine the -length. It can be the beginning of a new code point, or not.
-Return value the distance between the iterators, in code -points.

-

Example of use:

+

+ it: a reference to an iterator pointing to the beginning of an UTF-8 + encoded code point. After the function returns, it is incremented to point to the + nth following code point.
+ n: a positive integer that shows how many code points we want to + advance.
+ end: end of the UTF-8 sequence to be processed. If it + gets equal to end during the extraction of a code point, an + utf8::not_enough_room exception is thrown.
+

+

+ Example of use: +

-char* twochars = "\xe6\x97\xa5\xd1\x88";
-
-size_t dist = utf8::distance(twochars, twochars + 5);
-
-assert (dist == 2);
+char* twochars = "\xe6\x97\xa5\xd1\x88";
+unsigned char* w = twochars;
+advance (w, 2, twochars + 6);
+assert (w == twochars + 5);
 
-

This function is used to find the length (in code points) of a -UTF-8 encoded string. The reason it is called distance, -rather than, say, length is mainly because developers are -used that length is an O(1) function. Computing the length -of an UTF-8 string is a linear operation, and it looked better to -model it after std::distance algorithm.

-

In case of an invalid UTF-8 seqence, a -utf8::invalid_utf8 exception is thrown. If -last does not point to the past-of-end of a UTF-8 -seqence, a utf8::not_enough_room exception is -thrown.

-

utf8::utf16to8

-

Converts a UTF-16 encoded string to UTF-8.

-template <typename u16bit_iterator, typename -octet_iterator> octet_iterator utf16to8 (u16bit_iterator start, -u16bit_iterator end, octet_iterator result); -

start: an iterator pointing to the beginning of the -UTF-16 encoded string to convert.
-end: an iterator pointing to pass-the-end of the -UTF-16 encoded string to convert.
-result: an output iterator to the place in the UTF-8 -string where to append the result of conversion.
-Return value: An iterator pointing to the place after the appended UTF-8 string.

-

Example of use:

+

+ This function works only "forward". In case of a negative n, there is + no effect. +

+

+ In case of an invalid code point, a utf8::invalid_code_point exception + is thrown. +

+

+ utf8::distance +

+

+ Given the iterators to two UTF-8 encoded code points in a sequence, returns the + number of code points between them. +

-unsigned short utf16string[] = {0x41, 0x0448, 0x65e5, 0xd834, 0xdd1e};
-vector<unsigned char> utf8result;
-
-utf16to8(utf16string, utf16string + 5, back_inserter(utf8result));
-
-assert (utf8result.size() == 10);    
+template <typename octet_iterator> 
+typename std::iterator_traits<octet_iterator>::difference_type distance (octet_iterator first, octet_iterator last);
+   
 
-

In case of invalid UTF-16 sequence, a -utf8::invalid_utf16 exception is thrown.

-

utf8::utf8to16

-

Converts an UTF-8 encoded string to UTF-16

-template <typename u16bit_iterator, typename -octet_iterator> u16bit_iterator utf8to16 (octet_iterator start, -octet_iterator end, u16bit_iterator result); -

start: an iterator pointing to the beginning of the -UTF-8 encoded string to convert. < br /> end: an -iterator pointing to pass-the-end of the UTF-8 encoded string to -convert.
-result: an output iterator to the place in the UTF-16 -string where to append the result of conversion.
-Return value: An iterator pointing to the place after the appended UTF-16 string.

-

Example of use:

+

+ first: an iterator to a beginning of a UTF-8 encoded code point.
+ last: an iterator to a "post-end" of the last UTF-8 encoded code + point in the sequence whose length we are trying to determine. It can be the + beginning of a new code point, or not.
+ Return value: the distance between the iterators, + in code points. +

+

+ Example of use: +

-char utf8_with_surrogates[] = "\xe6\x97\xa5\xd1\x88\xf0\x9d\x84\x9e";
-vector <unsigned short> utf16result;
-
-utf8to16(utf8_with_surrogates, utf8_with_surrogates + 9, back_inserter(utf16result));
-
-assert (utf16result.size() == 4);
-assert (utf16result[2] == 0xd834);
-assert (utf16result[3] == 0xdd1e);
+char* twochars = "\xe6\x97\xa5\xd1\x88";
+size_t dist = utf8::distance(twochars, twochars + 5);
+assert (dist == 2);
 
-

In case of an invalid UTF-8 seqence, a -utf8::invalid_utf8 exception is thrown. If -last does not point to the past-of-end of a UTF-8 -seqence, a utf8::not_enough_room exception is -thrown.

-

utf8::utf32to8

-

Converts a UTF-32 encoded string to UTF-8.

-template <typename octet_iterator, typename -u32bit_iterator> octet_iterator utf32to8 (u32bit_iterator start, -u32bit_iterator end, octet_iterator result); -

start: an iterator pointing to the beginning of the -UTF-32 encoded string to convert.
-end: an iterator pointing to pass-the-end of the -UTF-32 encoded string to convert.
-result: an output iterator to the place in the UTF-8 -string where to append the result of conversion.
-Return value: An iterator pointing to the place after the appended UTF-8 string.

-

Example of use:

+

+ This function is used to find the length (in code points) of a UTF-8 encoded + string. The reason it is called distance, rather than, say, + length is mainly because developers are used to length being an + O(1) function. Computing the length of a UTF-8 string is a linear operation, and + it looked better to model it after the std::distance algorithm. +

+

+ In case of an invalid UTF-8 sequence, a utf8::invalid_utf8 exception is + thrown. If last does not point to the past-of-end of a UTF-8 sequence, + a utf8::not_enough_room exception is thrown. +

+

+ utf8::utf16to8 +

+

+ Converts a UTF-16 encoded string to UTF-8. +

-int utf32string[] = {0x448, 0x65E5, 0x10346, 0};
-vector<unsigned char> utf8result;
-
-utf32to8(utf32string, utf32string + 3, back_inserter(utf8result));
-
-assert (utf8result.size() == 9);
+template <typename u16bit_iterator, typename octet_iterator>
+octet_iterator utf16to8 (u16bit_iterator start, u16bit_iterator end, octet_iterator result);
+   
 
-

In case of invalid UTF-32 string, a -utf8::invalid_code_point exception is thrown.

-

utf8::utf8to32

-

Converts a UTF-8 encoded string to UTF-32.

-template <typename octet_iterator, typename -u32bit_iterator> u32bit_iterator utf8to32 (octet_iterator start, -octet_iterator end, u32bit_iterator result); -

start: an iterator pointing to the beginning of the -UTF-8 encoded string to convert.
-end: an iterator pointing to pass-the-end of the UTF-8 -encoded string to convert.
-result: an output iterator to the place in the UTF-32 -string where to append the result of conversion.
-Return value: An iterator pointing to the place after the appended UTF-32 string.

-

Example of use:

+

+ start: an iterator pointing to the beginning of the UTF-16 encoded + string to convert.
+ end: an iterator pointing to pass-the-end of the UTF-16 encoded + string to convert.
+ result: an output iterator to the place in the UTF-8 string where to + append the result of conversion.
+ Return value: An iterator pointing to the place + after the appended UTF-8 string. +

+

+ Example of use: +

-char* twochars = "\xe6\x97\xa5\xd1\x88";
-vector<int> utf32result;
-
-utf8to32(twochars, twochars + 5, back_inserter(utf32result));
-
-assert (utf32result.size() == 2);
+unsigned short utf16string[] = {0x41, 0x0448, 0x65e5, 0xd834, 0xdd1e};
+vector<unsigned char> utf8result;
+utf16to8(utf16string, utf16string + 5, back_inserter(utf8result));
+assert (utf8result.size() == 10);    
 
-

In case of an invalid UTF-8 seqence, a -utf8::invalid_utf8 exception is thrown. If -last does not point to the past-of-end of a UTF-8 -seqence, a utf8::not_enough_room exception is -thrown.

-

utf8::find_invalid

-

Detects an invalid sequence within a UTF-8 string.

-template <typename octet_iterator> octet_iterator -find_invalid(octet_iterator start, octet_iterator end); -

start: an iterator pointing to the beginning of the -UTF-8 string to test for validity.
-end: an iterator pointing to pass-the-end of the UTF-8 -string to test for validity.
-Return value: an iterator pointing to the first invalid -octet in the UTF-8 string. In case none were found, equals -end.

-

Example of use:

+

+ In case of invalid UTF-16 sequence, a utf8::invalid_utf16 exception is + thrown. +

+

+ utf8::utf8to16 +

+

+ Converts a UTF-8 encoded string to UTF-16. +

-char utf_invalid[] = "\xe6\x97\xa5\xd1\x88\xfa";
-
-char* invalid = find_invalid(utf_invalid, utf_invalid + 6);
-
-assert (invalid == utf_invalid + 5);
+template <typename u16bit_iterator, typename octet_iterator>
+u16bit_iterator utf8to16 (octet_iterator start, octet_iterator end, u16bit_iterator result);
+   
 
-

This function is typically used to make sure a UTF-8 string is -valid before processing it with other functions. It is especially -important to call it if before doing any of the unchecked -operations on it.

-

utf8::is_valid

-

Checks whether a sequence of octets is a valid UTF-8 string.

-template <typename octet_iterator> bool -is_valid(octet_iterator start, octet_iterator end); -

start: an iterator pointing to the beginning of the -UTF-8 string to test for validity.
-end: an iterator pointing to pass-the-end of the UTF-8 -string to test for validity.
-Return value: true if the sequence is a valid -UTF-8 string; false if not.

-Example of use: +

+ start: an iterator pointing to the beginning of the UTF-8 encoded + string to convert. <br /> end: an iterator pointing to + pass-the-end of the UTF-8 encoded string to convert.
+ result: an output iterator to the place in the UTF-16 string where to + append the result of conversion.
+ Return value: An iterator pointing to the place + after the appended UTF-16 string. +

+

+ Example of use: +

-char utf_invalid[] = "\xe6\x97\xa5\xd1\x88\xfa";
-
-bool bvalid = is_valid(utf_invalid, utf_invalid + 6);
-
+char utf8_with_surrogates[] = "\xe6\x97\xa5\xd1\x88\xf0\x9d\x84\x9e";
+vector <unsigned short> utf16result;
+utf8to16(utf8_with_surrogates, utf8_with_surrogates + 9, back_inserter(utf16result));
+assert (utf16result.size() == 4);
+assert (utf16result[2] == 0xd834);
+assert (utf16result[3] == 0xdd1e);
+
+

+ In case of an invalid UTF-8 sequence, a utf8::invalid_utf8 exception is + thrown. If end does not point to the past-of-end of a UTF-8 sequence, a + utf8::not_enough_room exception is thrown. +

+

+ utf8::utf32to8 +

+

+ Converts a UTF-32 encoded string to UTF-8. +

+
+template <typename octet_iterator, typename u32bit_iterator>
+octet_iterator utf32to8 (u32bit_iterator start, u32bit_iterator end, octet_iterator result);
+   
+
+

+ start: an iterator pointing to the beginning of the UTF-32 encoded + string to convert.
+ end: an iterator pointing to pass-the-end of the UTF-32 encoded + string to convert.
+ result: an output iterator to the place in the UTF-8 string where to + append the result of conversion.
+ Return value: An iterator pointing to the place + after the appended UTF-8 string. +

+

+ Example of use: +

+
+int utf32string[] = {0x448, 0x65E5, 0x10346, 0};
+vector<unsigned char> utf8result;
+utf32to8(utf32string, utf32string + 3, back_inserter(utf8result));
+assert (utf8result.size() == 9);
+
+

+ In case of invalid UTF-32 string, a utf8::invalid_code_point exception + is thrown. +

+

+ utf8::utf8to32 +

+

+ Converts a UTF-8 encoded string to UTF-32. +

+
+template <typename octet_iterator, typename u32bit_iterator>
+u32bit_iterator utf8to32 (octet_iterator start, octet_iterator end, u32bit_iterator result);
+   
+
+

+ start: an iterator pointing to the beginning of the UTF-8 encoded + string to convert.
+ end: an iterator pointing to pass-the-end of the UTF-8 encoded string + to convert.
+ result: an output iterator to the place in the UTF-32 string where to + append the result of conversion.
+ Return value: An iterator pointing to the place + after the appended UTF-32 string. +

+

+ Example of use: +

+
+char* twochars = "\xe6\x97\xa5\xd1\x88";
+vector<int> utf32result;
+utf8to32(twochars, twochars + 5, back_inserter(utf32result));
+assert (utf32result.size() == 2);
+
+

+ In case of an invalid UTF-8 sequence, a utf8::invalid_utf8 exception is + thrown. If end does not point to the past-of-end of a UTF-8 sequence, a + utf8::not_enough_room exception is thrown. +

+

+ utf8::find_invalid +

+

+ Detects an invalid sequence within a UTF-8 string. +

+
+template <typename octet_iterator> 
+octet_iterator find_invalid(octet_iterator start, octet_iterator end);
+
+

+ start: an iterator pointing to the beginning of the UTF-8 string to + test for validity.
+ end: an iterator pointing to pass-the-end of the UTF-8 string to test + for validity.
+ Return value: an iterator pointing to the first + invalid octet in the UTF-8 string. In case none were found, equals + end. +

+

+ Example of use: +

+
+char utf_invalid[] = "\xe6\x97\xa5\xd1\x88\xfa";
+char* invalid = find_invalid(utf_invalid, utf_invalid + 6);
+assert (invalid == utf_invalid + 5);
+
+

+ This function is typically used to make sure a UTF-8 string is valid before + processing it with other functions. It is especially important to call it before + doing any of the unchecked operations on it. +

+

+ utf8::is_valid +

+

+ Checks whether a sequence of octets is a valid UTF-8 string. +

+
+template <typename octet_iterator> 
+bool is_valid(octet_iterator start, octet_iterator end);
+   
+
+

+ start: an iterator pointing to the beginning of the UTF-8 string to + test for validity.
+ end: an iterator pointing to pass-the-end of the UTF-8 string to test + for validity.
+ Return value: true if the sequence + is a valid UTF-8 string; false if not. +

+ Example of use: +
+char utf_invalid[] = "\xe6\x97\xa5\xd1\x88\xfa";
+bool bvalid = is_valid(utf_invalid, utf_invalid + 6);
 assert (bvalid == false);
 
-

is_valid is a shorthand for -find_invalid(start, end) == end;. You may want to use -it to make sure that a byte seqence is a valid UTF-8 string without -the need to know where it fails if it is not valid.

-

utf8::is_bom

-

Checks whether a sequence of three octets is a UTF-8 byte order -mark (BOM)

-template <typename octet_iterator> bool is_bom -(octet_iterator it); -

it Beginning of the 3-octet sequence to check
-Return value: true if the sequence is UTF-8 -byte order mark; false if not.

-

Example of use:

+

+ is_valid is a shorthand for find_invalid(start, end) == + end;. You may want to use it to make sure that a byte sequence is a valid + UTF-8 string without the need to know where it fails if it is not valid. +

+

+ utf8::is_bom +

+

+ Checks whether a sequence of three octets is a UTF-8 byte order mark (BOM) +

-unsigned char byte_order_mark[] = {0xef, 0xbb, 0xbf};
-
-bool bbom = is_bom(byte_order_mark);
-
-assert (bbom == true);
+template <typename octet_iterator> 
+bool is_bom (octet_iterator it);
 
-

The typical use of this function is to check the first three -bytes of a file. If they form the UTF-8 BOM, we want to skip them -before processing the actual UTF-8 encoded text.

-

Functions From utf8::unchecked Namespace

-

utf8::unchecked::append

-

Encodes a 32 bit code point as a UTF-8 sequence of octets and -appends the sequence to a UTF-8 string.

-template <typename octet_iterator> octet_iterator -append(uint32_t cp, octet_iterator result); -

cp: A 32 bit integer representing a code point to -append to the sequence.
-result: An output iterator to the place in the -sequence where to append the code point.
-Return value: An iterator pointing to the place after the -newly appended sequence.

-

Example of use:

+

+ it: beginning of the 3-octet sequence to check
+ Return value: true if the sequence + is UTF-8 byte order mark; false if not. +

+

+ Example of use: +

-unsigned char u[5] = {0,0,0,0,0};
-
-unsigned char* end = unchecked::append(0x0448, u);
-
-assert (u[0] == 0xd1 && u[1] == 0x88 && u[2] == 0 && u[3] == 0 && u[4] == 0);
+unsigned char byte_order_mark[] = {0xef, 0xbb, 0xbf};
+bool bbom = is_bom(byte_order_mark);
+assert (bbom == true);
 
-

This is a quicker but less safe version of -utf8::append. It does not check for validity of the -supplied code point, and may produce an invalid UTF-8 sequence.

-

utf8::unchecked::next

-

Given the iterator to the beginning of a UTF-8 sequence, it -returns the code point and moves the iterator to the next -position.

-template <typename octet_iterator> uint32_t -next(octet_iterator& it); -

it: a reference to an iterator pointing to the -beginning of an UTF-8 encoded code point. After the function -returns, it is incremented to point to the beginning of the next -code point.
-Return value: the 32 bit representation of the processed -UTF-8 code point.

-

Example of use:

+

+ The typical use of this function is to check the first three bytes of a file. If + they form the UTF-8 BOM, we want to skip them before processing the actual UTF-8 + encoded text. +

+

+ Functions From utf8::unchecked Namespace +

+

+ utf8::unchecked::append +

+

+ Encodes a 32 bit code point as a UTF-8 sequence of octets and appends the sequence + to a UTF-8 string. +

-char* twochars = "\xe6\x97\xa5\xd1\x88";
-char* w = twochars;
-
-int cp = unchecked::next(w);
-
-assert (cp == 0x65e5);
-assert (w == twochars + 3);
+template <typename octet_iterator>
+octet_iterator append(uint32_t cp, octet_iterator result);
+   
 
-

This is a quicker but less safe version of -utf8::next. It does not check for validity of the -supplied UTF-8 sequence.

-

utf8::unchecked::previous

-

Given a reference to an iterator pointing to an octet in a UTF-8 -seqence, it decreases the iterator until it hits the beginning of -the previous UTF-8 encoded code point and returns the 32 bits -representation of the code point.

-template <typename octet_iterator> uint32_t -previous(octet_iterator& it); -

it: a reference pointing to an octet within a UTF-8 -encoded string. After the function returns, it is decremented to -point to the beginning of the previous code point.
-Return value: the 32 bit representation of the previous code -point.

-

Example of use:

+

+ cp: A 32 bit integer representing a code point to append to the + sequence.
+ result: An output iterator to the place in the sequence where to + append the code point.
+ Return value: An iterator pointing to the place + after the newly appended sequence. +

+

+ Example of use: +

-char* twochars = "\xe6\x97\xa5\xd1\x88";
-char* w = twochars + 3;
-
-int cp = unchecked::previous (w);
-
-assert (cp == 0x65e5);
+unsigned char u[5] = {0,0,0,0,0};
+unsigned char* end = unchecked::append(0x0448, u);
+assert (u[0] == 0xd1 && u[1] == 0x88 && u[2] == 0 && u[3] == 0 && u[4] == 0);
+
+

+ This is a quicker but less safe version of utf8::append. It does not + check for validity of the supplied code point, and may produce an invalid UTF-8 + sequence. +

+

+ utf8::unchecked::next +

+

+ Given the iterator to the beginning of a UTF-8 sequence, it returns the code point + and moves the iterator to the next position. +

+
+template <typename octet_iterator>
+uint32_t next(octet_iterator& it);
+   
+
+

+ it: a reference to an iterator pointing to the beginning of an UTF-8 + encoded code point. After the function returns, it is incremented to point to the + beginning of the next code point.
+ Return value: the 32 bit representation of the + processed UTF-8 code point. +

+

+ Example of use: +

+
+char* twochars = "\xe6\x97\xa5\xd1\x88";
+char* w = twochars;
+int cp = unchecked::next(w);
+assert (cp == 0x65e5);
+assert (w == twochars + 3);
+
+

+ This is a quicker but less safe version of utf8::next. It does not + check for validity of the supplied UTF-8 sequence. +

+

+ utf8::unchecked::previous +

+

+ Given a reference to an iterator pointing to an octet in a UTF-8 sequence, it + decreases the iterator until it hits the beginning of the previous UTF-8 encoded + code point and returns the 32 bit representation of the code point. +

+
+template <typename octet_iterator>
+uint32_t previous(octet_iterator& it);
+   
+
+

+ it: a reference pointing to an octet within a UTF-8 encoded string. + After the function returns, it is decremented to point to the beginning of the + previous code point.
+ Return value: the 32 bit representation of the + previous code point. +

+

+ Example of use: +

+
+char* twochars = "\xe6\x97\xa5\xd1\x88";
+char* w = twochars + 3;
+int cp = unchecked::previous (w);
+assert (cp == 0x65e5);
 assert (w == twochars);
 
-

This is a quicker but less safe version of -utf8::previous. It does not check for validity of the -supplied UTF-8 sequence and offers no boundary checking.

-

utf8::unchecked::advance

-

Advances an iterator by the specified number of code points -within an UTF-8 sequence.

-template <typename octet_iterator, typename -distance_type> void advance (octet_iterator& it, -distance_type n); -

it: a reference to an iterator pointing to the -beginning of an UTF-8 encoded code point. After the function -returns, it is incremented to point to the nth following code -point.
-n: a positive integer that shows how many code points -we want to advance.

-

Example of use:

+

+ This is a quicker but less safe version of utf8::previous. It does not + check for validity of the supplied UTF-8 sequence and offers no boundary checking. +

+

+ utf8::unchecked::advance +

+

+ Advances an iterator by the specified number of code points within an UTF-8 + sequence. +

-char* twochars = "\xe6\x97\xa5\xd1\x88";
-char* w = twochars;
-
-unchecked::advance (w, 2);
-
-assert (w == twochars + 5);
+template <typename octet_iterator, typename distance_type>
+void advance (octet_iterator& it, distance_type n);
+   
 
-

This function works only "forward". In case of a negative -n, there is no effect.

-

This is a quicker but less safe version of -utf8::advance. It does not check for validity of the -supplied UTF-8 sequence and offers no boundary checking.

-

utf8::unchecked::distance

-

Given the iterators to two UTF-8 encoded code points in a -seqence, returns the number of code points between them.

-template <typename octet_iterator> typename -std::iterator_traits<octet_iterator>::difference_type -distance (octet_iterator first, octet_iterator last); -

first: an iterator to a beginning of a UTF-8 -encoded code point.
-last: an iterator to a "post-end" of the last UTF-8 -encoded code point in the sequence we are trying to determine the -length. It can be the beginning of a new code point, or not.
-Return value the distance between the iterators, in code -points.

-

Example of use:

+

+ it: a reference to an iterator pointing to the beginning of an UTF-8 + encoded code point. After the function returns, it is incremented to point to the + nth following code point.
+ n: a positive integer that shows how many code points we want to + advance.
+

+

+ Example of use: +

-char* twochars = "\xe6\x97\xa5\xd1\x88";
-
-size_t dist = utf8::unchecked::distance(twochars, twochars + 5);
-
-assert (dist == 2);
+char* twochars = "\xe6\x97\xa5\xd1\x88";
+char* w = twochars;
+unchecked::advance (w, 2);
+assert (w == twochars + 5);
 
-

This is a quicker but less safe version of -utf8::distance. It does not check for validity of the -supplied UTF-8 sequence.

-

utf8::unchecked::utf16to8

-

Converts a UTF-16 encoded string to UTF-8.

-template <typename u16bit_iterator, typename -octet_iterator> octet_iterator utf16to8 (u16bit_iterator start, -u16bit_iterator end, octet_iterator result); -

start: an iterator pointing to the beginning of the -UTF-16 encoded string to convert.
-end: an iterator pointing to pass-the-end of the -UTF-16 encoded string to convert.
-result: an output iterator to the place in the UTF-8 -string where to append the result of conversion. -Return value: An iterator pointing to the place after the appended UTF-8 string.

-

Example of use:

+

+ This function works only "forward". In case of a negative n, there is + no effect. +

+

+ This is a quicker but less safe version of utf8::advance. It does not + check for validity of the supplied UTF-8 sequence and offers no boundary checking. +

+

+ utf8::unchecked::distance +

+

+ Given the iterators to two UTF-8 encoded code points in a sequence, returns the + number of code points between them. +

-unsigned short utf16string[] = {0x41, 0x0448, 0x65e5, 0xd834, 0xdd1e};
-vector<unsigned char> utf8result;
-
-unchecked::utf16to8(utf16string, utf16string + 5, back_inserter(utf8result));
-
-assert (utf8result.size() == 10);    
+template <typename octet_iterator>
+typename std::iterator_traits<octet_iterator>::difference_type distance (octet_iterator first, octet_iterator last);
 
-

This is a quicker but less safe version of -utf8::utf16to8. It does not check for validity of the -supplied UTF-16 sequence.

-

utf8::unchecked::utf8to16

-

Converts an UTF-8 encoded string to UTF-16

-template <typename u16bit_iterator, typename -octet_iterator> u16bit_iterator utf8to16 (octet_iterator start, -octet_iterator end, u16bit_iterator result); -

start: an iterator pointing to the beginning of the -UTF-8 encoded string to convert. < br /> end: an -iterator pointing to pass-the-end of the UTF-8 encoded string to -convert.
-result: an output iterator to the place in the UTF-16 -string where to append the result of conversion.
-Return value: An iterator pointing to the place after the appended UTF-16 string. -

- -

Example of use:

+

+ first: an iterator to a beginning of a UTF-8 encoded code point.
+ last: an iterator to a "post-end" of the last UTF-8 encoded code + point in the sequence whose length we are trying to determine. It can be the + beginning of a new code point, or not.
+ Return value: the distance between the iterators, + in code points. +

+

+ Example of use: +

-char utf8_with_surrogates[] = "\xe6\x97\xa5\xd1\x88\xf0\x9d\x84\x9e";
-vector <unsigned short> utf16result;
-
-unchecked::utf8to16(utf8_with_surrogates, utf8_with_surrogates + 9, back_inserter(utf16result));
-
-assert (utf16result.size() == 4);
-assert (utf16result[2] == 0xd834);
-assert (utf16result[3] == 0xdd1e);
+char* twochars = "\xe6\x97\xa5\xd1\x88";
+size_t dist = utf8::unchecked::distance(twochars, twochars + 5);
+assert (dist == 2);
 
-

This is a quicker but less safe version of -utf8::utf8to16. It does not check for validity of the -supplied UTF-8 sequence.

-

utf8::unchecked::utf32to8

-

Converts a UTF-32 encoded string to UTF-8.

-template <typename octet_iterator, typename -u32bit_iterator> octet_iterator utf32to8 (u32bit_iterator start, -u32bit_iterator end, octet_iterator result); -

start: an iterator pointing to the beginning of the -UTF-32 encoded string to convert.
-end: an iterator pointing to pass-the-end of the -UTF-32 encoded string to convert.
-result: an output iterator to the place in the UTF-8 -string where to append the result of conversion.
-Return value: An iterator pointing to the place after the appended UTF-8 string. -

-

Example of use:

+

+ This is a quicker but less safe version of utf8::distance. It does not + check for validity of the supplied UTF-8 sequence. +

+

+ utf8::unchecked::utf16to8 +

+

+ Converts a UTF-16 encoded string to UTF-8. +

-int utf32string[] = {0x448, 0x65E5, 0x10346, 0};
-vector<unsigned char> utf8result;
-
-utf32to8(utf32string, utf32string + 3, back_inserter(utf8result));
-
-assert (utf8result.size() == 9);
+template <typename u16bit_iterator, typename octet_iterator>
+octet_iterator utf16to8 (u16bit_iterator start, u16bit_iterator end, octet_iterator result);
+   
 
-

This is a quicker but less safe version of -utf8::utf32to8. It does not check for validity of the -supplied UTF-32 sequence.

-

utf8::unchecked::utf8to32

-

Converts a UTF-8 encoded string to UTF-32.

-template <typename octet_iterator, typename -u32bit_iterator> u32bit_iterator utf8to32 (octet_iterator start, -octet_iterator end, u32bit_iterator result); -

start: an iterator pointing to the beginning of the -UTF-8 encoded string to convert.
-end: an iterator pointing to pass-the-end of the UTF-8 -encoded string to convert.
-result: an output iterator to the place in the UTF-32 -string where to append the result of conversion.
-Return value: An iterator pointing to the place after the appended UTF-32 string. -

-

Example of use:

+

+ start: an iterator pointing to the beginning of the UTF-16 encoded + string to convert.
+ end: an iterator pointing to past-the-end of the UTF-16 encoded + string to convert.
+ result: an output iterator to the place in the UTF-8 string where to + append the result of conversion.
+ Return value: An iterator pointing to the place + after the appended UTF-8 string. +

+

+ Example of use: +

-char* twochars = "\xe6\x97\xa5\xd1\x88";
-vector<int> utf32result;
-
-unchecked::utf8to32(twochars, twochars + 5, back_inserter(utf32result));
-
-assert (utf32result.size() == 2);
+unsigned short utf16string[] = {0x41, 0x0448, 0x65e5, 0xd834, 0xdd1e};
+vector<unsigned char> utf8result;
+unchecked::utf16to8(utf16string, utf16string + 5, back_inserter(utf8result));
+assert (utf8result.size() == 10);    
 
-

This is a quicker but less safe version of -utf8::utf8to32. It does not check for validity of the -supplied UTF-8 sequence.

-

Points of interest

-

Design goals and decisions

-

The library was designed to be:

-
    -
  1. Generic: for better or worse, there are many C++ string classes -out there, and the library should work with as many of them as -possible.
  2. -
  3. Portable: the library should be portable both accross different -platforms and compilers. The only non-portable code is a small -section that declares unsigned integers of different sizes: three -typedefs. They can be changed by the users of the library if they -don't match their platform. The default setting should work for -Windows (both 32 and 64 bit), and most 32 bit and 64 bit Unix -derivatives.
  4. -
  5. Lightweight: follow the "pay only for what you use" -guidline.
  6. -
  7. Unintrusive: avoid forcing any particular design or even -programming style on the user. This is a library, not a -framework.
  8. -
-

Alternatives

-

In case you want to look into other means of working with UTF-8 -strings from C++, here is the list of solutions I am aware of:

-
    -
  1. ICU Library. It is -very powerful, complete, feature-rich, mature, and widely used. -Also big, intrusive, non-generic, and doesn't play well with the -Standard Library. I definitelly recommend looking at ICU even if -you don't plan to use it.
  2. -
  3. Glib::ustring. -A class specifically made to work with UTF-8 strings, and also feel -like std::string. If you prefer to have yet another -string class in your code, it may be worth a look. Be aware of the -licensing issues, though.
  4. -
  5. Platform dependent solutions: Windows and POSIX have functions -to convert strings from one encoding to another. That is only a -subset of what my library offers, but if that is all you need it -may be good enough, especially given the fact that these functions -are mature and tested in production.
  6. -
-

Conclusion

-

Until Unicode becomes officially recognized by the C++ Standard -Library, we need to use other means to work with UTF-8 strings. -Template functions I describe in this article may be a good step in -this direction.

-

References

-
    -
  1. The Unicode -Consortium.
  2. -
  3. ICU Library.
  4. -
  5. UTF-8 at -Wikipedia
  6. -
  7. UTF-8 and Unicode FAQ for Unix/Linux
  8. -
- +

+ This is a quicker but less safe version of utf8::utf16to8. It does not + check for validity of the supplied UTF-16 sequence. +

+

+ utf8::unchecked::utf8to16 +

+

+ Converts a UTF-8 encoded string to UTF-16. +

+
+template <typename u16bit_iterator, typename octet_iterator>
+u16bit_iterator utf8to16 (octet_iterator start, octet_iterator end, u16bit_iterator result);
+   
+
+

+ start: an iterator pointing to the beginning of the UTF-8 encoded + string to convert.<br /> end: an iterator pointing to + past-the-end of the UTF-8 encoded string to convert.
+ result: an output iterator to the place in the UTF-16 string where to + append the result of conversion.
+ Return value: An iterator pointing to the place + after the appended UTF-16 string. +

+

+ Example of use: +

+
+char utf8_with_surrogates[] = "\xe6\x97\xa5\xd1\x88\xf0\x9d\x84\x9e";
+vector <unsigned short> utf16result;
+unchecked::utf8to16(utf8_with_surrogates, utf8_with_surrogates + 9, back_inserter(utf16result));
+assert (utf16result.size() == 4);
+assert (utf16result[2] == 0xd834);
+assert (utf16result[3] == 0xdd1e);
+
+

+ This is a quicker but less safe version of utf8::utf8to16. It does not + check for validity of the supplied UTF-8 sequence. +

+

+ utf8::unchecked::utf32to8 +

+

+ Converts a UTF-32 encoded string to UTF-8. +

+
+template <typename octet_iterator, typename u32bit_iterator>
+octet_iterator utf32to8 (u32bit_iterator start, u32bit_iterator end, octet_iterator result);
+   
+
+

+ start: an iterator pointing to the beginning of the UTF-32 encoded + string to convert.
+ end: an iterator pointing to past-the-end of the UTF-32 encoded + string to convert.
+ result: an output iterator to the place in the UTF-8 string where to + append the result of conversion.
+ Return value: An iterator pointing to the place + after the appended UTF-8 string. +

+

+ Example of use: +

+
+int utf32string[] = {0x448, 0x65e5, 0x10346, 0};
+vector<unsigned char> utf8result;
+unchecked::utf32to8(utf32string, utf32string + 3, back_inserter(utf8result));
+assert (utf8result.size() == 9);
+
+

+ This is a quicker but less safe version of utf8::utf32to8. It does not + check for validity of the supplied UTF-32 sequence. +

+

+ utf8::unchecked::utf8to32 +

+

+ Converts a UTF-8 encoded string to UTF-32. +

+
+template <typename octet_iterator, typename u32bit_iterator>
+u32bit_iterator utf8to32 (octet_iterator start, octet_iterator end, u32bit_iterator result);
+   
+
+

+ start: an iterator pointing to the beginning of the UTF-8 encoded + string to convert.
+ end: an iterator pointing to past-the-end of the UTF-8 encoded string + to convert.
+ result: an output iterator to the place in the UTF-32 string where to + append the result of conversion.
+ Return value: An iterator pointing to the place + after the appended UTF-32 string. +

+

+ Example of use: +

+
+char* twochars = "\xe6\x97\xa5\xd1\x88";
+vector<int> utf32result;
+unchecked::utf8to32(twochars, twochars + 5, back_inserter(utf32result));
+assert (utf32result.size() == 2);
+
+

+ This is a quicker but less safe version of utf8::utf8to32. It does not + check for validity of the supplied UTF-8 sequence. +

+

+ Points of interest +

+

+ Design goals and decisions +

+

+ The library was designed to be: +

+
    +
  1. + Generic: for better or worse, there are many C++ string classes out there, and + the library should work with as many of them as possible. +
  2. +
  3. + Portable: the library should be portable both across different platforms and + compilers. The only non-portable code is a small section that declares unsigned + integers of different sizes: three typedefs. They can be changed by the users of + the library if they don't match their platform. The default setting should work + for Windows (both 32 and 64 bit), and most 32 bit and 64 bit Unix derivatives. +
  4. +
  5. + Lightweight: follow the "pay only for what you use" guideline. +
  6. +
  7. + Unintrusive: avoid forcing any particular design or even programming style on the + user. This is a library, not a framework. +
  8. +
+

+ Alternatives +

+

+ In case you want to look into other means of working with UTF-8 strings from C++, + here is the list of solutions I am aware of: +

+
    +
  1. + ICU Library. It is very powerful, + complete, feature-rich, mature, and widely used. Also big, intrusive, + non-generic, and doesn't play well with the Standard Library. I definitely + recommend looking at ICU even if you don't plan to use it. +
  2. +
  3. + Glib::ustring. + A class specifically made to work with UTF-8 strings, and also feel like + std::string. If you prefer to have yet another string class in your + code, it may be worth a look. Be aware of the licensing issues, though. +
  4. +
  5. + Platform dependent solutions: Windows and POSIX have functions to convert strings + from one encoding to another. That is only a subset of what my library offers, + but if that is all you need it may be good enough, especially given the fact that + these functions are mature and tested in production. +
  6. +
+

+ Conclusion +

+

+ Until Unicode becomes officially recognized by the C++ Standard Library, we need to + use other means to work with UTF-8 strings. Template functions I describe in this + article may be a good step in this direction. +

+

+ References +

+
    +
  1. + The Unicode Consortium. +
  2. +
  3. + ICU Library. +
  4. +
  5. + UTF-8 at Wikipedia +
  6. +
  7. + UTF-8 and Unicode FAQ for + Unix/Linux +
  8. +
+ diff --git a/v2_0/doc/utf8cpp.html b/v2_0/doc/utf8cpp.html index c915572..1228c49 100644 --- a/v2_0/doc/utf8cpp.html +++ b/v2_0/doc/utf8cpp.html @@ -6,50 +6,65 @@ + UTF8-CPP: UTF-8 with C++ in a Portable Way +

The Sourceforge project page

-

- Table of Contents -

- +
+

+ Table of Contents +

+ +

Introduction

@@ -91,54 +106,64 @@ #include <iostream> #include <string> #include <vector> -using namespace std; -int main() +using namespace std; +int main() { - if (argc != 2) { - cout << "\nUsage: docsample filename\n"; - return 0; + if (argc != 2) { + cout << "\nUsage: docsample filename\n"; + return 0; } - const char* test_file_path = argv[1]; - // Open the test file (must be UTF-8 encoded) + const char* test_file_path = argv[1]; + // Open the test file (must be UTF-8 encoded) ifstream fs8(test_file_path); - if (!fs8.is_open()) { - cout << "Could not open " << test_file_path << endl; - return 0; + if (!fs8.is_open()) { + cout << "Could not open " << test_file_path << endl; + return 0; } - // Read the first line of the file - unsigned line_count = 1; + // Read the first line of the file + unsigned line_count = 1; string line; - if (!getline(fs8, line)) - return 0; - // Look for utf-8 byte-order mark at the beginning - if (line.size() > 2) { - if (utf8::is_bom(line.c_str())) - cout << "There is a byte order mark at the beginning of the file\n"; + if (!getline(fs8, line)) + return 0; + // Look for utf-8 byte-order mark at the beginning + if (line.size() > 2) { + if (utf8::is_bom(line.c_str())) + cout << "There is a byte order mark at the beginning of the file\n"; } - // Play with all the lines in the file - do { + // Play with all the lines in the file + do { // check for invalid utf-8 (for a simple yes/no check, there is also utf8::is_valid function) string::iterator end_it = utf8::find_invalid(line.begin(), line.end()); - if (end_it != line.end()) { - cout << "Invalid UTF-8 encoding detected at line " << line_count << "\n"; - cout << "This part is fine: " << string(line.begin(), end_it) << "\n"; + if (end_it != line.end()) { + cout << "Invalid UTF-8 encoding detected at line " << line_count << "\n"; + cout << "This part is fine: " << string(line.begin(), end_it) << "\n"; } - // Get the line length (at least for the valid part) - int length = utf8::distance(line.begin(), 
end_it); - cout << "Length of line " << line_count << " is " << length << "\n"; - // Convert it to utf-16 + // Get the line length (at least for the valid part) + int length = utf8::distance(line.begin(), end_it); + cout << "Length of line " << line_count << " is " << length << "\n"; + // Convert it to utf-16 vector<unsigned short> utf16line; utf8::utf8to16(line.begin(), end_it, back_inserter(utf16line)); - // And back to utf-8; + // And back to utf-8 string utf8line; utf8::utf16to8(utf16line.begin(), utf16line.end(), back_inserter(utf8line)); - // Confirm that the conversion went OK: - if (utf8line != string(line.begin(), end_it)) - cout << "Error in UTF-16 conversion at line: " << line_count << "\n"; + // Confirm that the conversion went OK: + if (utf8line != string(line.begin(), end_it)) + cout << "Error in UTF-16 conversion at line: " << line_count << "\n"; getline(fs8, line); line_count++; - } while (!fs8.eof()); - return 0; + } while (!fs8.eof()); + return 0; }

@@ -164,25 +189,35 @@ Encodes a 32 bit code point as a UTF-8 sequence of octets and appends the sequence to a UTF-8 string.

- -template <typename octet_iterator> +
+template <typename octet_iterator>
 octet_iterator append(uint32_t cp, octet_iterator result);
-     
+   
+

cp: A 32 bit integer representing a code point to append to the sequence.
- result: An output iterator to the place in the sequence where to + result: An output iterator to the place in the sequence where to append the code point.
- Return value: An iterator pointing to the place after the newly appended - sequence. + Return value: An iterator pointing to the place + after the newly appended sequence.

Example of use:

-unsigned char u[5] = {0,0,0,0,0};
-unsigned char* end = append(0x0448, u);
-assert (u[0] == 0xd1 && u[1] == 0x88 && u[2] == 0 && u[3] == 0 && u[4] == 0);
+unsigned char u[5] = {0,0,0,0,0};
+unsigned char* end = append(0x0448, u);
+assert (u[0] == 0xd1 && u[1] == 0x88 && u[2] == 0 && u[3] == 0 && u[4] == 0);
 

Note that append does not allocate any memory - it is the burden of @@ -202,26 +237,32 @@ assert (u[0] == 0xd1 && u[1] == 0x88 && u[2] == 0 && u[3 Given the iterator to the beginning of the UTF-8 sequence, it returns the code point and moves the iterator to the next position.

- template <typename octet_iterator> uint32_t next(octet_iterator& it, - octet_iterator end); +
+template <typename octet_iterator> 
+uint32_t next(octet_iterator& it, octet_iterator end);
+   
+

it: a reference to an iterator pointing to the beginning of an UTF-8 encoded code point. After the function returns, it is incremented to point to the beginning of the next code point.
- end: end of the UTF-8 sequence to be processed. If it + end: end of the UTF-8 sequence to be processed. If it gets equal to end during the extraction of a code point, an utf8::not_enough_room exception is thrown.
- Return value: the 32 bit representation of the processed UTF-8 code point. + Return value: the 32 bit representation of the + processed UTF-8 code point.

Example of use:

-char* twochars = "\xe6\x97\xa5\xd1\x88";
-char* w = twochars;
-int cp = next(w, twochars + 6);
-assert (cp == 0x65e5);
-assert (w == twochars + 3);
+char* twochars = "\xe6\x97\xa5\xd1\x88";
+char* w = twochars;
+int cp = next(w, twochars + 6);
+assert (cp == 0x65e5);
+assert (w == twochars + 3);
 

This function is typically used to iterate through a UTF-8 encoded string. @@ -238,26 +279,34 @@ assert (w == twochars + 3); decreases the iterator until it hits the beginning of the previous UTF-8 encoded code point and returns the 32 bits representation of the code point.

- template <typename octet_iterator> uint32_t previous(octet_iterator& - it, octet_iterator pass_start); +
+template <typename octet_iterator> 
+uint32_t previous(octet_iterator& it, octet_iterator pass_start);
+   
+

it: a reference pointing to an octet within a UTF-8 encoded string. After the function returns, it is decremented to point to the beginning of the previous code point.
- pass_start: an iterator to the point in the sequence where the search + pass_start: an iterator to the point in the sequence where the search for the beginning of a code point is aborted if no result was reached. It is a safety measure to prevent passing the beginning of the string in the search for a UTF-8 lead octet.
- Return value: the 32 bit representation of the previous code point. + Return value: the 32 bit representation of the + previous code point.

Example of use:

-char* twochars = "\xe6\x97\xa5\xd1\x88";
-unsigned char* w = twochars + 3;
-int cp = previous (w, twochars - 1);
-assert (cp == 0x65e5);
+char* twochars = "\xe6\x97\xa5\xd1\x88";
+char* w = twochars + 3;
+int cp = previous (w, twochars - 1);
+assert (cp == 0x65e5);
 assert (w == twochars);
 

@@ -280,15 +329,20 @@ assert (w == twochars); Advances an iterator by the specified number of code points within an UTF-8 sequence.

- template <typename octet_iterator, typename distance_type> void advance - (octet_iterator& it, distance_type n, octet_iterator end); +
+template <typename octet_iterator, typename distance_type> 
+void advance (octet_iterator& it, distance_type n, octet_iterator end);
+   
+

it: a reference to an iterator pointing to the beginning of an UTF-8 encoded code point. After the function returns, it is incremented to point to the nth following code point.
- n: a positive integer that shows how many code points we want to + n: a positive integer that shows how many code points we want to advance.
- end: end of the UTF-8 sequence to be processed. If it + end: end of the UTF-8 sequence to be processed. If it gets equal to end during the extraction of a code point, an utf8::not_enough_room exception is thrown.

@@ -296,10 +350,11 @@ assert (w == twochars); Example of use:

-char* twochars = "\xe6\x97\xa5\xd1\x88";
-unsigned char* w = twochars;
-advance (w, 2, twochars + 6);
-assert (w == twochars + 5);
+char* twochars = "\xe6\x97\xa5\xd1\x88";
+unsigned char* w = twochars;
+advance (w, 2, twochars + 6);
+assert (w == twochars + 5);
 

This function works only "forward". In case of a negative n, there is @@ -316,23 +371,29 @@ assert (w == twochars + 5); Given the iterators to two UTF-8 encoded code points in a sequence, returns the number of code points between them.

- template <typename octet_iterator> typename - std::iterator_traits<octet_iterator>::difference_type distance (octet_iterator - first, octet_iterator last); +
+template <typename octet_iterator> 
+typename std::iterator_traits<octet_iterator>::difference_type distance (octet_iterator first, octet_iterator last);
+   
+

first: an iterator to a beginning of a UTF-8 encoded code point.
- last: an iterator to a "post-end" of the last UTF-8 encoded code point - in the sequence we are trying to determine the length. It can be the beginning of a - new code point, or not.
- Return value the distance between the iterators, in code points. + last: an iterator to a "post-end" of the last UTF-8 encoded code + point in the sequence we are trying to determine the length. It can be the + beginning of a new code point, or not.
- Return value the distance between the iterators, in code points. + Return value: the distance between the iterators, + in code points.

Example of use:

-char* twochars = "\xe6\x97\xa5\xd1\x88";
-size_t dist = utf8::distance(twochars, twochars + 5);
-assert (dist == 2);
+char* twochars = "\xe6\x97\xa5\xd1\x88";
+size_t dist = utf8::distance(twochars, twochars + 5);
+assert (dist == 2);
 

This function is used to find the length (in code points) of a UTF-8 encoded @@ -352,27 +413,35 @@ assert (dist == 2);

Converts a UTF-16 encoded string to UTF-8.

- template <typename u16bit_iterator, typename octet_iterator> - octet_iterator utf16to8 (u16bit_iterator start, u16bit_iterator end, octet_iterator - result); +
+template <typename u16bit_iterator, typename octet_iterator>
+octet_iterator utf16to8 (u16bit_iterator start, u16bit_iterator end, octet_iterator result);
+   
+

start: an iterator pointing to the beginning of the UTF-16 encoded string to convert.
- end: an iterator pointing to pass-the-end of the UTF-16 encoded string - to convert.
- result: an output iterator to the place in the UTF-8 string where to + end: an iterator pointing to pass-the-end of the UTF-16 encoded + string to convert.
+ result: an output iterator to the place in the UTF-8 string where to append the result of conversion.
- Return value: An iterator pointing to the place after the appended UTF-8 - string. + Return value: An iterator pointing to the place + after the appended UTF-8 string.

Example of use:

-unsigned short utf16string[] = {0x41, 0x0448, 0x65e5, 0xd834, 0xdd1e};
-vector<unsigned char> utf8result;
-utf16to8(utf16string, utf16string + 5, back_inserter(utf8result));
-assert (utf8result.size() == 10);    
+unsigned short utf16string[] = {0x41, 0x0448, 0x65e5, 0xd834, 0xdd1e};
+vector<unsigned char> utf8result;
+utf16to8(utf16string, utf16string + 5, back_inserter(utf8result));
+assert (utf8result.size() == 10);    
 

In case of invalid UTF-16 sequence, a utf8::invalid_utf16 exception is @@ -384,28 +453,35 @@ assert (utf8result.size() == 10);

Converts an UTF-8 encoded string to UTF-16

- template <typename u16bit_iterator, typename octet_iterator> - u16bit_iterator utf8to16 (octet_iterator start, octet_iterator end, u16bit_iterator - result); +
+template <typename u16bit_iterator, typename octet_iterator>
+u16bit_iterator utf8to16 (octet_iterator start, octet_iterator end, u16bit_iterator result);
+   
+

start: an iterator pointing to the beginning of the UTF-8 encoded string to convert.
end: an iterator pointing to past-the-end of the UTF-8 encoded string to convert.
- result: an output iterator to the place in the UTF-16 string where to + result: an output iterator to the place in the UTF-16 string where to append the result of conversion.
- Return value: An iterator pointing to the place after the appended UTF-16 - string. + Return value: An iterator pointing to the place + after the appended UTF-16 string.

Example of use:

-char utf8_with_surrogates[] = "\xe6\x97\xa5\xd1\x88\xf0\x9d\x84\x9e";
-vector <unsigned short> utf16result;
-utf8to16(utf8_with_surrogates, utf8_with_surrogates + 9, back_inserter(utf16result));
-assert (utf16result.size() == 4);
-assert (utf16result[2] == 0xd834);
-assert (utf16result[3] == 0xdd1e);
+char utf8_with_surrogates[] = "\xe6\x97\xa5\xd1\x88\xf0\x9d\x84\x9e";
+vector <unsigned short> utf16result;
+utf8to16(utf8_with_surrogates, utf8_with_surrogates + 9, back_inserter(utf16result));
+assert (utf16result.size() == 4);
+assert (utf16result[2] == 0xd834);
+assert (utf16result[3] == 0xdd1e);
 

In case of an invalid UTF-8 sequence, a utf8::invalid_utf8 exception is @@ -418,27 +494,33 @@ assert (utf16result[3] == 0xdd1e);

Converts a UTF-32 encoded string to UTF-8.

- template <typename octet_iterator, typename u32bit_iterator> - octet_iterator utf32to8 (u32bit_iterator start, u32bit_iterator end, octet_iterator - result); +
+template <typename octet_iterator, typename u32bit_iterator>
+octet_iterator utf32to8 (u32bit_iterator start, u32bit_iterator end, octet_iterator result);
+   
+

start: an iterator pointing to the beginning of the UTF-32 encoded string to convert.
- end: an iterator pointing to pass-the-end of the UTF-32 encoded string - to convert.
- result: an output iterator to the place in the UTF-8 string where to + end: an iterator pointing to pass-the-end of the UTF-32 encoded + string to convert.
+ result: an output iterator to the place in the UTF-8 string where to append the result of conversion.
- Return value: An iterator pointing to the place after the appended UTF-8 - string. + Return value: An iterator pointing to the place + after the appended UTF-8 string.

Example of use:

-int utf32string[] = {0x448, 0x65E5, 0x10346, 0};
-vector<unsigned char> utf8result;
-utf32to8(utf32string, utf32string + 3, back_inserter(utf8result));
-assert (utf8result.size() == 9);
+int utf32string[] = {0x448, 0x65E5, 0x10346, 0};
+vector<unsigned char> utf8result;
+utf32to8(utf32string, utf32string + 3, back_inserter(utf8result));
+assert (utf8result.size() == 9);
 

In case of invalid UTF-32 string, a utf8::invalid_code_point exception @@ -450,27 +532,33 @@ assert (utf8result.size() == 9);

Converts a UTF-8 encoded string to UTF-32.

- template <typename octet_iterator, typename u32bit_iterator> - u32bit_iterator utf8to32 (octet_iterator start, octet_iterator end, u32bit_iterator - result); +
+template <typename octet_iterator, typename u32bit_iterator>
+u32bit_iterator utf8to32 (octet_iterator start, octet_iterator end, u32bit_iterator result);
+   
+

start: an iterator pointing to the beginning of the UTF-8 encoded string to convert.
- end: an iterator pointing to pass-the-end of the UTF-8 encoded string + end: an iterator pointing to pass-the-end of the UTF-8 encoded string to convert.
- result: an output iterator to the place in the UTF-32 string where to + result: an output iterator to the place in the UTF-32 string where to append the result of conversion.
- Return value: An iterator pointing to the place after the appended UTF-32 - string. + Return value: An iterator pointing to the place + after the appended UTF-32 string.

Example of use:

-char* twochars = "\xe6\x97\xa5\xd1\x88";
-vector<int> utf32result;
-utf8to32(twochars, twochars + 5, back_inserter(utf32result));
-assert (utf32result.size() == 2);
+char* twochars = "\xe6\x97\xa5\xd1\x88";
+vector<int> utf32result;
+utf8to32(twochars, twochars + 5, back_inserter(utf32result));
+assert (utf32result.size() == 2);
 

In case of an invalid UTF-8 sequence, a utf8::invalid_utf8 exception is @@ -483,23 +571,30 @@ assert (utf32result.size() == 2);

Detects an invalid sequence within a UTF-8 string.

- template <typename octet_iterator> octet_iterator - find_invalid(octet_iterator start, octet_iterator end); +
+template <typename octet_iterator> 
+octet_iterator find_invalid(octet_iterator start, octet_iterator end);
+

start: an iterator pointing to the beginning of the UTF-8 string to test for validity.
- end: an iterator pointing to pass-the-end of the UTF-8 string to test + end: an iterator pointing to pass-the-end of the UTF-8 string to test for validity.
- Return value: an iterator pointing to the first invalid octet in the UTF-8 - string. In case none were found, equals end. + Return value: an iterator pointing to the first + invalid octet in the UTF-8 string. In case none were found, equals + end.

Example of use:

-char utf_invalid[] = "\xe6\x97\xa5\xd1\x88\xfa";
-char* invalid = find_invalid(utf_invalid, utf_invalid + 6);
-assert (invalid == utf_invalid + 5);
+char utf_invalid[] = "\xe6\x97\xa5\xd1\x88\xfa";
+char* invalid = find_invalid(utf_invalid, utf_invalid + 6);
+assert (invalid == utf_invalid + 5);
 

This function is typically used to make sure a UTF-8 string is valid before @@ -512,20 +607,26 @@ assert (invalid == utf_invalid + 5);

Checks whether a sequence of octets is a valid UTF-8 string.

- template <typename octet_iterator> bool is_valid(octet_iterator start, - octet_iterator end); +
+template <typename octet_iterator> 
+bool is_valid(octet_iterator start, octet_iterator end);
+   
+

start: an iterator pointing to the beginning of the UTF-8 string to test for validity.
- end: an iterator pointing to pass-the-end of the UTF-8 string to test + end: an iterator pointing to pass-the-end of the UTF-8 string to test for validity.
- Return value: true if the sequence is a valid UTF-8 string; - false if not. + Return value: true if the sequence + is a valid UTF-8 string; false if not.

Example of use:
-char utf_invalid[] = "\xe6\x97\xa5\xd1\x88\xfa";
-bool bvalid = is_valid(utf_invalid, utf_invalid + 6);
+char utf_invalid[] = "\xe6\x97\xa5\xd1\x88\xfa";
+bool bvalid = is_valid(utf_invalid, utf_invalid + 6);
 assert (bvalid == false);
 

@@ -539,38 +640,42 @@ assert (bvalid == false);

Replaces all invalid UTF-8 sequences within a string with a replacement marker.

-

- template <typename octet_iterator, typename output_iterator> - output_iterator replace_invalid(octet_iterator start, octet_iterator end, - output_iterator out, uint32_t replacement); -

-

- template <typename octet_iterator, typename output_iterator> - output_iterator replace_invalid(octet_iterator start, octet_iterator end, - output_iterator out); -

+
+template <typename octet_iterator, typename output_iterator>
+output_iterator replace_invalid(octet_iterator start, octet_iterator end, output_iterator out, uint32_t replacement);
+template <typename octet_iterator, typename output_iterator>
+output_iterator replace_invalid(octet_iterator start, octet_iterator end, output_iterator out);
+   
+

start: an iterator pointing to the beginning of the UTF-8 string to look for invalid UTF-8 sequences.
- end: an iterator pointing to pass-the-end of the UTF-8 string to look + end: an iterator pointing to pass-the-end of the UTF-8 string to look for invalid UTF-8 sequences.
- out: An output iterator to the range where the result of replacement + out: An output iterator to the range where the result of replacement is stored.
- replacement: A Unicode code point for the replacement marker. The + replacement: A Unicode code point for the replacement marker. The version without this parameter assumes the value 0xfffd
- Return value: An iterator pointing to the place after the UTF-8 string with - replaced invalid sequences. + Return value: An iterator pointing to the place + after the UTF-8 string with replaced invalid sequences.

Example of use:

-char invalid_sequence[] = "a\x80\xe0\xa0\xc0\xaf\xed\xa0\x80z";
-vector<char> replace_invalid_result;
-replace_invalid (invalid_sequence, invalid_sequence + sizeof(invalid_sequence), back_inserter(replace_invalid_result), '?');
+char invalid_sequence[] = "a\x80\xe0\xa0\xc0\xaf\xed\xa0\x80z";
+vector<char> replace_invalid_result;
+replace_invalid (invalid_sequence, invalid_sequence + sizeof(invalid_sequence), back_inserter(replace_invalid_result), '?');
 bvalid = is_valid(replace_invalid_result.begin(), replace_invalid_result.end());
 assert (bvalid);
-char* fixed_invalid_sequence = "a????z";
+char* fixed_invalid_sequence = "a????z";
 assert (std::equal(replace_invalid_result.begin(), replace_invalid_result.end(), fixed_invalid_sequence));
 

@@ -589,20 +694,25 @@ assert (std::equal(replace_invalid_result.begin(), replace_invalid_result.end(),

Checks whether a sequence of three octets is a UTF-8 byte order mark (BOM)

- template <typename octet_iterator> bool is_bom (octet_iterator - it); +
+template <typename octet_iterator> 
+bool is_bom (octet_iterator it);
+

it: beginning of the 3-octet sequence to check
- Return value: true if the sequence is UTF-8 byte order mark; - false if not. + Return value: true if the sequence + is UTF-8 byte order mark; false if not.

Example of use:

-unsigned char byte_order_mark[] = {0xef, 0xbb, 0xbf};
-bool bbom = is_bom(byte_order_mark);
-assert (bbom == true);
+unsigned char byte_order_mark[] = {0xef, 0xbb, 0xbf};
+bool bbom = is_bom(byte_order_mark);
+assert (bbom == true);
 

The typical use of this function is to check the first three bytes of a file. If @@ -619,23 +729,35 @@ assert (bbom == true); Encodes a 32 bit code point as a UTF-8 sequence of octets and appends the sequence to a UTF-8 string.

- template <typename octet_iterator> octet_iterator append(uint32_t cp, - octet_iterator result); +
+template <typename octet_iterator>
+octet_iterator append(uint32_t cp, octet_iterator result);
+   
+

cp: A 32 bit integer representing a code point to append to the sequence.
- result: An output iterator to the place in the sequence where to + result: An output iterator to the place in the sequence where to append the code point.
- Return value: An iterator pointing to the place after the newly appended - sequence. + Return value: An iterator pointing to the place + after the newly appended sequence.

Example of use:

-unsigned char u[5] = {0,0,0,0,0};
-unsigned char* end = unchecked::append(0x0448, u);
-assert (u[0] == 0xd1 && u[1] == 0x88 && u[2] == 0 && u[3] == 0 && u[4] == 0);
+unsigned char u[5] = {0,0,0,0,0};
+unsigned char* end = unchecked::append(0x0448, u);
+assert (u[0] == 0xd1 && u[1] == 0x88 && u[2] == 0 && u[3] == 0 && u[4] == 0);
 

This is a quicker but less safe version of utf8::append. It does not @@ -649,23 +771,29 @@ assert (u[0] == 0xd1 && u[1] == 0x88 && u[2] == 0 && u[3 Given the iterator to the beginning of a UTF-8 sequence, it returns the code point and moves the iterator to the next position.

- template <typename octet_iterator> uint32_t next(octet_iterator& - it); +
+template <typename octet_iterator>
+uint32_t next(octet_iterator& it);
+   
+

it: a reference to an iterator pointing to the beginning of an UTF-8 encoded code point. After the function returns, it is incremented to point to the beginning of the next code point.
- Return value: the 32 bit representation of the processed UTF-8 code point. + Return value: the 32 bit representation of the + processed UTF-8 code point.

Example of use:

-char* twochars = "\xe6\x97\xa5\xd1\x88";
-char* w = twochars;
-int cp = unchecked::next(w);
-assert (cp == 0x65e5);
-assert (w == twochars + 3);
+char* twochars = "\xe6\x97\xa5\xd1\x88";
+char* w = twochars;
+int cp = unchecked::next(w);
+assert (cp == 0x65e5);
+assert (w == twochars + 3);
 

This is a quicker but less safe version of utf8::next. It does not @@ -679,22 +807,28 @@ assert (w == twochars + 3); decreases the iterator until it hits the beginning of the previous UTF-8 encoded code point and returns the 32 bits representation of the code point.

- template <typename octet_iterator> uint32_t previous(octet_iterator& - it); +
+template <typename octet_iterator>
+uint32_t previous(octet_iterator& it);
+   
+

it: a reference pointing to an octet within a UTF-8 encoded string. After the function returns, it is decremented to point to the beginning of the previous code point.
- Return value: the 32 bit representation of the previous code point. + Return value: the 32 bit representation of the + previous code point.

Example of use:

-char* twochars = "\xe6\x97\xa5\xd1\x88";
-char* w = twochars + 3;
-int cp = unchecked::previous (w);
-assert (cp == 0x65e5);
+char* twochars = "\xe6\x97\xa5\xd1\x88";
+char* w = twochars + 3;
+int cp = unchecked::previous (w);
+assert (cp == 0x65e5);
 assert (w == twochars);
 

@@ -708,23 +842,28 @@ assert (w == twochars); Advances an iterator by the specified number of code points within an UTF-8 sequence.

- template <typename octet_iterator, typename distance_type> void advance - (octet_iterator& it, distance_type n); +
+template <typename octet_iterator, typename distance_type>
+void advance (octet_iterator& it, distance_type n);
+   
+

it: a reference to an iterator pointing to the beginning of an UTF-8 encoded code point. After the function returns, it is incremented to point to the nth following code point.
- n: a positive integer that shows how many code points we want to + n: a positive integer that shows how many code points we want to advance.

Example of use:

-char* twochars = "\xe6\x97\xa5\xd1\x88";
-char* w = twochars;
-unchecked::advance (w, 2);
-assert (w == twochars + 5);
+char* twochars = "\xe6\x97\xa5\xd1\x88";
+char* w = twochars;
+unchecked::advance (w, 2);
+assert (w == twochars + 5);
 

This function works only "forward". In case of a negative n, there is @@ -741,23 +880,29 @@ assert (w == twochars + 5); Given the iterators to two UTF-8 encoded code points in a sequence, returns the number of code points between them.

- template <typename octet_iterator> typename - std::iterator_traits<octet_iterator>::difference_type distance (octet_iterator - first, octet_iterator last); +
+template <typename octet_iterator>
+typename std::iterator_traits<octet_iterator>::difference_type distance (octet_iterator first, octet_iterator last);
+

first: an iterator to a beginning of a UTF-8 encoded code point.
- last: an iterator to a "post-end" of the last UTF-8 encoded code point - in the sequence we are trying to determine the length. It can be the beginning of a - new code point, or not.
- Return value the distance between the iterators, in code points. + last: an iterator to a "post-end" of the last UTF-8 encoded code + point in the sequence we are trying to determine the length. It can be the + beginning of a new code point, or not.
- Return value the distance between the iterators, in code points. + Return value: the distance between the iterators, + in code points.

Example of use:

-char* twochars = "\xe6\x97\xa5\xd1\x88";
-size_t dist = utf8::unchecked::distance(twochars, twochars + 5);
-assert (dist == 2);
+char* twochars = "\xe6\x97\xa5\xd1\x88";
+size_t dist = utf8::unchecked::distance(twochars, twochars + 5);
+assert (dist == 2);
 

This is a quicker but less safe version of utf8::distance. It does not @@ -769,26 +914,35 @@ assert (dist == 2);

Converts a UTF-16 encoded string to UTF-8.

- template <typename u16bit_iterator, typename octet_iterator> - octet_iterator utf16to8 (u16bit_iterator start, u16bit_iterator end, octet_iterator - result); +
+template <typename u16bit_iterator, typename octet_iterator>
+octet_iterator utf16to8 (u16bit_iterator start, u16bit_iterator end, octet_iterator result);
+   
+

start: an iterator pointing to the beginning of the UTF-16 encoded string to convert.
- end: an iterator pointing to pass-the-end of the UTF-16 encoded string - to convert.
- result: an output iterator to the place in the UTF-8 string where to - append the result of conversion. Return value: An iterator pointing to the - place after the appended UTF-8 string. + end: an iterator pointing to pass-the-end of the UTF-16 encoded + string to convert.
+ result: an output iterator to the place in the UTF-8 string where to + append the result of conversion.
+ Return value: An iterator pointing to the place + after the appended UTF-8 string.

Example of use:

-unsigned short utf16string[] = {0x41, 0x0448, 0x65e5, 0xd834, 0xdd1e};
-vector<unsigned char> utf8result;
-unchecked::utf16to8(utf16string, utf16string + 5, back_inserter(utf8result));
-assert (utf8result.size() == 10);    
+unsigned short utf16string[] = {0x41, 0x0448, 0x65e5, 0xd834, 0xdd1e};
+vector<unsigned char> utf8result;
+unchecked::utf16to8(utf16string, utf16string + 5, back_inserter(utf8result));
+assert (utf8result.size() == 10);    
 

This is a quicker but less safe version of utf8::utf16to8. It does not @@ -800,28 +954,35 @@ assert (utf8result.size() == 10);

Converts an UTF-8 encoded string to UTF-16

- template <typename u16bit_iterator, typename octet_iterator> - u16bit_iterator utf8to16 (octet_iterator start, octet_iterator end, u16bit_iterator - result); +
+template <typename u16bit_iterator, typename octet_iterator>
+u16bit_iterator utf8to16 (octet_iterator start, octet_iterator end, u16bit_iterator result);
+   
+

start: an iterator pointing to the beginning of the UTF-8 encoded string to convert.
end: an iterator pointing to past-the-end of the UTF-8 encoded string to convert.
- result: an output iterator to the place in the UTF-16 string where to + result: an output iterator to the place in the UTF-16 string where to append the result of conversion.
- Return value: An iterator pointing to the place after the appended UTF-16 - string. + Return value: An iterator pointing to the place + after the appended UTF-16 string.

Example of use:

-char utf8_with_surrogates[] = "\xe6\x97\xa5\xd1\x88\xf0\x9d\x84\x9e";
-vector <unsigned short> utf16result;
-unchecked::utf8to16(utf8_with_surrogates, utf8_with_surrogates + 9, back_inserter(utf16result));
-assert (utf16result.size() == 4);
-assert (utf16result[2] == 0xd834);
-assert (utf16result[3] == 0xdd1e);
+char utf8_with_surrogates[] = "\xe6\x97\xa5\xd1\x88\xf0\x9d\x84\x9e";
+vector <unsigned short> utf16result;
+unchecked::utf8to16(utf8_with_surrogates, utf8_with_surrogates + 9, back_inserter(utf16result));
+assert (utf16result.size() == 4);
+assert (utf16result[2] == 0xd834);
+assert (utf16result[3] == 0xdd1e);
 

This is a quicker but less safe version of utf8::utf8to16. It does not @@ -833,27 +994,34 @@ assert (utf16result[3] == 0xdd1e);

Converts a UTF-32 encoded string to UTF-8.

- template <typename octet_iterator, typename u32bit_iterator> - octet_iterator utf32to8 (u32bit_iterator start, u32bit_iterator end, octet_iterator - result); +
+template <typename octet_iterator, typename u32bit_iterator>
+octet_iterator utf32to8 (u32bit_iterator start, u32bit_iterator end, octet_iterator result);
+   
+

start: an iterator pointing to the beginning of the UTF-32 encoded string to convert.
- end: an iterator pointing to pass-the-end of the UTF-32 encoded string - to convert.
- result: an output iterator to the place in the UTF-8 string where to + end: an iterator pointing to pass-the-end of the UTF-32 encoded + string to convert.
+ result: an output iterator to the place in the UTF-8 string where to append the result of conversion.
- Return value: An iterator pointing to the place after the appended UTF-8 - string. + Return value: An iterator pointing to the place + after the appended UTF-8 string.

Example of use:

-int utf32string[] = {0x448, 0x65E5, 0x10346, 0};
-vector<unsigned char> utf8result;
-utf32to8(utf32string, utf32string + 3, back_inserter(utf8result));
-assert (utf8result.size() == 9);
+int utf32string[] = {0x448, 0x65e5, 0x10346, 0};
+vector<unsigned char> utf8result;
+utf32to8(utf32string, utf32string + 3, back_inserter(utf8result));
+assert (utf8result.size() == 9);
 

This is a quicker but less safe version of utf8::utf32to8. It does not @@ -865,27 +1033,32 @@ assert (utf8result.size() == 9);

Converts a UTF-8 encoded string to UTF-32.

- template <typename octet_iterator, typename u32bit_iterator> - u32bit_iterator utf8to32 (octet_iterator start, octet_iterator end, u32bit_iterator - result); +
+template <typename octet_iterator, typename u32bit_iterator>
+u32bit_iterator utf8to32 (octet_iterator start, octet_iterator end, u32bit_iterator result);
+   
+

start: an iterator pointing to the beginning of the UTF-8 encoded string to convert.
- end: an iterator pointing to pass-the-end of the UTF-8 encoded string + end: an iterator pointing to pass-the-end of the UTF-8 encoded string to convert.
- result: an output iterator to the place in the UTF-32 string where to + result: an output iterator to the place in the UTF-32 string where to append the result of conversion.
- Return value: An iterator pointing to the place after the appended UTF-32 - string. + Return value: An iterator pointing to the place + after the appended UTF-32 string.

Example of use:

-char* twochars = "\xe6\x97\xa5\xd1\x88";
-vector<int> utf32result;
-unchecked::utf8to32(twochars, twochars + 5, back_inserter(utf32result));
-assert (utf32result.size() == 2);
+char* twochars = "\xe6\x97\xa5\xd1\x88";
+vector<int> utf32result;
+unchecked::utf8to32(twochars, twochars + 5, back_inserter(utf32result));
+assert (utf32result.size() == 2);
 

This is a quicker but less safe version of utf8::utf8to32. It does not diff --git a/v2_0/source/utf8/checked.h b/v2_0/source/utf8/checked.h index 980be27..4647016 100644 --- a/v2_0/source/utf8/checked.h +++ b/v2_0/source/utf8/checked.h @@ -29,7 +29,7 @@ DEALINGS IN THE SOFTWARE. #define UTF8_FOR_CPP_CHECKED_H_2675DCD0_9480_4c0c_B92A_CC14C027B731 #include "core.h" -#include +#include namespace utf8 { @@ -152,7 +152,18 @@ namespace utf8 return cp; } + template + uint32_t prior(octet_iterator& it, octet_iterator start) + { + octet_iterator end = it; + while (internal::is_trail(*(--it))) + if (it < start) + throw invalid_utf8(*it); // error - no lead byte in the sequence + octet_iterator temp = it; + return next(temp, end); + } + /// Deprecated in versions that include "prior" template uint32_t previous(octet_iterator& it, octet_iterator pass_start) { @@ -240,37 +251,50 @@ namespace utf8 // The iterator class template class iterator { - static const typename std::iterator_traits::difference_type MAX_UTF8_SEQUENCE_LENGTH = 4; octet_iterator it; + octet_iterator range_start; + octet_iterator range_end; public: - explicit iterator (const octet_iterator& octet_it) : it(octet_it) {} + explicit iterator (const octet_iterator& octet_it, + const octet_iterator& range_start, + const octet_iterator& range_end) : + it(octet_it), range_start(range_start), range_end(range_end) + { + if (it < range_start || it > range_end) + throw std::out_of_range("Invalid utf-8 iterator position"); + } // the default "big three" are OK uint32_t operator * () const { octet_iterator temp = it; - return next(temp, temp + MAX_UTF8_SEQUENCE_LENGTH); + return next(temp, range_end); + } + bool operator == (const iterator& rhs) const + { + if (range_start != rhs.range_start && range_end != rhs.range_end) + throw std::logic_error("Comparing utf-8 iterators defined with different ranges"); + return (it == rhs.it); } - bool operator == (const iterator& rhs) const { return (it == rhs.it); } iterator& operator 
++ () { - next(it, it + MAX_UTF8_SEQUENCE_LENGTH); + next(it, range_end); return *this; } iterator operator ++ (int) { iterator temp = *this; - next(it, it + MAX_UTF8_SEQUENCE_LENGTH); + next(it, range_end); return temp; } iterator& operator -- () { - previous(it, it - MAX_UTF8_SEQUENCE_LENGTH); + prior(it, range_start); return *this; } iterator operator -- (int) { iterator temp = *this; - previous(it, it - MAX_UTF8_SEQUENCE_LENGTH); + prior(it, range_start); return temp; } }; // class iterator diff --git a/v2_0/source/utf8/unchecked.h b/v2_0/source/utf8/unchecked.h index 75c882d..ac019d9 100644 --- a/v2_0/source/utf8/unchecked.h +++ b/v2_0/source/utf8/unchecked.h @@ -88,13 +88,20 @@ namespace utf8 } template - uint32_t previous(octet_iterator& it) + uint32_t prior(octet_iterator& it) { while (internal::is_trail(*(--it))) ; octet_iterator temp = it; return next(temp); } + // Deprecated in versions that include prior, but only for the sake of consistency (see utf8::previous) + template + inline uint32_t previous(octet_iterator& it) + { + return prior(it); + } + template void advance (octet_iterator& it, distance_type n) { diff --git a/v2_0/test_drivers/smoke_test/test.cpp b/v2_0/test_drivers/smoke_test/test.cpp index 77154ed..880aca3 100644 --- a/v2_0/test_drivers/smoke_test/test.cpp +++ b/v2_0/test_drivers/smoke_test/test.cpp @@ -22,6 +22,8 @@ int main() end = append(0x10346, u); assert (u[0] == 0xf0 && u[1] == 0x90 && u[2] == 0x8d && u[3] == 0x86 && u[4] == 0); + + //next char* twochars = "\xe6\x97\xa5\xd1\x88"; char* w = twochars; @@ -41,8 +43,24 @@ int main() assert (cp == 0x0448); assert (w == threechars + 9); + //prior + w = twochars + 3; + cp = prior (w, twochars); + assert (cp == 0x65e5); + assert (w == twochars); - //previous + w = threechars + 9; + cp = prior(w, threechars); + assert (cp == 0x0448); + assert (w == threechars + 7); + cp = prior(w, threechars); + assert (cp == 0x65e5); + assert (w == threechars + 4); + cp = prior(w, threechars); + assert (cp 
== 0x10346); + assert (w == threechars); + + //previous (deprecated) w = twochars + 3; cp = previous (w, twochars - 1); assert (cp == 0x65e5); @@ -131,19 +149,19 @@ int main() assert (std::equal(replace_invalid_result.begin(), replace_invalid_result.end(), fixed_invalid_sequence)); // iterator - utf8::iterator it(threechars); + utf8::iterator it(threechars, threechars, threechars + 9); utf8::iterator it2 = it; assert (it2 == it); assert (*it == 0x10346); assert (*(++it) == 0x65e5); assert ((*it++) == 0x65e5); assert (*it == 0x0448); - utf8::iterator endit (threechars + 9); + utf8::iterator endit (threechars + 9, threechars, threechars + 9); assert (++it == endit); assert (*(--it) == 0x0448); assert ((*it--) == 0x0448); assert (*it == 0x65e5); - assert (--it == utf8::iterator(threechars)); + assert (--it == utf8::iterator(threechars, threechars, threechars + 9)); assert (*it == 0x10346); ////////////////////////////////////////////////////////// @@ -179,7 +197,8 @@ int main() assert (w == threechars + 9); - //previous + //previous (calls prior internally) + w = twochars + 3; cp = unchecked::previous (w); assert (cp == 0x65e5);