From d2ee7164b6948fa38ac40bfe3d65983221979a3e Mon Sep 17 00:00:00 2001 From: ntrifunovic Date: Sat, 28 Oct 2006 16:25:52 +0000 Subject: [PATCH] Added the first version of the iterator to the code. Started upgrading the html documentation git-svn-id: http://svn.code.sf.net/p/utfcpp/code@65 a809a056-fc17-0410-9590-b4f493f8b08e --- v2_0/doc/utf8cpp.html | 1499 ++++++++++++++----------- v2_0/source/utf8/checked.h | 47 +- v2_0/test_drivers/smoke_test/test.cpp | 17 +- 3 files changed, 917 insertions(+), 646 deletions(-) diff --git a/v2_0/doc/utf8cpp.html b/v2_0/doc/utf8cpp.html index e6ba254..c915572 100644 --- a/v2_0/doc/utf8cpp.html +++ b/v2_0/doc/utf8cpp.html @@ -1,97 +1,131 @@ - + - - - - -UTF8-CPP: UTF-8 with C++ in a Portable Way - - -

The Sourceforge project page

-

Table of Contents

- -

Introduction

-

Many C++ developers miss an easy and portable way of handling -Unicode encoded strings. C++ Standard is currently Unicode -agnostic, and while some work is being done to introduce Unicode to -the next incarnation called C++0x, for the moment nothing of the -sort is available. In the meantime, developers use 3rd party -libraries like ICU, OS specific capabilities, or simply roll out -their own solutions.

-

In order to easily handle UTF-8 encoded Unicode strings, I have -come up with a set of template functions. For anybody used to work -with STL algorithms, they should be easy and natural to use. The -code is freely available for any purpose - check out the license at -the beginning of the utf8.h file. Be aware, though, that while I -did some testing, this library has not been used in production yet. -If you run into bugs or performance issues, please let me know and -I'll do my best to address them.

-

The purpose of this article is not to offer an introduction to -Unicode in general, and UTF-8 in particular. If you are not -familiar with Unicode, be sure to check out Unicode Home Page or some other -source of information for Unicode. Also, it is not my aim to -advocate the use of UTF-8 encoded strings in C++ programs; if you -want to handle UTF-8 encoded strings from C++, I am sure you have -good reasons for it.

-

Examples of use

-

To illustrate the use of this utf8 library, we shall open a file -containing UTF-8 encoded text, check whether it starts with a byte order mark, -read each line into a std::string, check it for validity, convert the text to UTF-16, -and back to UTF-8:

-
-#include <fstream>
-#include <iostream>
-#include <string>
-#include <vector>
-using namespace std;
-
-int main()
-{
-    if (argc != 2) {
-        cout << "\nUsage: docsample filename\n";
-        return 0;
+  
+    
+    
+    
+    
+      UTF8-CPP: UTF-8 with C++ in a Portable Way
+    
+    
+  
+  
+    

+ The Sourceforge project page +

+

+ Table of Contents +

+ +

+ Introduction +

+

+ Many C++ developers miss an easy and portable way of handling Unicode encoded + strings. C++ Standard is currently Unicode agnostic, and while some work is being + done to introduce Unicode to the next incarnation called C++0x, for the moment + nothing of the sort is available. In the meantime, developers use 3rd party + libraries like ICU, OS specific capabilities, or simply roll out their own + solutions. +

+

+ In order to easily handle UTF-8 encoded Unicode strings, I have come up with a set + of template functions. For anybody used to working with STL algorithms, they should be + easy and natural to use. The code is freely available for any purpose - check out + the license at the beginning of the utf8.h file. Be aware, though, that while I did + some testing, this library has not been used in production yet. If you run into + bugs or performance issues, please let me know and I'll do my best to address them. +

+

+ The purpose of this article is not to offer an introduction to Unicode in general, + and UTF-8 in particular. If you are not familiar with Unicode, be sure to check out + Unicode Home Page or some other source of + information for Unicode. Also, it is not my aim to advocate the use of UTF-8 + encoded strings in C++ programs; if you want to handle UTF-8 encoded strings from + C++, I am sure you have good reasons for it. +

+

+ Examples of use +

+

+ To illustrate the use of this utf8 library, we shall open a file containing UTF-8 + encoded text, check whether it starts with a byte order mark, read each line into a + std::string, check it for validity, convert the text to UTF-16, and + back to UTF-8: +

+
+#include <fstream>
+#include <iostream>
+#include <string>
+#include <vector>
+using namespace std;
+int main(int argc, char** argv)
+{
+    if (argc != 2) {
+        cout << "\nUsage: docsample filename\n";
+        return 0;
+    }
+    const char* test_file_path = argv[1];
     // Open the test file (must be UTF-8 encoded)
     ifstream fs8(test_file_path);
-    if (!fs8.is_open()) {
+    if (!fs8.is_open()) {
     cout << "Could not open " << test_file_path << endl;
-    return 0;
+    return 0;
     }
-
     // Read the first line of the file
-    unsigned line_count = 1;
+    unsigned line_count = 1;
     string line;
-    if (!getline(fs8, line)) 
-        return 0;
-
+    if (!getline(fs8, line)) 
+        return 0;
     // Look for utf-8 byte-order mark at the beginning
-    if (line.size() > 2) {
-        if (utf8::is_bom(line.c_str()))
-          cout << "There is a byte order mark at the beginning of the file\n";
+    if (line.size() > 2) {
+        if (utf8::is_bom(line.c_str()))
+            cout << "There is a byte order mark at the beginning of the file\n";
     }
-
     // Play with all the lines in the file
-    do {
+    do {
         // check for invalid utf-8 (for a simple yes/no check, there is also utf8::is_valid function)
         string::iterator end_it = utf8::find_invalid(line.begin(), line.end());
-        if (end_it != line.end()) {
+        if (end_it != line.end()) {
             cout << "Invalid UTF-8 encoding detected at line " << line_count << "\n";
             cout << "This part is fine: " << string(line.begin(), end_it) << "\n";
         }
         // Get the line length (at least for the valid part)
-        int length = utf8::distance(line.begin(), end_it);
+        int length = utf8::distance(line.begin(), end_it);
         cout << "Length of line " << line_count << " is " << length <<  "\n";
-
         // Convert it to utf-16
         vector<unsigned short> utf16line;
         utf8::utf8to16(line.begin(), end_it, back_inserter(utf16line));
@@ -99,663 +133,846 @@ int main()
         string utf8line; 
         utf8::utf16to8(utf16line.begin(), utf16line.end(), back_inserter(utf8line));
         // Confirm that the conversion went OK:
-        if (utf8line != string(line.begin(), end_it))
+        if (utf8line != string(line.begin(), end_it))
             cout << "Error in UTF-16 conversion at line: " << line_count << "\n";        
-
         getline(fs8, line);
         line_count++;
-    } while (!fs8.eof());
-
-    return 0;
+    } while (!fs8.eof());
+    return 0;
 }
 
-

In the previous code sample, we have seen the use of the following functions -from utf8 namespace: first we used is_bom -function to detect UTF-8 byte order mark at the beginning of the -file; then for each line we performed a detection of invalid UTF-8 sequences with find_invalid; -the number of characters (more precisely - the number of Unicode code points) in each line was determined -with a use of utf8::distance; finally, we have converted each line to UTF-16 encoding with -utf8to16 and back to UTF-8 with utf16to8. -

-

Reference

- -

Functions From utf8 Namespace

-

utf8::append

-

Encodes a 32 bit code point as a UTF-8 sequence of octets and -appends the sequence to a UTF-8 string.

-template <typename octet_iterator> octet_iterator -append(uint32_t cp, octet_iterator result); -

cp: A 32 bit integer representing a code point to -append to the sequence.
-result: An output iterator to the place in the -sequence where to append the code point.
-Return value: An iterator pointing to the place after the -newly appended sequence.

-

Example of use:

+

+ In the previous code sample, we have seen the use of the following functions from + utf8 namespace: first we used is_bom function to detect + UTF-8 byte order mark at the beginning of the file; then for each line we performed + a detection of invalid UTF-8 sequences with find_invalid; the number + of characters (more precisely - the number of Unicode code points) in each line was + determined with a use of utf8::distance; finally, we have converted + each line to UTF-16 encoding with utf8to16 and back to UTF-8 with + utf16to8. +

+

+ Reference +

+

+ Functions From utf8 Namespace +

+

+ utf8::append +

+

+ Encodes a 32 bit code point as a UTF-8 sequence of octets and appends the sequence + to a UTF-8 string. +

+ +template <typename octet_iterator> +octet_iterator append(uint32_t cp, octet_iterator result); + +

+ cp: A 32 bit integer representing a code point to append to the + sequence.
+ result: An output iterator to the place in the sequence where to + append the code point.
+ Return value: An iterator pointing to the place after the newly appended + sequence. +

+

+ Example of use: +

-unsigned char u[5] = {0,0,0,0,0};
-
-unsigned char* end = append(0x0448, u);
-
+unsigned char u[5] = {0,0,0,0,0};
+unsigned char* end = append(0x0448, u);
 assert (u[0] == 0xd1 && u[1] == 0x88 && u[2] == 0 && u[3] == 0 && u[4] == 0);
 
-

Note that append does not allocate any memory - it -is the burden of the caller to make sure there is enough memory -allocated for the operation. To make things more interesting, -append can add anywhere between 1 and 4 octets to the -sequence. In practice, you would most often want to use -std::back_inserter to ensure that the necessary memory -is allocated.

-

In case of an invalid code point, a -utf8::invalid_code_point exception is thrown.

-

utf8::next

-

Given the iterator to the beginning of the UTF-8 sequence, it -returns the code point and moves the iterator to the next -position.

-template <typename octet_iterator> uint32_t -next(octet_iterator& it, octet_iterator end); -

it: a reference to an iterator pointing to the -beginning of an UTF-8 encoded code point. After the function -returns, it is incremented to point to the beginning of the next -code point.
-end: end of the UTF-8 sequence to be processed. If -it gets equal to end during the -extraction of a code point, an utf8::not_enough_room -exception is thrown.
-Return value: the 32 bit representation of the processed -UTF-8 code point.

-

Example of use:

+

+ Note that append does not allocate any memory - it is the burden of + the caller to make sure there is enough memory allocated for the operation. To make + things more interesting, append can add anywhere between 1 and 4 + octets to the sequence. In practice, you would most often want to use + std::back_inserter to ensure that the necessary memory is allocated. +

+

+ In case of an invalid code point, a utf8::invalid_code_point exception + is thrown. +

+

+ utf8::next +

+

+ Given the iterator to the beginning of the UTF-8 sequence, it returns the code + point and moves the iterator to the next position. +

+ template <typename octet_iterator> uint32_t next(octet_iterator& it, + octet_iterator end); +

+ it: a reference to an iterator pointing to the beginning of an UTF-8 + encoded code point. After the function returns, it is incremented to point to the + beginning of the next code point.
+ end: end of the UTF-8 sequence to be processed. If it + gets equal to end during the extraction of a code point, an + utf8::not_enough_room exception is thrown.
+ Return value: the 32 bit representation of the processed UTF-8 code point. +

+

+ Example of use: +

-char* twochars = "\xe6\x97\xa5\xd1\x88";
-char* w = twochars;
-
-int cp = next(w, twochars + 6);
-
+char* twochars = "\xe6\x97\xa5\xd1\x88";
+char* w = twochars;
+int cp = next(w, twochars + 6);
 assert (cp == 0x65e5);
 assert (w == twochars + 3);
 
-

This function is typically used to iterate through a UTF-8 -encoded string.

-

In case of an invalid UTF-8 seqence, a -utf8::invalid_utf8 exception is thrown.

-

utf8::previous

-

Given a reference to an iterator pointing to an octet in a UTF-8 -seqence, it decreases the iterator until it hits the beginning of -the previous UTF-8 encoded code point and returns the 32 bits -representation of the code point.

-template <typename octet_iterator> uint32_t -previous(octet_iterator& it, octet_iterator pass_start); -

it: a reference pointing to an octet within a UTF-8 -encoded string. After the function returns, it is decremented to -point to the beginning of the previous code point.
-pass_start: an iterator to the point in the sequence -where the search for the beginning of a code point is aborted if no -result was reached. It is a safety measure to prevent passing the -beginning of the string in the search for a UTF-8 lead octet.
-Return value: the 32 bit representation of the previous code -point.

-

Example of use:

+

+ This function is typically used to iterate through a UTF-8 encoded string. +

+

+ In case of an invalid UTF-8 sequence, a utf8::invalid_utf8 exception is + thrown. +

+

+ utf8::previous +

+

+ Given a reference to an iterator pointing to an octet in a UTF-8 sequence, it + decreases the iterator until it hits the beginning of the previous UTF-8 encoded + code point and returns the 32 bit representation of the code point. +

+ template <typename octet_iterator> uint32_t previous(octet_iterator& + it, octet_iterator pass_start); +

+ it: a reference pointing to an octet within a UTF-8 encoded string. + After the function returns, it is decremented to point to the beginning of the + previous code point.
+ pass_start: an iterator to the point in the sequence where the search + for the beginning of a code point is aborted if no result was reached. It is a + safety measure to prevent passing the beginning of the string in the search for a + UTF-8 lead octet.
+ Return value: the 32 bit representation of the previous code point. +

+

+ Example of use: +

-char* twochars = "\xe6\x97\xa5\xd1\x88";
-unsigned char* w = twochars + 3;
-
-int cp = previous (w, twochars - 1);
-
+char* twochars = "\xe6\x97\xa5\xd1\x88";
+unsigned char* w = twochars + 3;
+int cp = previous (w, twochars - 1);
 assert (cp == 0x65e5);
 assert (w == twochars);
 
-

The primary purpose of this function is to iterate backwards -through a UTF-8 encoded string. Therefore, it will -typically point to the beginning of a code point, and -pass_start will point to the octet just before the -beginning of the string to ensure we don't go backwards too far. -it is decreased until it points to a lead UTF-8 octet, -and then the UTF-8 sequence beginning with that octet is decoded to -a 32 bit representation and returned.

-

In case pass_end is reached before a UTF-8 lead -octet is hit, or if an invalid UTF-8 sequence is started by the -lead octet, an invalid_utf8 exception is thrown

-

utf8::advance

-

Advances an iterator by the specified number of code points -within an UTF-8 sequence.

-template <typename octet_iterator, typename -distance_type> void advance (octet_iterator& it, -distance_type n, octet_iterator end); -

it: a reference to an iterator pointing to the -beginning of an UTF-8 encoded code point. After the function -returns, it is incremented to point to the nth following code -point.
-n: a positive integer that shows how many code points -we want to advance.
-end: end of the UTF-8 sequence to be processed. If -it gets equal to end during the -extraction of a code point, an utf8::not_enough_room -exception is thrown.

-

Example of use:

+

+ The primary purpose of this function is to iterate backwards through a UTF-8 + encoded string. Therefore, it will typically point to the beginning of + a code point, and pass_start will point to the octet just before the + beginning of the string to ensure we don't go backwards too far. it is + decreased until it points to a lead UTF-8 octet, and then the UTF-8 sequence + beginning with that octet is decoded to a 32 bit representation and returned. +

+

+ In case pass_start is reached before a UTF-8 lead octet is hit, or if an + invalid UTF-8 sequence is started by the lead octet, an invalid_utf8 + exception is thrown. +

+

+ utf8::advance +

+

+ Advances an iterator by the specified number of code points within an UTF-8 + sequence. +

+ template <typename octet_iterator, typename distance_type> void advance + (octet_iterator& it, distance_type n, octet_iterator end); +

+ it: a reference to an iterator pointing to the beginning of an UTF-8 + encoded code point. After the function returns, it is incremented to point to the + nth following code point.
+ n: a positive integer that shows how many code points we want to + advance.
+ end: end of the UTF-8 sequence to be processed. If it + gets equal to end during the extraction of a code point, an + utf8::not_enough_room exception is thrown.
+

+

+ Example of use: +

-char* twochars = "\xe6\x97\xa5\xd1\x88";
-unsigned char* w = twochars;
-
+char* twochars = "\xe6\x97\xa5\xd1\x88";
+unsigned char* w = twochars;
 advance (w, 2, twochars + 6);
-
 assert (w == twochars + 5);
 
-

This function works only "forward". In case of a negative -n, there is no effect.

-

In case of an invalid code point, a -utf8::invalid_code_point exception is thrown.

-

utf8::distance

-

Given the iterators to two UTF-8 encoded code points in a -seqence, returns the number of code points between them.

-template <typename octet_iterator> typename -std::iterator_traits<octet_iterator>::difference_type -distance (octet_iterator first, octet_iterator last); -

first: an iterator to a beginning of a UTF-8 -encoded code point.
-last: an iterator to a "post-end" of the last UTF-8 -encoded code point in the sequence we are trying to determine the -length. It can be the beginning of a new code point, or not.
-Return value the distance between the iterators, in code -points.

-

Example of use:

+

+ This function works only "forward". In case of a negative n, there is + no effect. +

+

+ In case of an invalid code point, a utf8::invalid_code_point exception + is thrown. +

+

+ utf8::distance +

+

+ Given the iterators to two UTF-8 encoded code points in a sequence, returns the + number of code points between them. +

+ template <typename octet_iterator> typename + std::iterator_traits<octet_iterator>::difference_type distance (octet_iterator + first, octet_iterator last); +

+ first: an iterator to a beginning of a UTF-8 encoded code point.
+ last: an iterator to a "post-end" of the last UTF-8 encoded code point + in the sequence we are trying to determine the length. It can be the beginning of a + new code point, or not.
+ Return value: the distance between the iterators, in code points. +

+

+ Example of use: +

-char* twochars = "\xe6\x97\xa5\xd1\x88";
-
+char* twochars = "\xe6\x97\xa5\xd1\x88";
 size_t dist = utf8::distance(twochars, twochars + 5);
-
 assert (dist == 2);
 
-

This function is used to find the length (in code points) of a -UTF-8 encoded string. The reason it is called distance, -rather than, say, length is mainly because developers are -used that length is an O(1) function. Computing the length -of an UTF-8 string is a linear operation, and it looked better to -model it after std::distance algorithm.

-

In case of an invalid UTF-8 seqence, a -utf8::invalid_utf8 exception is thrown. If -last does not point to the past-of-end of a UTF-8 -seqence, a utf8::not_enough_room exception is -thrown.

-

utf8::utf16to8

-

Converts a UTF-16 encoded string to UTF-8.

-template <typename u16bit_iterator, typename -octet_iterator> octet_iterator utf16to8 (u16bit_iterator start, -u16bit_iterator end, octet_iterator result); -

start: an iterator pointing to the beginning of the -UTF-16 encoded string to convert.
-end: an iterator pointing to pass-the-end of the -UTF-16 encoded string to convert.
-result: an output iterator to the place in the UTF-8 -string where to append the result of conversion.
-Return value: An iterator pointing to the place after the appended UTF-8 string.

-

Example of use:

+

+ This function is used to find the length (in code points) of a UTF-8 encoded + string. The reason it is called distance, rather than, say, + length is mainly because developers are used to length being an + O(1) function. Computing the length of an UTF-8 string is a linear operation, and + it looked better to model it after std::distance algorithm. +

+

+ In case of an invalid UTF-8 sequence, a utf8::invalid_utf8 exception is + thrown. If last does not point to the past-the-end of a UTF-8 sequence, + a utf8::not_enough_room exception is thrown. +

+

+ utf8::utf16to8 +

+

+ Converts a UTF-16 encoded string to UTF-8. +

+ template <typename u16bit_iterator, typename octet_iterator> + octet_iterator utf16to8 (u16bit_iterator start, u16bit_iterator end, octet_iterator + result); +

+ start: an iterator pointing to the beginning of the UTF-16 encoded + string to convert.
+ end: an iterator pointing to past-the-end of the UTF-16 encoded string + to convert.
+ result: an output iterator to the place in the UTF-8 string where to + append the result of conversion.
+ Return value: An iterator pointing to the place after the appended UTF-8 + string. +

+

+ Example of use: +

-unsigned short utf16string[] = {0x41, 0x0448, 0x65e5, 0xd834, 0xdd1e};
-vector<unsigned char> utf8result;
-
+unsigned short utf16string[] = {0x41, 0x0448, 0x65e5, 0xd834, 0xdd1e};
+vector<unsigned char> utf8result;
 utf16to8(utf16string, utf16string + 5, back_inserter(utf8result));
-
 assert (utf8result.size() == 10);    
 
-

In case of invalid UTF-16 sequence, a -utf8::invalid_utf16 exception is thrown.

-

utf8::utf8to16

-

Converts an UTF-8 encoded string to UTF-16

-template <typename u16bit_iterator, typename -octet_iterator> u16bit_iterator utf8to16 (octet_iterator start, -octet_iterator end, u16bit_iterator result); -

start: an iterator pointing to the beginning of the -UTF-8 encoded string to convert. < br /> end: an -iterator pointing to pass-the-end of the UTF-8 encoded string to -convert.
-result: an output iterator to the place in the UTF-16 -string where to append the result of conversion.
-Return value: An iterator pointing to the place after the appended UTF-16 string.

-

Example of use:

+

+ In case of invalid UTF-16 sequence, a utf8::invalid_utf16 exception is + thrown. +

+

+ utf8::utf8to16 +

+

+ Converts an UTF-8 encoded string to UTF-16 +

+ template <typename u16bit_iterator, typename octet_iterator> + u16bit_iterator utf8to16 (octet_iterator start, octet_iterator end, u16bit_iterator + result); +

+ start: an iterator pointing to the beginning of the UTF-8 encoded + string to convert. end: an iterator pointing to + past-the-end of the UTF-8 encoded string to convert.
+ result: an output iterator to the place in the UTF-16 string where to + append the result of conversion.
+ Return value: An iterator pointing to the place after the appended UTF-16 + string. +

+

+ Example of use: +

-char utf8_with_surrogates[] = "\xe6\x97\xa5\xd1\x88\xf0\x9d\x84\x9e";
-vector <unsigned short> utf16result;
-
+char utf8_with_surrogates[] = "\xe6\x97\xa5\xd1\x88\xf0\x9d\x84\x9e";
+vector <unsigned short> utf16result;
 utf8to16(utf8_with_surrogates, utf8_with_surrogates + 9, back_inserter(utf16result));
-
 assert (utf16result.size() == 4);
 assert (utf16result[2] == 0xd834);
 assert (utf16result[3] == 0xdd1e);
 
-

In case of an invalid UTF-8 seqence, a -utf8::invalid_utf8 exception is thrown. If -end does not point to the past-of-end of a UTF-8 -seqence, a utf8::not_enough_room exception is -thrown.

-

utf8::utf32to8

-

Converts a UTF-32 encoded string to UTF-8.

-template <typename octet_iterator, typename -u32bit_iterator> octet_iterator utf32to8 (u32bit_iterator start, -u32bit_iterator end, octet_iterator result); -

start: an iterator pointing to the beginning of the -UTF-32 encoded string to convert.
-end: an iterator pointing to pass-the-end of the -UTF-32 encoded string to convert.
-result: an output iterator to the place in the UTF-8 -string where to append the result of conversion.
-Return value: An iterator pointing to the place after the appended UTF-8 string.

-

Example of use:

+

+ In case of an invalid UTF-8 sequence, a utf8::invalid_utf8 exception is + thrown. If end does not point to the past-the-end of a UTF-8 sequence, a + utf8::not_enough_room exception is thrown. +

+

+ utf8::utf32to8 +

+

+ Converts a UTF-32 encoded string to UTF-8. +

+ template <typename octet_iterator, typename u32bit_iterator> + octet_iterator utf32to8 (u32bit_iterator start, u32bit_iterator end, octet_iterator + result); +

+ start: an iterator pointing to the beginning of the UTF-32 encoded + string to convert.
+ end: an iterator pointing to past-the-end of the UTF-32 encoded string + to convert.
+ result: an output iterator to the place in the UTF-8 string where to + append the result of conversion.
+ Return value: An iterator pointing to the place after the appended UTF-8 + string. +

+

+ Example of use: +

-int utf32string[] = {0x448, 0x65E5, 0x10346, 0};
-vector<unsigned char> utf8result;
-
+int utf32string[] = {0x448, 0x65E5, 0x10346, 0};
+vector<unsigned char> utf8result;
 utf32to8(utf32string, utf32string + 3, back_inserter(utf8result));
-
 assert (utf8result.size() == 9);
 
-

In case of invalid UTF-32 string, a -utf8::invalid_code_point exception is thrown.

-

utf8::utf8to32

-

Converts a UTF-8 encoded string to UTF-32.

-template <typename octet_iterator, typename -u32bit_iterator> u32bit_iterator utf8to32 (octet_iterator start, -octet_iterator end, u32bit_iterator result); -

start: an iterator pointing to the beginning of the -UTF-8 encoded string to convert.
-end: an iterator pointing to pass-the-end of the UTF-8 -encoded string to convert.
-result: an output iterator to the place in the UTF-32 -string where to append the result of conversion.
-Return value: An iterator pointing to the place after the appended UTF-32 string.

-

Example of use:

+

+ In case of invalid UTF-32 string, a utf8::invalid_code_point exception + is thrown. +

+

+ utf8::utf8to32 +

+

+ Converts a UTF-8 encoded string to UTF-32. +

+ template <typename octet_iterator, typename u32bit_iterator> + u32bit_iterator utf8to32 (octet_iterator start, octet_iterator end, u32bit_iterator + result); +

+ start: an iterator pointing to the beginning of the UTF-8 encoded + string to convert.
+ end: an iterator pointing to past-the-end of the UTF-8 encoded string + to convert.
+ result: an output iterator to the place in the UTF-32 string where to + append the result of conversion.
+ Return value: An iterator pointing to the place after the appended UTF-32 + string. +

+

+ Example of use: +

-char* twochars = "\xe6\x97\xa5\xd1\x88";
-vector<int> utf32result;
-
+char* twochars = "\xe6\x97\xa5\xd1\x88";
+vector<int> utf32result;
 utf8to32(twochars, twochars + 5, back_inserter(utf32result));
-
 assert (utf32result.size() == 2);
 
-

In case of an invalid UTF-8 seqence, a -utf8::invalid_utf8 exception is thrown. If -end does not point to the past-of-end of a UTF-8 -seqence, a utf8::not_enough_room exception is -thrown.

-

utf8::find_invalid

-

Detects an invalid sequence within a UTF-8 string.

-template <typename octet_iterator> octet_iterator -find_invalid(octet_iterator start, octet_iterator end); -

start: an iterator pointing to the beginning of the -UTF-8 string to test for validity.
-end: an iterator pointing to pass-the-end of the UTF-8 -string to test for validity.
-Return value: an iterator pointing to the first invalid -octet in the UTF-8 string. In case none were found, equals -end.

-

Example of use:

+

+ In case of an invalid UTF-8 sequence, a utf8::invalid_utf8 exception is + thrown. If end does not point to the past-the-end of a UTF-8 sequence, a + utf8::not_enough_room exception is thrown. +

+

+ utf8::find_invalid +

+

+ Detects an invalid sequence within a UTF-8 string. +

+ template <typename octet_iterator> octet_iterator + find_invalid(octet_iterator start, octet_iterator end); +

+ start: an iterator pointing to the beginning of the UTF-8 string to + test for validity.
+ end: an iterator pointing to past-the-end of the UTF-8 string to test + for validity.
+ Return value: an iterator pointing to the first invalid octet in the UTF-8 + string. In case none were found, equals end. +

+

+ Example of use: +

-char utf_invalid[] = "\xe6\x97\xa5\xd1\x88\xfa";
-
-char* invalid = find_invalid(utf_invalid, utf_invalid + 6);
-
+char utf_invalid[] = "\xe6\x97\xa5\xd1\x88\xfa";
+char* invalid = find_invalid(utf_invalid, utf_invalid + 6);
 assert (invalid == utf_invalid + 5);
 
-

This function is typically used to make sure a UTF-8 string is -valid before processing it with other functions. It is especially -important to call it if before doing any of the unchecked -operations on it.

-

utf8::is_valid

-

Checks whether a sequence of octets is a valid UTF-8 string.

-template <typename octet_iterator> bool -is_valid(octet_iterator start, octet_iterator end); -

start: an iterator pointing to the beginning of the -UTF-8 string to test for validity.
-end: an iterator pointing to pass-the-end of the UTF-8 -string to test for validity.
-Return value: true if the sequence is a valid -UTF-8 string; false if not.

-Example of use: +

+ This function is typically used to make sure a UTF-8 string is valid before + processing it with other functions. It is especially important to call it before + doing any of the unchecked operations on it. +

+

+ utf8::is_valid +

+

+ Checks whether a sequence of octets is a valid UTF-8 string. +

+ template <typename octet_iterator> bool is_valid(octet_iterator start, + octet_iterator end); +

+ start: an iterator pointing to the beginning of the UTF-8 string to + test for validity.
+ end: an iterator pointing to past-the-end of the UTF-8 string to test + for validity.
+ Return value: true if the sequence is a valid UTF-8 string; + false if not. +

+ Example of use:
-char utf_invalid[] = "\xe6\x97\xa5\xd1\x88\xfa";
-
-bool bvalid = is_valid(utf_invalid, utf_invalid + 6);
-
+char utf_invalid[] = "\xe6\x97\xa5\xd1\x88\xfa";
+bool bvalid = is_valid(utf_invalid, utf_invalid + 6);
 assert (bvalid == false);
 
-

is_valid is a shorthand for -find_invalid(start, end) == end;. You may want to use -it to make sure that a byte seqence is a valid UTF-8 string without -the need to know where it fails if it is not valid.

-

utf8::replace_invalid

-

Replaces all invalid UTF-8 sequences within a string with a replacement marker.

-

template <typename octet_iterator, typename output_iterator> -output_iterator replace_invalid(octet_iterator start, octet_iterator end, output_iterator out, uint32_t replacement); -

-

template <typename octet_iterator, typename output_iterator> -output_iterator replace_invalid(octet_iterator start, octet_iterator end, output_iterator out); -

-

-start: an iterator pointing to the beginning of the -UTF-8 string to look for invalid UTF-8 sequences.
-end: an iterator pointing to pass-the-end of the UTF-8 -string to look for invalid UTF-8 sequences.
-out: An output iterator to the range where the result of replacement is stored.
-replacement: A Unicode code point for the replacement marker. The version without this -parameter assumes the value 0xfffd
-Return value: An iterator pointing to the place after the UTF-8 string with -replaced invalid sequences.

-

Example of use:

+

+ is_valid is a shorthand for find_invalid(start, end) == + end;. You may want to use it to make sure that a byte sequence is a valid + UTF-8 string without the need to know where it fails if it is not valid. +

+

+ utf8::replace_invalid +

+

+ Replaces all invalid UTF-8 sequences within a string with a replacement marker. +

+

+ template <typename octet_iterator, typename output_iterator> + output_iterator replace_invalid(octet_iterator start, octet_iterator end, + output_iterator out, uint32_t replacement); +

+

+ template <typename octet_iterator, typename output_iterator> + output_iterator replace_invalid(octet_iterator start, octet_iterator end, + output_iterator out); +

+

+ start: an iterator pointing to the beginning of the UTF-8 string to + look for invalid UTF-8 sequences.
+ end: an iterator pointing to past-the-end of the UTF-8 string to look + for invalid UTF-8 sequences.
+ out: An output iterator to the range where the result of replacement + is stored.
+ replacement: A Unicode code point for the replacement marker. The + version without this parameter assumes the value 0xfffd
+ Return value: An iterator pointing to the place after the UTF-8 string with + replaced invalid sequences. +

+

+ Example of use: +

-char invalid_sequence[] = "a\x80\xe0\xa0\xc0\xaf\xed\xa0\x80z";
-vector<char> replace_invalid_result;
-
+char invalid_sequence[] = "a\x80\xe0\xa0\xc0\xaf\xed\xa0\x80z";
+vector<char> replace_invalid_result;
 replace_invalid (invalid_sequence, invalid_sequence + sizeof(invalid_sequence), back_inserter(replace_invalid_result), '?');
-
 bvalid = is_valid(replace_invalid_result.begin(), replace_invalid_result.end());
 assert (bvalid);
-char* fixed_invalid_sequence = "a????z";
+char* fixed_invalid_sequence = "a????z";
 assert (std::equal(replace_invalid_result.begin(), replace_invalid_result.end(), fixed_invalid_sequence));
 
-

-replace_invalid does not perform in-place replacement of invalid sequences. Rather, it produces a copy -of the original string with the invalid sequences replaced with a replacement marker. Therefore, out must -not be in the [start, end] range. -

-

If end does not point to the past-of-end of a UTF-8 sequence, a utf8::not_enough_room -exception is thrown.

-

utf8::is_bom

-

Checks whether a sequence of three octets is a UTF-8 byte order -mark (BOM)

-template <typename octet_iterator> bool is_bom -(octet_iterator it); -

it: beginning of the 3-octet sequence to check
-Return value: true if the sequence is UTF-8 -byte order mark; false if not.

-

Example of use:

+

+ replace_invalid does not perform in-place replacement of invalid + sequences. Rather, it produces a copy of the original string with the invalid + sequences replaced with a replacement marker. Therefore, out must not + be in the [start, end] range. +

+

+ If end does not point to the past-the-end of a UTF-8 sequence, a + utf8::not_enough_room exception is thrown. +

+

+ utf8::is_bom +

+

+ Checks whether a sequence of three octets is a UTF-8 byte order mark (BOM) +

+ template <typename octet_iterator> bool is_bom (octet_iterator + it); +

+ it: beginning of the 3-octet sequence to check
+ Return value: true if the sequence is UTF-8 byte order mark; + false if not. +

+

+ Example of use: +

-unsigned char byte_order_mark[] = {0xef, 0xbb, 0xbf};
-
-bool bbom = is_bom(byte_order_mark);
-
+unsigned char byte_order_mark[] = {0xef, 0xbb, 0xbf};
+bool bbom = is_bom(byte_order_mark);
 assert (bbom == true);
 
-

The typical use of this function is to check the first three -bytes of a file. If they form the UTF-8 BOM, we want to skip them -before processing the actual UTF-8 encoded text.

-

Functions From utf8::unchecked Namespace

-

utf8::unchecked::append

-

Encodes a 32 bit code point as a UTF-8 sequence of octets and -appends the sequence to a UTF-8 string.

-template <typename octet_iterator> octet_iterator -append(uint32_t cp, octet_iterator result); -

cp: A 32 bit integer representing a code point to -append to the sequence.
-result: An output iterator to the place in the -sequence where to append the code point.
-Return value: An iterator pointing to the place after the -newly appended sequence.

-

Example of use:

+

+ The typical use of this function is to check the first three bytes of a file. If + they form the UTF-8 BOM, we want to skip them before processing the actual UTF-8 + encoded text. +

+

+ Functions From utf8::unchecked Namespace +

+

+ utf8::unchecked::append +

+

+ Encodes a 32 bit code point as a UTF-8 sequence of octets and appends the sequence + to a UTF-8 string. +

+ template <typename octet_iterator> octet_iterator append(uint32_t cp, + octet_iterator result); +

+ cp: A 32 bit integer representing a code point to append to the + sequence.
+ result: An output iterator to the place in the sequence where to + append the code point.
+ Return value: An iterator pointing to the place after the newly appended + sequence. +

+

+ Example of use: +

-unsigned char u[5] = {0,0,0,0,0};
-
-unsigned char* end = unchecked::append(0x0448, u);
-
+unsigned char u[5] = {0,0,0,0,0};
+unsigned char* end = unchecked::append(0x0448, u);
 assert (u[0] == 0xd1 && u[1] == 0x88 && u[2] == 0 && u[3] == 0 && u[4] == 0);
 
-

This is a quicker but less safe version of -utf8::append. It does not check for validity of the -supplied code point, and may produce an invalid UTF-8 sequence.

-

utf8::unchecked::next

-

Given the iterator to the beginning of a UTF-8 sequence, it -returns the code point and moves the iterator to the next -position.

-template <typename octet_iterator> uint32_t -next(octet_iterator& it); -

it: a reference to an iterator pointing to the -beginning of an UTF-8 encoded code point. After the function -returns, it is incremented to point to the beginning of the next -code point.
-Return value: the 32 bit representation of the processed -UTF-8 code point.

-

Example of use:

+

+ This is a quicker but less safe version of utf8::append. It does not + check for validity of the supplied code point, and may produce an invalid UTF-8 + sequence. +

+

+ utf8::unchecked::next +

+

+ Given the iterator to the beginning of a UTF-8 sequence, it returns the code point + and moves the iterator to the next position. +

+ template <typename octet_iterator> uint32_t next(octet_iterator& + it); +

+ it: a reference to an iterator pointing to the beginning of an UTF-8 + encoded code point. After the function returns, it is incremented to point to the + beginning of the next code point.
+ Return value: the 32 bit representation of the processed UTF-8 code point. +

+

+ Example of use: +

-char* twochars = "\xe6\x97\xa5\xd1\x88";
-char* w = twochars;
-
-int cp = unchecked::next(w);
-
+char* twochars = "\xe6\x97\xa5\xd1\x88";
+char* w = twochars;
+int cp = unchecked::next(w);
 assert (cp == 0x65e5);
 assert (w == twochars + 3);
 
-

This is a quicker but less safe version of -utf8::next. It does not check for validity of the -supplied UTF-8 sequence.

-

utf8::unchecked::previous

-

Given a reference to an iterator pointing to an octet in a UTF-8 -seqence, it decreases the iterator until it hits the beginning of -the previous UTF-8 encoded code point and returns the 32 bits -representation of the code point.

-template <typename octet_iterator> uint32_t -previous(octet_iterator& it); -

it: a reference pointing to an octet within a UTF-8 -encoded string. After the function returns, it is decremented to -point to the beginning of the previous code point.
-Return value: the 32 bit representation of the previous code -point.

-

Example of use:

+

+ This is a quicker but less safe version of utf8::next. It does not + check for validity of the supplied UTF-8 sequence. +

+

+ utf8::unchecked::previous +

+

+ Given a reference to an iterator pointing to an octet in a UTF-8 sequence, it + decreases the iterator until it hits the beginning of the previous UTF-8 encoded + code point and returns the 32 bit representation of the code point. +

+ template <typename octet_iterator> uint32_t previous(octet_iterator& + it); +

+ it: a reference pointing to an octet within a UTF-8 encoded string. + After the function returns, it is decremented to point to the beginning of the + previous code point.
+ Return value: the 32 bit representation of the previous code point. +

+

+ Example of use: +

-char* twochars = "\xe6\x97\xa5\xd1\x88";
-char* w = twochars + 3;
-
-int cp = unchecked::previous (w);
-
+char* twochars = "\xe6\x97\xa5\xd1\x88";
+char* w = twochars + 3;
+int cp = unchecked::previous (w);
 assert (cp == 0x65e5);
 assert (w == twochars);
 
-

This is a quicker but less safe version of -utf8::previous. It does not check for validity of the -supplied UTF-8 sequence and offers no boundary checking.

-

utf8::unchecked::advance

-

Advances an iterator by the specified number of code points -within an UTF-8 sequence.

-template <typename octet_iterator, typename -distance_type> void advance (octet_iterator& it, -distance_type n); -

it: a reference to an iterator pointing to the -beginning of an UTF-8 encoded code point. After the function -returns, it is incremented to point to the nth following code -point.
-n: a positive integer that shows how many code points -we want to advance.

-

Example of use:

+

+ This is a quicker but less safe version of utf8::previous. It does not + check for validity of the supplied UTF-8 sequence and offers no boundary checking. +

+

+ utf8::unchecked::advance +

+

+ Advances an iterator by the specified number of code points within an UTF-8 + sequence. +

+ template <typename octet_iterator, typename distance_type> void advance + (octet_iterator& it, distance_type n); +

+ it: a reference to an iterator pointing to the beginning of an UTF-8 + encoded code point. After the function returns, it is incremented to point to the + nth following code point.
+ n: a positive integer that shows how many code points we want to + advance.
+

+

+ Example of use: +

-char* twochars = "\xe6\x97\xa5\xd1\x88";
-char* w = twochars;
-
+char* twochars = "\xe6\x97\xa5\xd1\x88";
+char* w = twochars;
 unchecked::advance (w, 2);
-
 assert (w == twochars + 5);
 
-

This function works only "forward". In case of a negative -n, there is no effect.

-

This is a quicker but less safe version of -utf8::advance. It does not check for validity of the -supplied UTF-8 sequence and offers no boundary checking.

-

utf8::unchecked::distance

-

Given the iterators to two UTF-8 encoded code points in a -seqence, returns the number of code points between them.

-template <typename octet_iterator> typename -std::iterator_traits<octet_iterator>::difference_type -distance (octet_iterator first, octet_iterator last); -

first: an iterator to a beginning of a UTF-8 -encoded code point.
-last: an iterator to a "post-end" of the last UTF-8 -encoded code point in the sequence we are trying to determine the -length. It can be the beginning of a new code point, or not.
-Return value the distance between the iterators, in code -points.

-

Example of use:

+

+ This function works only "forward". In case of a negative n, there is + no effect. +

+

+ This is a quicker but less safe version of utf8::advance. It does not + check for validity of the supplied UTF-8 sequence and offers no boundary checking. +

+

+ utf8::unchecked::distance +

+

+ Given the iterators to two UTF-8 encoded code points in a sequence, returns the + number of code points between them. +

+ template <typename octet_iterator> typename + std::iterator_traits<octet_iterator>::difference_type distance (octet_iterator + first, octet_iterator last); +

+ first: an iterator to a beginning of a UTF-8 encoded code point.
+ last: an iterator to a "post-end" of the last UTF-8 encoded code point + in the sequence we are trying to determine the length. It can be the beginning of a + new code point, or not.
+ Return value: the distance between the iterators, in code points. +

+

+ Example of use: +

-char* twochars = "\xe6\x97\xa5\xd1\x88";
-
+char* twochars = "\xe6\x97\xa5\xd1\x88";
 size_t dist = utf8::unchecked::distance(twochars, twochars + 5);
-
 assert (dist == 2);
 
-

This is a quicker but less safe version of -utf8::distance. It does not check for validity of the -supplied UTF-8 sequence.

-

utf8::unchecked::utf16to8

-

Converts a UTF-16 encoded string to UTF-8.

-template <typename u16bit_iterator, typename -octet_iterator> octet_iterator utf16to8 (u16bit_iterator start, -u16bit_iterator end, octet_iterator result); -

start: an iterator pointing to the beginning of the -UTF-16 encoded string to convert.
-end: an iterator pointing to pass-the-end of the -UTF-16 encoded string to convert.
-result: an output iterator to the place in the UTF-8 -string where to append the result of conversion. -Return value: An iterator pointing to the place after the appended UTF-8 string.

-

Example of use:

+

+ This is a quicker but less safe version of utf8::distance. It does not + check for validity of the supplied UTF-8 sequence. +

+

+ utf8::unchecked::utf16to8 +

+

+ Converts a UTF-16 encoded string to UTF-8. +

+ template <typename u16bit_iterator, typename octet_iterator> + octet_iterator utf16to8 (u16bit_iterator start, u16bit_iterator end, octet_iterator + result); +

+ start: an iterator pointing to the beginning of the UTF-16 encoded + string to convert.
+ end: an iterator pointing to past-the-end of the UTF-16 encoded string + to convert.
+ result: an output iterator to the place in the UTF-8 string where to + append the result of conversion. Return value: An iterator pointing to the + place after the appended UTF-8 string. +

+

+ Example of use: +

-unsigned short utf16string[] = {0x41, 0x0448, 0x65e5, 0xd834, 0xdd1e};
-vector<unsigned char> utf8result;
-
+unsigned short utf16string[] = {0x41, 0x0448, 0x65e5, 0xd834, 0xdd1e};
+vector<unsigned char> utf8result;
 unchecked::utf16to8(utf16string, utf16string + 5, back_inserter(utf8result));
-
 assert (utf8result.size() == 10);    
 
-

This is a quicker but less safe version of -utf8::utf16to8. It does not check for validity of the -supplied UTF-16 sequence.

-

utf8::unchecked::utf8to16

-

Converts an UTF-8 encoded string to UTF-16

-template <typename u16bit_iterator, typename -octet_iterator> u16bit_iterator utf8to16 (octet_iterator start, -octet_iterator end, u16bit_iterator result); -

start: an iterator pointing to the beginning of the -UTF-8 encoded string to convert. < br /> end: an -iterator pointing to pass-the-end of the UTF-8 encoded string to -convert.
-result: an output iterator to the place in the UTF-16 -string where to append the result of conversion.
-Return value: An iterator pointing to the place after the appended UTF-16 string. -

- -

Example of use:

+

+ This is a quicker but less safe version of utf8::utf16to8. It does not + check for validity of the supplied UTF-16 sequence. +

+

+ utf8::unchecked::utf8to16 +

+

+ Converts an UTF-8 encoded string to UTF-16 +

+ template <typename u16bit_iterator, typename octet_iterator> + u16bit_iterator utf8to16 (octet_iterator start, octet_iterator end, u16bit_iterator + result); +

+ start: an iterator pointing to the beginning of the UTF-8 encoded + string to convert. <br /> end: an iterator pointing to + past-the-end of the UTF-8 encoded string to convert.
+ result: an output iterator to the place in the UTF-16 string where to + append the result of conversion.
+ Return value: An iterator pointing to the place after the appended UTF-16 + string. +

+

+ Example of use: +

-char utf8_with_surrogates[] = "\xe6\x97\xa5\xd1\x88\xf0\x9d\x84\x9e";
-vector <unsigned short> utf16result;
-
+char utf8_with_surrogates[] = "\xe6\x97\xa5\xd1\x88\xf0\x9d\x84\x9e";
+vector <unsigned short> utf16result;
 unchecked::utf8to16(utf8_with_surrogates, utf8_with_surrogates + 9, back_inserter(utf16result));
-
 assert (utf16result.size() == 4);
 assert (utf16result[2] == 0xd834);
 assert (utf16result[3] == 0xdd1e);
 
-

This is a quicker but less safe version of -utf8::utf8to16. It does not check for validity of the -supplied UTF-8 sequence.

-

utf8::unchecked::utf32to8

-

Converts a UTF-32 encoded string to UTF-8.

-template <typename octet_iterator, typename -u32bit_iterator> octet_iterator utf32to8 (u32bit_iterator start, -u32bit_iterator end, octet_iterator result); -

start: an iterator pointing to the beginning of the -UTF-32 encoded string to convert.
-end: an iterator pointing to pass-the-end of the -UTF-32 encoded string to convert.
-result: an output iterator to the place in the UTF-8 -string where to append the result of conversion.
-Return value: An iterator pointing to the place after the appended UTF-8 string. -

-

Example of use:

+

+ This is a quicker but less safe version of utf8::utf8to16. It does not + check for validity of the supplied UTF-8 sequence. +

+

+ utf8::unchecked::utf32to8 +

+

+ Converts a UTF-32 encoded string to UTF-8. +

+ template <typename octet_iterator, typename u32bit_iterator> + octet_iterator utf32to8 (u32bit_iterator start, u32bit_iterator end, octet_iterator + result); +

+ start: an iterator pointing to the beginning of the UTF-32 encoded + string to convert.
+ end: an iterator pointing to past-the-end of the UTF-32 encoded string + to convert.
+ result: an output iterator to the place in the UTF-8 string where to + append the result of conversion.
+ Return value: An iterator pointing to the place after the appended UTF-8 + string. +

+

+ Example of use: +

-int utf32string[] = {0x448, 0x65E5, 0x10346, 0};
-vector<unsigned char> utf8result;
-
+int utf32string[] = {0x448, 0x65E5, 0x10346, 0};
+vector<unsigned char> utf8result;
 utf32to8(utf32string, utf32string + 3, back_inserter(utf8result));
-
 assert (utf8result.size() == 9);
 
-

This is a quicker but less safe version of -utf8::utf32to8. It does not check for validity of the -supplied UTF-32 sequence.

-

utf8::unchecked::utf8to32

-

Converts a UTF-8 encoded string to UTF-32.

-template <typename octet_iterator, typename -u32bit_iterator> u32bit_iterator utf8to32 (octet_iterator start, -octet_iterator end, u32bit_iterator result); -

start: an iterator pointing to the beginning of the -UTF-8 encoded string to convert.
-end: an iterator pointing to pass-the-end of the UTF-8 -encoded string to convert.
-result: an output iterator to the place in the UTF-32 -string where to append the result of conversion.
-Return value: An iterator pointing to the place after the appended UTF-32 string. -

-

Example of use:

+

+ This is a quicker but less safe version of utf8::utf32to8. It does not + check for validity of the supplied UTF-32 sequence. +

+

+ utf8::unchecked::utf8to32 +

+

+ Converts a UTF-8 encoded string to UTF-32. +

+ template <typename octet_iterator, typename u32bit_iterator> + u32bit_iterator utf8to32 (octet_iterator start, octet_iterator end, u32bit_iterator + result); +

+ start: an iterator pointing to the beginning of the UTF-8 encoded + string to convert.
+ end: an iterator pointing to past-the-end of the UTF-8 encoded string + to convert.
+ result: an output iterator to the place in the UTF-32 string where to + append the result of conversion.
+ Return value: An iterator pointing to the place after the appended UTF-32 + string. +

+

+ Example of use: +

-char* twochars = "\xe6\x97\xa5\xd1\x88";
-vector<int> utf32result;
-
+char* twochars = "\xe6\x97\xa5\xd1\x88";
+vector<int> utf32result;
 unchecked::utf8to32(twochars, twochars + 5, back_inserter(utf32result));
-
 assert (utf32result.size() == 2);
 
-

This is a quicker but less safe version of -utf8::utf8to32. It does not check for validity of the -supplied UTF-8 sequence.

-

Points of interest

-

Design goals and decisions

-

The library was designed to be:

-
    -
  1. Generic: for better or worse, there are many C++ string classes -out there, and the library should work with as many of them as -possible.
  2. -
  3. Portable: the library should be portable both accross different -platforms and compilers. The only non-portable code is a small -section that declares unsigned integers of different sizes: three -typedefs. They can be changed by the users of the library if they -don't match their platform. The default setting should work for -Windows (both 32 and 64 bit), and most 32 bit and 64 bit Unix -derivatives.
  4. -
  5. Lightweight: follow the "pay only for what you use" -guidline.
  6. -
  7. Unintrusive: avoid forcing any particular design or even -programming style on the user. This is a library, not a -framework.
  8. -
-

Alternatives

-

In case you want to look into other means of working with UTF-8 -strings from C++, here is the list of solutions I am aware of:

-
    -
  1. ICU Library. It is -very powerful, complete, feature-rich, mature, and widely used. -Also big, intrusive, non-generic, and doesn't play well with the -Standard Library. I definitelly recommend looking at ICU even if -you don't plan to use it.
  2. -
  3. Glib::ustring. -A class specifically made to work with UTF-8 strings, and also feel -like std::string. If you prefer to have yet another -string class in your code, it may be worth a look. Be aware of the -licensing issues, though.
  4. -
  5. Platform dependent solutions: Windows and POSIX have functions -to convert strings from one encoding to another. That is only a -subset of what my library offers, but if that is all you need it -may be good enough, especially given the fact that these functions -are mature and tested in production.
  6. -
-

Conclusion

-

Until Unicode becomes officially recognized by the C++ Standard -Library, we need to use other means to work with UTF-8 strings. -Template functions I describe in this article may be a good step in -this direction.

-

References

-
    -
  1. The Unicode -Consortium.
  2. -
  3. ICU Library.
  4. -
  5. UTF-8 at -Wikipedia
  6. -
  7. UTF-8 and Unicode FAQ for Unix/Linux
  8. -
- +

+ This is a quicker but less safe version of utf8::utf8to32. It does not + check for validity of the supplied UTF-8 sequence. +

+

+ Points of interest +

+

+ Design goals and decisions +

+

+ The library was designed to be: +

+
    +
  1. + Generic: for better or worse, there are many C++ string classes out there, and + the library should work with as many of them as possible. +
  2. +
  3. + Portable: the library should be portable both across different platforms and + compilers. The only non-portable code is a small section that declares unsigned + integers of different sizes: three typedefs. They can be changed by the users of + the library if they don't match their platform. The default setting should work + for Windows (both 32 and 64 bit), and most 32 bit and 64 bit Unix derivatives. +
  4. +
  5. + Lightweight: follow the "pay only for what you use" guideline. +
  6. +
  7. + Unintrusive: avoid forcing any particular design or even programming style on the + user. This is a library, not a framework. +
  8. +
+

+ Alternatives +

+

+ In case you want to look into other means of working with UTF-8 strings from C++, + here is the list of solutions I am aware of: +

+
    +
  1. + ICU Library. It is very powerful, + complete, feature-rich, mature, and widely used. Also big, intrusive, + non-generic, and doesn't play well with the Standard Library. I definitely + recommend looking at ICU even if you don't plan to use it. +
  2. +
  3. + Glib::ustring. + A class specifically made to work with UTF-8 strings, and also feel like + std::string. If you prefer to have yet another string class in your + code, it may be worth a look. Be aware of the licensing issues, though. +
  4. +
  5. + Platform dependent solutions: Windows and POSIX have functions to convert strings + from one encoding to another. That is only a subset of what my library offers, + but if that is all you need it may be good enough, especially given the fact that + these functions are mature and tested in production. +
  6. +
+

+ Conclusion +

+

+ Until Unicode becomes officially recognized by the C++ Standard Library, we need to + use other means to work with UTF-8 strings. Template functions I describe in this + article may be a good step in this direction. +

+

+ References +

+
    +
  1. + The Unicode Consortium. +
  2. +
  3. + ICU Library. +
  4. +
  5. + UTF-8 at Wikipedia +
  6. +
  7. + UTF-8 and Unicode FAQ for + Unix/Linux +
  8. +
+ diff --git a/v2_0/source/utf8/checked.h b/v2_0/source/utf8/checked.h index 459c450..980be27 100644 --- a/v2_0/source/utf8/checked.h +++ b/v2_0/source/utf8/checked.h @@ -38,7 +38,7 @@ namespace utf8 uint32_t cp; public: invalid_code_point(uint32_t cp) : cp(cp) {} - const char* what() { return "Invalid code point"; } + virtual const char* what() const throw() { return "Invalid code point"; } uint32_t code_point() const {return cp;} }; @@ -46,7 +46,7 @@ namespace utf8 uint8_t u8; public: invalid_utf8 (uint8_t u) : u8(u) {} - const char* what() { return "Invalid UTF-8"; } + virtual const char* what() const throw() { return "Invalid UTF-8"; } uint8_t utf8_octet() const {return u8;} }; @@ -54,13 +54,13 @@ namespace utf8 uint16_t u16; public: invalid_utf16 (uint16_t u) : u16(u) {} - const char* what() { return "Invalid UTF-16"; } + virtual const char* what() const throw() { return "Invalid UTF-16"; } uint16_t utf16_word() const {return u16;} }; class not_enough_room : public std::exception { public: - const char* what() { return "Not enough space"; } + virtual const char* what() const throw() { return "Not enough space"; } }; /// The library API - functions intended to be called by the users @@ -236,6 +236,45 @@ namespace utf8 return result; } + + // The iterator class + template + class iterator { + static const typename std::iterator_traits::difference_type MAX_UTF8_SEQUENCE_LENGTH = 4; + octet_iterator it; + public: + explicit iterator (const octet_iterator& octet_it) : it(octet_it) {} + // the default "big three" are OK + uint32_t operator * () const + { + octet_iterator temp = it; + return next(temp, temp + MAX_UTF8_SEQUENCE_LENGTH); + } + bool operator == (const iterator& rhs) const { return (it == rhs.it); } + iterator& operator ++ () + { + next(it, it + MAX_UTF8_SEQUENCE_LENGTH); + return *this; + } + iterator operator ++ (int) + { + iterator temp = *this; + next(it, it + MAX_UTF8_SEQUENCE_LENGTH); + return temp; + } + iterator& operator -- () + { + previous(it, 
it - MAX_UTF8_SEQUENCE_LENGTH); + return *this; + } + iterator operator -- (int) + { + iterator temp = *this; + previous(it, it - MAX_UTF8_SEQUENCE_LENGTH); + return temp; + } + }; // class iterator + } // namespace utf8 #endif //header guard diff --git a/v2_0/test_drivers/smoke_test/test.cpp b/v2_0/test_drivers/smoke_test/test.cpp index e6243c5..77154ed 100644 --- a/v2_0/test_drivers/smoke_test/test.cpp +++ b/v2_0/test_drivers/smoke_test/test.cpp @@ -1,7 +1,6 @@ #include #include -#include #include "../../source/utf8.h" using namespace utf8; using namespace std; @@ -131,6 +130,22 @@ int main() char* fixed_invalid_sequence = "a????z"; assert (std::equal(replace_invalid_result.begin(), replace_invalid_result.end(), fixed_invalid_sequence)); + // iterator + utf8::iterator it(threechars); + utf8::iterator it2 = it; + assert (it2 == it); + assert (*it == 0x10346); + assert (*(++it) == 0x65e5); + assert ((*it++) == 0x65e5); + assert (*it == 0x0448); + utf8::iterator endit (threechars + 9); + assert (++it == endit); + assert (*(--it) == 0x0448); + assert ((*it--) == 0x0448); + assert (*it == 0x65e5); + assert (--it == utf8::iterator(threechars)); + assert (*it == 0x10346); + ////////////////////////////////////////////////////////// //// Unchecked variants //////////////////////////////////////////////////////////