diff --git a/doc/utf8cpp.html b/doc/utf8cpp.html index c915572..1228c49 100644 --- a/doc/utf8cpp.html +++ b/doc/utf8cpp.html @@ -6,50 +6,65 @@ + UTF8-CPP: UTF-8 with C++ in a Portable Way +

The Sourceforge project page

-

- Table of Contents -

- +
+

+ Table of Contents +

+ +

Introduction

@@ -91,54 +106,64 @@ #include <iostream> #include <string> #include <vector> -using namespace std; -int main() +using namespace std; +int main() { - if (argc != 2) { - cout << "\nUsage: docsample filename\n"; - return 0; + if (argc != 2) { + cout << "\nUsage: docsample filename\n"; + return 0; } - const char* test_file_path = argv[1]; - // Open the test file (must be UTF-8 encoded) + const char* test_file_path = argv[1]; + // Open the test file (must be UTF-8 encoded) ifstream fs8(test_file_path); - if (!fs8.is_open()) { - cout << "Could not open " << test_file_path << endl; - return 0; + if (!fs8.is_open()) { + cout << "Could not open " << test_file_path << endl; + return 0; } - // Read the first line of the file - unsigned line_count = 1; + // Read the first line of the file + unsigned line_count = 1; string line; - if (!getline(fs8, line)) - return 0; - // Look for utf-8 byte-order mark at the beginning - if (line.size() > 2) { - if (utf8::is_bom(line.c_str())) - cout << "There is a byte order mark at the beginning of the file\n"; + if (!getline(fs8, line)) + return 0; + // Look for utf-8 byte-order mark at the beginning + if (line.size() > 2) { + if (utf8::is_bom(line.c_str())) + cout << "There is a byte order mark at the beginning of the file\n"; } - // Play with all the lines in the file - do { + // Play with all the lines in the file + do { // check for invalid utf-8 (for a simple yes/no check, there is also utf8::is_valid function) string::iterator end_it = utf8::find_invalid(line.begin(), line.end()); - if (end_it != line.end()) { - cout << "Invalid UTF-8 encoding detected at line " << line_count << "\n"; - cout << "This part is fine: " << string(line.begin(), end_it) << "\n"; + if (end_it != line.end()) { + cout << "Invalid UTF-8 encoding detected at line " << line_count << "\n"; + cout << "This part is fine: " << string(line.begin(), end_it) << "\n"; } - // Get the line length (at least for the valid part) - int length = utf8::distance(line.begin(), 
end_it); - cout << "Length of line " << line_count << " is " << length << "\n"; - // Convert it to utf-16 + // Get the line length (at least for the valid part) + int length = utf8::distance(line.begin(), end_it); + cout << "Length of line " << line_count << " is " << length << "\n"; + // Convert it to utf-16 vector<unsigned short> utf16line; utf8::utf8to16(line.begin(), end_it, back_inserter(utf16line)); - // And back to utf-8; + // And back to utf-8 string utf8line; utf8::utf16to8(utf16line.begin(), utf16line.end(), back_inserter(utf8line)); - // Confirm that the conversion went OK: - if (utf8line != string(line.begin(), end_it)) - cout << "Error in UTF-16 conversion at line: " << line_count << "\n"; + // Confirm that the conversion went OK: + if (utf8line != string(line.begin(), end_it)) + cout << "Error in UTF-16 conversion at line: " << line_count << "\n"; getline(fs8, line); line_count++; - } while (!fs8.eof()); - return 0; + } while (!fs8.eof()); + return 0; }

@@ -164,25 +189,35 @@ Encodes a 32 bit code point as a UTF-8 sequence of octets and appends the sequence to a UTF-8 string.

- -template <typename octet_iterator> +
+template <typename octet_iterator>
 octet_iterator append(uint32_t cp, octet_iterator result);
-     
+   
+

cp: A 32 bit integer representing a code point to append to the sequence.
- result: An output iterator to the place in the sequence where to + result: An output iterator to the place in the sequence where to append the code point.
- Return value: An iterator pointing to the place after the newly appended - sequence. + Return value: An iterator pointing to the place + after the newly appended sequence.

Example of use:

-unsigned char u[5] = {0,0,0,0,0};
-unsigned char* end = append(0x0448, u);
-assert (u[0] == 0xd1 && u[1] == 0x88 && u[2] == 0 && u[3] == 0 && u[4] == 0);
+unsigned char u[5] = {0,0,0,0,0};
+unsigned char* end = append(0x0448, u);
+assert (u[0] == 0xd1 && u[1] == 0x88 && u[2] == 0 && u[3] == 0 && u[4] == 0);
 

Note that append does not allocate any memory - it is the burden of @@ -202,26 +237,32 @@ assert (u[0] == 0xd1 && u[1] == 0x88 && u[2] == 0 && u[3 Given the iterator to the beginning of the UTF-8 sequence, it returns the code point and moves the iterator to the next position.

- template <typename octet_iterator> uint32_t next(octet_iterator& it, - octet_iterator end); +
+template <typename octet_iterator> 
+uint32_t next(octet_iterator& it, octet_iterator end);
+   
+

it: a reference to an iterator pointing to the beginning of an UTF-8 encoded code point. After the function returns, it is incremented to point to the beginning of the next code point.
- end: end of the UTF-8 sequence to be processed. If it + end: end of the UTF-8 sequence to be processed. If it gets equal to end during the extraction of a code point, an utf8::not_enough_room exception is thrown.
- Return value: the 32 bit representation of the processed UTF-8 code point. + Return value: the 32 bit representation of the + processed UTF-8 code point.

Example of use:

-char* twochars = "\xe6\x97\xa5\xd1\x88";
-char* w = twochars;
-int cp = next(w, twochars + 6);
-assert (cp == 0x65e5);
-assert (w == twochars + 3);
+char* twochars = "\xe6\x97\xa5\xd1\x88";
+char* w = twochars;
+int cp = next(w, twochars + 6);
+assert (cp == 0x65e5);
+assert (w == twochars + 3);
 

This function is typically used to iterate through a UTF-8 encoded string. @@ -238,26 +279,34 @@ assert (w == twochars + 3); decreases the iterator until it hits the beginning of the previous UTF-8 encoded code point and returns the 32 bits representation of the code point.

- template <typename octet_iterator> uint32_t previous(octet_iterator& - it, octet_iterator pass_start); +
+template <typename octet_iterator> 
+uint32_t previous(octet_iterator& it, octet_iterator pass_start);
+   
+

it: a reference pointing to an octet within a UTF-8 encoded string. After the function returns, it is decremented to point to the beginning of the previous code point.
- pass_start: an iterator to the point in the sequence where the search + pass_start: an iterator to the point in the sequence where the search for the beginning of a code point is aborted if no result was reached. It is a safety measure to prevent passing the beginning of the string in the search for a UTF-8 lead octet.
- Return value: the 32 bit representation of the previous code point. + Return value: the 32 bit representation of the + previous code point.

Example of use:

-char* twochars = "\xe6\x97\xa5\xd1\x88";
-unsigned char* w = twochars + 3;
-int cp = previous (w, twochars - 1);
-assert (cp == 0x65e5);
+char* twochars = "\xe6\x97\xa5\xd1\x88";
+unsigned char* w = twochars + 3;
+int cp = previous (w, twochars - 1);
+assert (cp == 0x65e5);
 assert (w == twochars);
 

@@ -280,15 +329,20 @@ assert (w == twochars); Advances an iterator by the specified number of code points within an UTF-8 sequence.

- template <typename octet_iterator, typename distance_type> void advance - (octet_iterator& it, distance_type n, octet_iterator end); +
+template <typename octet_iterator, typename distance_type> 
+void advance (octet_iterator& it, distance_type n, octet_iterator end);
+   
+

it: a reference to an iterator pointing to the beginning of an UTF-8 encoded code point. After the function returns, it is incremented to point to the nth following code point.
- n: a positive integer that shows how many code points we want to + n: a positive integer that shows how many code points we want to advance.
- end: end of the UTF-8 sequence to be processed. If it + end: end of the UTF-8 sequence to be processed. If it gets equal to end during the extraction of a code point, an utf8::not_enough_room exception is thrown.

@@ -296,10 +350,11 @@ assert (w == twochars); Example of use:

-char* twochars = "\xe6\x97\xa5\xd1\x88";
-unsigned char* w = twochars;
-advance (w, 2, twochars + 6);
-assert (w == twochars + 5);
+char* twochars = "\xe6\x97\xa5\xd1\x88";
+unsigned char* w = twochars;
+advance (w, 2, twochars + 6);
+assert (w == twochars + 5);
 

This function works only "forward". In case of a negative n, there is @@ -316,23 +371,29 @@ assert (w == twochars + 5); Given the iterators to two UTF-8 encoded code points in a seqence, returns the number of code points between them.

- template <typename octet_iterator> typename - std::iterator_traits<octet_iterator>::difference_type distance (octet_iterator - first, octet_iterator last); +
+template <typename octet_iterator> 
+typename std::iterator_traits<octet_iterator>::difference_type distance (octet_iterator first, octet_iterator last);
+   
+

first: an iterator to a beginning of a UTF-8 encoded code point.
- last: an iterator to a "post-end" of the last UTF-8 encoded code point - in the sequence we are trying to determine the length. It can be the beginning of a - new code point, or not.
- Return value the distance between the iterators, in code points. + last: an iterator to a "post-end" of the last UTF-8 encoded code + point in the sequence we are trying to determine the length. It can be the + beginning of a new code point, or not.
+ Return value: the distance between the iterators, + in code points.

Example of use:

-char* twochars = "\xe6\x97\xa5\xd1\x88";
-size_t dist = utf8::distance(twochars, twochars + 5);
-assert (dist == 2);
+char* twochars = "\xe6\x97\xa5\xd1\x88";
+size_t dist = utf8::distance(twochars, twochars + 5);
+assert (dist == 2);
 

This function is used to find the length (in code points) of a UTF-8 encoded @@ -352,27 +413,35 @@ assert (dist == 2);

Converts a UTF-16 encoded string to UTF-8.

- template <typename u16bit_iterator, typename octet_iterator> - octet_iterator utf16to8 (u16bit_iterator start, u16bit_iterator end, octet_iterator - result); +
+template <typename u16bit_iterator, typename octet_iterator>
+octet_iterator utf16to8 (u16bit_iterator start, u16bit_iterator end, octet_iterator result);
+   
+

start: an iterator pointing to the beginning of the UTF-16 encoded string to convert.
- end: an iterator pointing to pass-the-end of the UTF-16 encoded string - to convert.
- result: an output iterator to the place in the UTF-8 string where to + end: an iterator pointing to pass-the-end of the UTF-16 encoded + string to convert.
+ result: an output iterator to the place in the UTF-8 string where to append the result of conversion.
- Return value: An iterator pointing to the place after the appended UTF-8 - string. + Return value: An iterator pointing to the place + after the appended UTF-8 string.

Example of use:

-unsigned short utf16string[] = {0x41, 0x0448, 0x65e5, 0xd834, 0xdd1e};
-vector<unsigned char> utf8result;
-utf16to8(utf16string, utf16string + 5, back_inserter(utf8result));
-assert (utf8result.size() == 10);    
+unsigned short utf16string[] = {0x41, 0x0448, 0x65e5, 0xd834, 0xdd1e};
+vector<unsigned char> utf8result;
+utf16to8(utf16string, utf16string + 5, back_inserter(utf8result));
+assert (utf8result.size() == 10);    
 

In case of invalid UTF-16 sequence, a utf8::invalid_utf16 exception is @@ -384,28 +453,35 @@ assert (utf8result.size() == 10);

Converts an UTF-8 encoded string to UTF-16

- template <typename u16bit_iterator, typename octet_iterator> - u16bit_iterator utf8to16 (octet_iterator start, octet_iterator end, u16bit_iterator - result); +
+template <typename u16bit_iterator, typename octet_iterator>
+u16bit_iterator utf8to16 (octet_iterator start, octet_iterator end, u16bit_iterator result);
+   
+

start: an iterator pointing to the beginning of the UTF-8 encoded string to convert. < br /> end: an iterator pointing to pass-the-end of the UTF-8 encoded string to convert.
- result: an output iterator to the place in the UTF-16 string where to + result: an output iterator to the place in the UTF-16 string where to append the result of conversion.
- Return value: An iterator pointing to the place after the appended UTF-16 - string. + Return value: An iterator pointing to the place + after the appended UTF-16 string.

Example of use:

-char utf8_with_surrogates[] = "\xe6\x97\xa5\xd1\x88\xf0\x9d\x84\x9e";
-vector <unsigned short> utf16result;
-utf8to16(utf8_with_surrogates, utf8_with_surrogates + 9, back_inserter(utf16result));
-assert (utf16result.size() == 4);
-assert (utf16result[2] == 0xd834);
-assert (utf16result[3] == 0xdd1e);
+char utf8_with_surrogates[] = "\xe6\x97\xa5\xd1\x88\xf0\x9d\x84\x9e";
+vector <unsigned short> utf16result;
+utf8to16(utf8_with_surrogates, utf8_with_surrogates + 9, back_inserter(utf16result));
+assert (utf16result.size() == 4);
+assert (utf16result[2] == 0xd834);
+assert (utf16result[3] == 0xdd1e);
 

In case of an invalid UTF-8 seqence, a utf8::invalid_utf8 exception is @@ -418,27 +494,33 @@ assert (utf16result[3] == 0xdd1e);

Converts a UTF-32 encoded string to UTF-8.

- template <typename octet_iterator, typename u32bit_iterator> - octet_iterator utf32to8 (u32bit_iterator start, u32bit_iterator end, octet_iterator - result); +
+template <typename octet_iterator, typename u32bit_iterator>
+octet_iterator utf32to8 (u32bit_iterator start, u32bit_iterator end, octet_iterator result);
+   
+

start: an iterator pointing to the beginning of the UTF-32 encoded string to convert.
- end: an iterator pointing to pass-the-end of the UTF-32 encoded string - to convert.
- result: an output iterator to the place in the UTF-8 string where to + end: an iterator pointing to pass-the-end of the UTF-32 encoded + string to convert.
+ result: an output iterator to the place in the UTF-8 string where to append the result of conversion.
- Return value: An iterator pointing to the place after the appended UTF-8 - string. + Return value: An iterator pointing to the place + after the appended UTF-8 string.

Example of use:

-int utf32string[] = {0x448, 0x65E5, 0x10346, 0};
-vector<unsigned char> utf8result;
-utf32to8(utf32string, utf32string + 3, back_inserter(utf8result));
-assert (utf8result.size() == 9);
+int utf32string[] = {0x448, 0x65E5, 0x10346, 0};
+vector<unsigned char> utf8result;
+utf32to8(utf32string, utf32string + 3, back_inserter(utf8result));
+assert (utf8result.size() == 9);
 

In case of invalid UTF-32 string, a utf8::invalid_code_point exception @@ -450,27 +532,33 @@ assert (utf8result.size() == 9);

Converts a UTF-8 encoded string to UTF-32.

- template <typename octet_iterator, typename u32bit_iterator> - u32bit_iterator utf8to32 (octet_iterator start, octet_iterator end, u32bit_iterator - result); +
+template <typename octet_iterator, typename u32bit_iterator>
+u32bit_iterator utf8to32 (octet_iterator start, octet_iterator end, u32bit_iterator result);
+   
+

start: an iterator pointing to the beginning of the UTF-8 encoded string to convert.
- end: an iterator pointing to pass-the-end of the UTF-8 encoded string + end: an iterator pointing to pass-the-end of the UTF-8 encoded string to convert.
- result: an output iterator to the place in the UTF-32 string where to + result: an output iterator to the place in the UTF-32 string where to append the result of conversion.
- Return value: An iterator pointing to the place after the appended UTF-32 - string. + Return value: An iterator pointing to the place + after the appended UTF-32 string.

Example of use:

-char* twochars = "\xe6\x97\xa5\xd1\x88";
-vector<int> utf32result;
-utf8to32(twochars, twochars + 5, back_inserter(utf32result));
-assert (utf32result.size() == 2);
+char* twochars = "\xe6\x97\xa5\xd1\x88";
+vector<int> utf32result;
+utf8to32(twochars, twochars + 5, back_inserter(utf32result));
+assert (utf32result.size() == 2);
 

In case of an invalid UTF-8 seqence, a utf8::invalid_utf8 exception is @@ -483,23 +571,30 @@ assert (utf32result.size() == 2);

Detects an invalid sequence within a UTF-8 string.

- template <typename octet_iterator> octet_iterator - find_invalid(octet_iterator start, octet_iterator end); +
+template <typename octet_iterator> 
+octet_iterator find_invalid(octet_iterator start, octet_iterator end);
+

start: an iterator pointing to the beginning of the UTF-8 string to test for validity.
- end: an iterator pointing to pass-the-end of the UTF-8 string to test + end: an iterator pointing to pass-the-end of the UTF-8 string to test for validity.
- Return value: an iterator pointing to the first invalid octet in the UTF-8 - string. In case none were found, equals end. + Return value: an iterator pointing to the first + invalid octet in the UTF-8 string. In case none were found, equals + end.

Example of use:

-char utf_invalid[] = "\xe6\x97\xa5\xd1\x88\xfa";
-char* invalid = find_invalid(utf_invalid, utf_invalid + 6);
-assert (invalid == utf_invalid + 5);
+char utf_invalid[] = "\xe6\x97\xa5\xd1\x88\xfa";
+char* invalid = find_invalid(utf_invalid, utf_invalid + 6);
+assert (invalid == utf_invalid + 5);
 

This function is typically used to make sure a UTF-8 string is valid before @@ -512,20 +607,26 @@ assert (invalid == utf_invalid + 5);

Checks whether a sequence of octets is a valid UTF-8 string.

- template <typename octet_iterator> bool is_valid(octet_iterator start, - octet_iterator end); +
+template <typename octet_iterator> 
+bool is_valid(octet_iterator start, octet_iterator end);
+   
+

start: an iterator pointing to the beginning of the UTF-8 string to test for validity.
- end: an iterator pointing to pass-the-end of the UTF-8 string to test + end: an iterator pointing to pass-the-end of the UTF-8 string to test for validity.
- Return value: true if the sequence is a valid UTF-8 string; - false if not. + Return value: true if the sequence + is a valid UTF-8 string; false if not.

Example of use:
-char utf_invalid[] = "\xe6\x97\xa5\xd1\x88\xfa";
-bool bvalid = is_valid(utf_invalid, utf_invalid + 6);
+char utf_invalid[] = "\xe6\x97\xa5\xd1\x88\xfa";
+bool bvalid = is_valid(utf_invalid, utf_invalid + 6);
 assert (bvalid == false);
 

@@ -539,38 +640,42 @@ assert (bvalid == false);

Replaces all invalid UTF-8 sequences within a string with a replacement marker.

-

- template <typename octet_iterator, typename output_iterator> - output_iterator replace_invalid(octet_iterator start, octet_iterator end, - output_iterator out, uint32_t replacement); -

-

- template <typename octet_iterator, typename output_iterator> - output_iterator replace_invalid(octet_iterator start, octet_iterator end, - output_iterator out); -

+
+template <typename octet_iterator, typename output_iterator>
+output_iterator replace_invalid(octet_iterator start, octet_iterator end, output_iterator out, uint32_t replacement);
+template <typename octet_iterator, typename output_iterator>
+output_iterator replace_invalid(octet_iterator start, octet_iterator end, output_iterator out);
+   
+

start: an iterator pointing to the beginning of the UTF-8 string to look for invalid UTF-8 sequences.
- end: an iterator pointing to pass-the-end of the UTF-8 string to look + end: an iterator pointing to pass-the-end of the UTF-8 string to look for invalid UTF-8 sequences.
- out: An output iterator to the range where the result of replacement + out: An output iterator to the range where the result of replacement is stored.
- replacement: A Unicode code point for the replacement marker. The + replacement: A Unicode code point for the replacement marker. The version without this parameter assumes the value 0xfffd
- Return value: An iterator pointing to the place after the UTF-8 string with - replaced invalid sequences. + Return value: An iterator pointing to the place + after the UTF-8 string with replaced invalid sequences.

Example of use:

-char invalid_sequence[] = "a\x80\xe0\xa0\xc0\xaf\xed\xa0\x80z";
-vector<char> replace_invalid_result;
-replace_invalid (invalid_sequence, invalid_sequence + sizeof(invalid_sequence), back_inserter(replace_invalid_result), '?');
+char invalid_sequence[] = "a\x80\xe0\xa0\xc0\xaf\xed\xa0\x80z";
+vector<char> replace_invalid_result;
+replace_invalid (invalid_sequence, invalid_sequence + sizeof(invalid_sequence), back_inserter(replace_invalid_result), '?');
 bvalid = is_valid(replace_invalid_result.begin(), replace_invalid_result.end());
 assert (bvalid);
-char* fixed_invalid_sequence = "a????z";
+char* fixed_invalid_sequence = "a????z";
 assert (std::equal(replace_invalid_result.begin(), replace_invalid_result.end(), fixed_invalid_sequence));
 

@@ -589,20 +694,25 @@ assert (std::equal(replace_invalid_result.begin(), replace_invalid_result.end(),

Checks whether a sequence of three octets is a UTF-8 byte order mark (BOM)

- template <typename octet_iterator> bool is_bom (octet_iterator - it); +
+template <typename octet_iterator> 
+bool is_bom (octet_iterator it);
+

it: beginning of the 3-octet sequence to check
- Return value: true if the sequence is UTF-8 byte order mark; - false if not. + Return value: true if the sequence + is UTF-8 byte order mark; false if not.

Example of use:

-unsigned char byte_order_mark[] = {0xef, 0xbb, 0xbf};
-bool bbom = is_bom(byte_order_mark);
-assert (bbom == true);
+unsigned char byte_order_mark[] = {0xef, 0xbb, 0xbf};
+bool bbom = is_bom(byte_order_mark);
+assert (bbom == true);
 

The typical use of this function is to check the first three bytes of a file. If @@ -619,23 +729,35 @@ assert (bbom == true); Encodes a 32 bit code point as a UTF-8 sequence of octets and appends the sequence to a UTF-8 string.

- template <typename octet_iterator> octet_iterator append(uint32_t cp, - octet_iterator result); +
+template <typename octet_iterator>
+octet_iterator append(uint32_t cp, octet_iterator result);
+   
+

cp: A 32 bit integer representing a code point to append to the sequence.
- result: An output iterator to the place in the sequence where to + result: An output iterator to the place in the sequence where to append the code point.
- Return value: An iterator pointing to the place after the newly appended - sequence. + Return value: An iterator pointing to the place + after the newly appended sequence.

Example of use:

-unsigned char u[5] = {0,0,0,0,0};
-unsigned char* end = unchecked::append(0x0448, u);
-assert (u[0] == 0xd1 && u[1] == 0x88 && u[2] == 0 && u[3] == 0 && u[4] == 0);
+unsigned char u[5] = {0,0,0,0,0};
+unsigned char* end = unchecked::append(0x0448, u);
+assert (u[0] == 0xd1 && u[1] == 0x88 && u[2] == 0 && u[3] == 0 && u[4] == 0);
 

This is a quicker but less safe version of utf8::append. It does not @@ -649,23 +771,29 @@ assert (u[0] == 0xd1 && u[1] == 0x88 && u[2] == 0 && u[3 Given the iterator to the beginning of a UTF-8 sequence, it returns the code point and moves the iterator to the next position.

- template <typename octet_iterator> uint32_t next(octet_iterator& - it); +
+template <typename octet_iterator>
+uint32_t next(octet_iterator& it);
+   
+

it: a reference to an iterator pointing to the beginning of an UTF-8 encoded code point. After the function returns, it is incremented to point to the beginning of the next code point.
- Return value: the 32 bit representation of the processed UTF-8 code point. + Return value: the 32 bit representation of the + processed UTF-8 code point.

Example of use:

-char* twochars = "\xe6\x97\xa5\xd1\x88";
-char* w = twochars;
-int cp = unchecked::next(w);
-assert (cp == 0x65e5);
-assert (w == twochars + 3);
+char* twochars = "\xe6\x97\xa5\xd1\x88";
+char* w = twochars;
+int cp = unchecked::next(w);
+assert (cp == 0x65e5);
+assert (w == twochars + 3);
 

This is a quicker but less safe version of utf8::next. It does not @@ -679,22 +807,28 @@ assert (w == twochars + 3); decreases the iterator until it hits the beginning of the previous UTF-8 encoded code point and returns the 32 bits representation of the code point.

- template <typename octet_iterator> uint32_t previous(octet_iterator& - it); +
+template <typename octet_iterator>
+uint32_t previous(octet_iterator& it);
+   
+

it: a reference pointing to an octet within a UTF-8 encoded string. After the function returns, it is decremented to point to the beginning of the previous code point.
- Return value: the 32 bit representation of the previous code point. + Return value: the 32 bit representation of the + previous code point.

Example of use:

-char* twochars = "\xe6\x97\xa5\xd1\x88";
-char* w = twochars + 3;
-int cp = unchecked::previous (w);
-assert (cp == 0x65e5);
+char* twochars = "\xe6\x97\xa5\xd1\x88";
+char* w = twochars + 3;
+int cp = unchecked::previous (w);
+assert (cp == 0x65e5);
 assert (w == twochars);
 

@@ -708,23 +842,28 @@ assert (w == twochars); Advances an iterator by the specified number of code points within an UTF-8 sequence.

- template <typename octet_iterator, typename distance_type> void advance - (octet_iterator& it, distance_type n); +
+template <typename octet_iterator, typename distance_type>
+void advance (octet_iterator& it, distance_type n);
+   
+

it: a reference to an iterator pointing to the beginning of an UTF-8 encoded code point. After the function returns, it is incremented to point to the nth following code point.
- n: a positive integer that shows how many code points we want to + n: a positive integer that shows how many code points we want to advance.

Example of use:

-char* twochars = "\xe6\x97\xa5\xd1\x88";
-char* w = twochars;
-unchecked::advance (w, 2);
-assert (w == twochars + 5);
+char* twochars = "\xe6\x97\xa5\xd1\x88";
+char* w = twochars;
+unchecked::advance (w, 2);
+assert (w == twochars + 5);
 

This function works only "forward". In case of a negative n, there is @@ -741,23 +880,29 @@ assert (w == twochars + 5); Given the iterators to two UTF-8 encoded code points in a seqence, returns the number of code points between them.

- template <typename octet_iterator> typename - std::iterator_traits<octet_iterator>::difference_type distance (octet_iterator - first, octet_iterator last); +
+template <typename octet_iterator>
+typename std::iterator_traits<octet_iterator>::difference_type distance (octet_iterator first, octet_iterator last);
+

first: an iterator to a beginning of a UTF-8 encoded code point.
- last: an iterator to a "post-end" of the last UTF-8 encoded code point - in the sequence we are trying to determine the length. It can be the beginning of a - new code point, or not.
- Return value the distance between the iterators, in code points. + last: an iterator to a "post-end" of the last UTF-8 encoded code + point in the sequence we are trying to determine the length. It can be the + beginning of a new code point, or not.
+ Return value: the distance between the iterators, + in code points.

Example of use:

-char* twochars = "\xe6\x97\xa5\xd1\x88";
-size_t dist = utf8::unchecked::distance(twochars, twochars + 5);
-assert (dist == 2);
+char* twochars = "\xe6\x97\xa5\xd1\x88";
+size_t dist = utf8::unchecked::distance(twochars, twochars + 5);
+assert (dist == 2);
 

This is a quicker but less safe version of utf8::distance. It does not @@ -769,26 +914,35 @@ assert (dist == 2);

Converts a UTF-16 encoded string to UTF-8.

- template <typename u16bit_iterator, typename octet_iterator> - octet_iterator utf16to8 (u16bit_iterator start, u16bit_iterator end, octet_iterator - result); +
+template <typename u16bit_iterator, typename octet_iterator>
+octet_iterator utf16to8 (u16bit_iterator start, u16bit_iterator end, octet_iterator result);
+   
+

start: an iterator pointing to the beginning of the UTF-16 encoded string to convert.
- end: an iterator pointing to pass-the-end of the UTF-16 encoded string - to convert.
- result: an output iterator to the place in the UTF-8 string where to - append the result of conversion. Return value: An iterator pointing to the - place after the appended UTF-8 string. + end: an iterator pointing to pass-the-end of the UTF-16 encoded + string to convert.
+ result: an output iterator to the place in the UTF-8 string where to + append the result of conversion.
+ Return value: An iterator pointing to the place + after the appended UTF-8 string.

Example of use:

-unsigned short utf16string[] = {0x41, 0x0448, 0x65e5, 0xd834, 0xdd1e};
-vector<unsigned char> utf8result;
-unchecked::utf16to8(utf16string, utf16string + 5, back_inserter(utf8result));
-assert (utf8result.size() == 10);    
+unsigned short utf16string[] = {0x41, 0x0448, 0x65e5, 0xd834, 0xdd1e};
+vector<unsigned char> utf8result;
+unchecked::utf16to8(utf16string, utf16string + 5, back_inserter(utf8result));
+assert (utf8result.size() == 10);    
 

This is a quicker but less safe version of utf8::utf16to8. It does not @@ -800,28 +954,35 @@ assert (utf8result.size() == 10);

Converts an UTF-8 encoded string to UTF-16

- template <typename u16bit_iterator, typename octet_iterator> - u16bit_iterator utf8to16 (octet_iterator start, octet_iterator end, u16bit_iterator - result); +
+template <typename u16bit_iterator, typename octet_iterator>
+u16bit_iterator utf8to16 (octet_iterator start, octet_iterator end, u16bit_iterator result);
+   
+

start: an iterator pointing to the beginning of the UTF-8 encoded string to convert. < br /> end: an iterator pointing to pass-the-end of the UTF-8 encoded string to convert.
- result: an output iterator to the place in the UTF-16 string where to + result: an output iterator to the place in the UTF-16 string where to append the result of conversion.
- Return value: An iterator pointing to the place after the appended UTF-16 - string. + Return value: An iterator pointing to the place + after the appended UTF-16 string.

Example of use:

-char utf8_with_surrogates[] = "\xe6\x97\xa5\xd1\x88\xf0\x9d\x84\x9e";
-vector <unsigned short> utf16result;
-unchecked::utf8to16(utf8_with_surrogates, utf8_with_surrogates + 9, back_inserter(utf16result));
-assert (utf16result.size() == 4);
-assert (utf16result[2] == 0xd834);
-assert (utf16result[3] == 0xdd1e);
+char utf8_with_surrogates[] = "\xe6\x97\xa5\xd1\x88\xf0\x9d\x84\x9e";
+vector <unsigned short> utf16result;
+unchecked::utf8to16(utf8_with_surrogates, utf8_with_surrogates + 9, back_inserter(utf16result));
+assert (utf16result.size() == 4);
+assert (utf16result[2] == 0xd834);
+assert (utf16result[3] == 0xdd1e);
 

This is a quicker but less safe version of utf8::utf8to16. It does not @@ -833,27 +994,34 @@ assert (utf16result[3] == 0xdd1e);

Converts a UTF-32 encoded string to UTF-8.

- template <typename octet_iterator, typename u32bit_iterator> - octet_iterator utf32to8 (u32bit_iterator start, u32bit_iterator end, octet_iterator - result); +
+template <typename octet_iterator, typename u32bit_iterator>
+octet_iterator utf32to8 (u32bit_iterator start, u32bit_iterator end, octet_iterator result);
+   
+

start: an iterator pointing to the beginning of the UTF-32 encoded string to convert.
- end: an iterator pointing to pass-the-end of the UTF-32 encoded string - to convert.
- result: an output iterator to the place in the UTF-8 string where to + end: an iterator pointing to pass-the-end of the UTF-32 encoded + string to convert.
+ result: an output iterator to the place in the UTF-8 string where to append the result of conversion.
- Return value: An iterator pointing to the place after the appended UTF-8 - string. + Return value: An iterator pointing to the place + after the appended UTF-8 string.

Example of use:

-int utf32string[] = {0x448, 0x65E5, 0x10346, 0};
-vector<unsigned char> utf8result;
-utf32to8(utf32string, utf32string + 3, back_inserter(utf8result));
-assert (utf8result.size() == 9);
+int utf32string[] = {0x448, 0x65e5, 0x10346, 0};
+vector<unsigned char> utf8result;
+utf32to8(utf32string, utf32string + 3, back_inserter(utf8result));
+assert (utf8result.size() == 9);
 

This is a quicker but less safe version of utf8::utf32to8. It does not @@ -865,27 +1033,32 @@ assert (utf8result.size() == 9);

Converts a UTF-8 encoded string to UTF-32.

- template <typename octet_iterator, typename u32bit_iterator> - u32bit_iterator utf8to32 (octet_iterator start, octet_iterator end, u32bit_iterator - result); +
+template <typename octet_iterator, typename u32bit_iterator>
+u32bit_iterator utf8to32 (octet_iterator start, octet_iterator end, u32bit_iterator result);
+   
+

start: an iterator pointing to the beginning of the UTF-8 encoded string to convert.
- end: an iterator pointing to pass-the-end of the UTF-8 encoded string + end: an iterator pointing to past-the-end of the UTF-8 encoded string to convert.
- result: an output iterator to the place in the UTF-32 string where to + result: an output iterator to the place in the UTF-32 string where to append the result of conversion.
- Return value: An iterator pointing to the place after the appended UTF-32 - string. + Return value: An iterator pointing to the place + after the appended UTF-32 string.

Example of use:

-char* twochars = "\xe6\x97\xa5\xd1\x88";
-vector<int> utf32result;
-unchecked::utf8to32(twochars, twochars + 5, back_inserter(utf32result));
-assert (utf32result.size() == 2);
+char* twochars = "\xe6\x97\xa5\xd1\x88";
+vector<int> utf32result;
+unchecked::utf8to32(twochars, twochars + 5, back_inserter(utf32result));
+assert (utf32result.size() == 2);
 

This is a quicker but less safe version of utf8::utf8to32. It does not diff --git a/source/utf8/checked.h b/source/utf8/checked.h index 980be27..4647016 100644 --- a/source/utf8/checked.h +++ b/source/utf8/checked.h @@ -29,7 +29,7 @@ DEALINGS IN THE SOFTWARE. #define UTF8_FOR_CPP_CHECKED_H_2675DCD0_9480_4c0c_B92A_CC14C027B731 #include "core.h" -#include +#include namespace utf8 { @@ -152,7 +152,18 @@ namespace utf8 return cp; } + template + uint32_t prior(octet_iterator& it, octet_iterator start) + { + octet_iterator end = it; + while (internal::is_trail(*(--it))) + if (it < start) + throw invalid_utf8(*it); // error - no lead byte in the sequence + octet_iterator temp = it; + return next(temp, end); + } + /// Deprecated in versions that include "prior" template uint32_t previous(octet_iterator& it, octet_iterator pass_start) { @@ -240,37 +251,50 @@ namespace utf8 // The iterator class template class iterator { - static const typename std::iterator_traits::difference_type MAX_UTF8_SEQUENCE_LENGTH = 4; octet_iterator it; + octet_iterator range_start; + octet_iterator range_end; public: - explicit iterator (const octet_iterator& octet_it) : it(octet_it) {} + explicit iterator (const octet_iterator& octet_it, + const octet_iterator& range_start, + const octet_iterator& range_end) : + it(octet_it), range_start(range_start), range_end(range_end) + { + if (it < range_start || it > range_end) + throw std::out_of_range("Invalid utf-8 iterator position"); + } // the default "big three" are OK uint32_t operator * () const { octet_iterator temp = it; - return next(temp, temp + MAX_UTF8_SEQUENCE_LENGTH); + return next(temp, range_end); + } + bool operator == (const iterator& rhs) const + { + if (range_start != rhs.range_start && range_end != rhs.range_end) + throw std::logic_error("Comparing utf-8 iterators defined with different ranges"); + return (it == rhs.it); } - bool operator == (const iterator& rhs) const { return (it == rhs.it); } iterator& operator ++ () { - next(it, 
it + MAX_UTF8_SEQUENCE_LENGTH); + next(it, range_end); return *this; } iterator operator ++ (int) { iterator temp = *this; - next(it, it + MAX_UTF8_SEQUENCE_LENGTH); + next(it, range_end); return temp; } iterator& operator -- () { - previous(it, it - MAX_UTF8_SEQUENCE_LENGTH); + prior(it, range_start); return *this; } iterator operator -- (int) { iterator temp = *this; - previous(it, it - MAX_UTF8_SEQUENCE_LENGTH); + prior(it, range_start); return temp; } }; // class iterator diff --git a/source/utf8/unchecked.h b/source/utf8/unchecked.h index 75c882d..ac019d9 100644 --- a/source/utf8/unchecked.h +++ b/source/utf8/unchecked.h @@ -88,13 +88,20 @@ namespace utf8 } template - uint32_t previous(octet_iterator& it) + uint32_t prior(octet_iterator& it) { while (internal::is_trail(*(--it))) ; octet_iterator temp = it; return next(temp); } + // Deprecated in versions that include prior, but only for the sake of consistency (see utf8::previous) + template + inline uint32_t previous(octet_iterator& it) + { + return prior(it); + } + template void advance (octet_iterator& it, distance_type n) { diff --git a/test_drivers/smoke_test/test.cpp b/test_drivers/smoke_test/test.cpp index 77154ed..880aca3 100644 --- a/test_drivers/smoke_test/test.cpp +++ b/test_drivers/smoke_test/test.cpp @@ -22,6 +22,8 @@ int main() end = append(0x10346, u); assert (u[0] == 0xf0 && u[1] == 0x90 && u[2] == 0x8d && u[3] == 0x86 && u[4] == 0); + + //next char* twochars = "\xe6\x97\xa5\xd1\x88"; char* w = twochars; @@ -41,8 +43,24 @@ int main() assert (cp == 0x0448); assert (w == threechars + 9); + //prior + w = twochars + 3; + cp = prior (w, twochars); + assert (cp == 0x65e5); + assert (w == twochars); - //previous + w = threechars + 9; + cp = prior(w, threechars); + assert (cp == 0x0448); + assert (w == threechars + 7); + cp = prior(w, threechars); + assert (cp == 0x65e5); + assert (w == threechars + 4); + cp = prior(w, threechars); + assert (cp == 0x10346); + assert (w == threechars); + + //previous 
(deprecated) w = twochars + 3; cp = previous (w, twochars - 1); assert (cp == 0x65e5); @@ -131,19 +149,19 @@ int main() assert (std::equal(replace_invalid_result.begin(), replace_invalid_result.end(), fixed_invalid_sequence)); // iterator - utf8::iterator it(threechars); + utf8::iterator it(threechars, threechars, threechars + 9); utf8::iterator it2 = it; assert (it2 == it); assert (*it == 0x10346); assert (*(++it) == 0x65e5); assert ((*it++) == 0x65e5); assert (*it == 0x0448); - utf8::iterator endit (threechars + 9); + utf8::iterator endit (threechars + 9, threechars, threechars + 9); assert (++it == endit); assert (*(--it) == 0x0448); assert ((*it--) == 0x0448); assert (*it == 0x65e5); - assert (--it == utf8::iterator(threechars)); + assert (--it == utf8::iterator(threechars, threechars, threechars + 9)); assert (*it == 0x10346); ////////////////////////////////////////////////////////// @@ -179,7 +197,8 @@ int main() assert (w == threechars + 9); - //previous + //previous (calls prior internally) + w = twochars + 3; cp = unchecked::previous (w); assert (cp == 0x65e5);