diff --git a/boost/boost/utf8.hpp b/boost/boost/utf8.hpp new file mode 100644 index 0000000..2cf4484 --- /dev/null +++ b/boost/boost/utf8.hpp @@ -0,0 +1,530 @@ +// utf8.hpp header file + +/* +Copyright 2006 Nemanja Trifunovic + +Distributed under the Boost Software License, Version 1.0. (See +accompanying file LICENSE_1_0.txt or copy at +http://www.boost.org/LICENSE_1_0.txt) +*/ + + +#ifndef UTF8_FOR_CPP_2675DCD0_9480_4c0c_B92A_CC14C027B731 +#define UTF8_FOR_CPP_2675DCD0_9480_4c0c_B92A_CC14C027B731 + +#include + +#include + +namespace boost { + +namespace utf8 { + + // Exceptions that may be thrown from the library functions. + class invalid_code_point : public std::exception { + uint32_t cp; + public: + invalid_code_point(uint32_t cp) : cp(cp) {} + virtual const char* what() const throw() { return "Invalid code point"; } + uint32_t code_point() const {return cp;} + }; + + class invalid_utf8 : public std::exception { + uint8_t u8; + public: + invalid_utf8 (uint8_t u) : u8(u) {} + virtual const char* what() const throw() { return "Invalid UTF-8"; } + uint8_t utf8_octet() const {return u8;} + }; + + class invalid_utf16 : public std::exception { + uint16_t u16; + public: + invalid_utf16 (uint16_t u) : u16(u) {} + virtual const char* what() const throw() { return "Invalid UTF-16"; } + uint16_t utf16_word() const {return u16;} + }; + + class not_enough_room : public std::exception { + public: + virtual const char* what() const throw() { return "Not enough space"; } + }; + + +// Helper code - not intended to be directly called by the library users. 
May be changed at any time +namespace internal +{ + // Unicode constants + // Leading (high) surrogates: 0xd800 - 0xdbff + // Trailing (low) surrogates: 0xdc00 - 0xdfff + const uint16_t LEAD_SURROGATE_MIN = 0xd800u; + const uint16_t LEAD_SURROGATE_MAX = 0xdbffu; + const uint16_t TRAIL_SURROGATE_MIN = 0xdc00u; + const uint16_t TRAIL_SURROGATE_MAX = 0xdfffu; + const uint16_t LEAD_OFFSET = LEAD_SURROGATE_MIN - (0x10000 >> 10); + const uint32_t SURROGATE_OFFSET = 0x10000u - (LEAD_SURROGATE_MIN << 10) - TRAIL_SURROGATE_MIN; + + // Maximum valid value for a Unicode code point + const uint32_t CODE_POINT_MAX = 0x0010ffffu; + + template + inline uint8_t mask8(octet_type oc) + { + return static_cast(0xff & oc); + } + template + inline uint16_t mask16(u16_type oc) + { + return static_cast(0xffff & oc); + } + template + inline bool is_trail(octet_type oc) + { + return ((mask8(oc) >> 6) == 0x2); + } + + template + inline bool is_surrogate(u16 cp) + { + return (cp >= LEAD_SURROGATE_MIN && cp <= TRAIL_SURROGATE_MAX); + } + + template + inline bool is_code_point_valid(u32 cp) + { + return (cp <= CODE_POINT_MAX && !is_surrogate(cp) && cp != 0xfffe && cp != 0xffff); + } + + template + inline typename std::iterator_traits::difference_type + sequence_length(octet_iterator lead_it) + { + uint8_t lead = mask8(*lead_it); + if (lead < 0x80) + return 1; + else if ((lead >> 5) == 0x6) + return 2; + else if ((lead >> 4) == 0xe) + return 3; + else if ((lead >> 3) == 0x1e) + return 4; + else + return 0; + } + + enum utf_error {OK, NOT_ENOUGH_ROOM, INVALID_LEAD, INCOMPLETE_SEQUENCE, OVERLONG_SEQUENCE, INVALID_CODE_POINT}; + + template + utf_error validate_next(octet_iterator& it, octet_iterator end, uint32_t* code_point = 0) + { + uint32_t cp = mask8(*it); + // Check the lead octet + typedef typename std::iterator_traits::difference_type octet_difference_type; + octet_difference_type length = sequence_length(it); + + // "Shortcut" for ASCII characters + if (length == 1) { + if (end - it > 0) { 
+ if (code_point) + *code_point = cp; + ++it; + return OK; + } + else + return NOT_ENOUGH_ROOM; + } + + // Do we have enough memory? + if (end - it < length) + return NOT_ENOUGH_ROOM; + + // Check trail octets and calculate the code point + switch (length) { + case 0: + return INVALID_LEAD; + break; + case 2: + if (is_trail(*(++it))) { + cp = ((cp << 6) & 0x7ff) + ((*it) & 0x3f); + } + else { + --it; + return INCOMPLETE_SEQUENCE; + } + break; + case 3: + if (is_trail(*(++it))) { + cp = ((cp << 12) & 0xffff) + ((mask8(*it) << 6) & 0xfff); + if (is_trail(*(++it))) { + cp += (*it) & 0x3f; + } + else { + --it; --it; + return INCOMPLETE_SEQUENCE; + } + } + else { + --it; + return INCOMPLETE_SEQUENCE; + } + break; + case 4: + if (is_trail(*(++it))) { + cp = ((cp << 18) & 0x1fffff) + ((mask8(*it) << 12) & 0x3ffff); + if (is_trail(*(++it))) { + cp += (mask8(*it) << 6) & 0xfff; + if (is_trail(*(++it))) { + cp += (*it) & 0x3f; + } + else { + --it; --it; --it; + return INCOMPLETE_SEQUENCE; + } + } + else { + --it; --it; + return INCOMPLETE_SEQUENCE; + } + } + else { + --it; + return INCOMPLETE_SEQUENCE; + } + break; + } + // Is the code point valid? 
+ if (!is_code_point_valid(cp)) { + for (octet_difference_type i = 0; i < length - 1; ++i) + --it; + return INVALID_CODE_POINT; + } + + if (code_point) + *code_point = cp; + + if (cp < 0x80) { + if (length != 1) { + for (octet_difference_type i = 0; i < length - 1; ++i) + --it; + return OVERLONG_SEQUENCE; + } + } + else if (cp < 0x800) { + if (length != 2) { + for (octet_difference_type i = 0; i < length - 1; ++i) + --it; + return OVERLONG_SEQUENCE; + } + } + else if (cp < 0x10000) { + if (length != 3) { + for (octet_difference_type i = 0; i < length - 1; ++i) + --it; + return OVERLONG_SEQUENCE; + } + } + + ++it; + return OK; + } + +} // namespace internal + + /// The library API - functions intended to be called by the users + + // Byte order mark + const uint8_t bom[] = {0xef, 0xbb, 0xbf}; + + template + octet_iterator find_invalid(octet_iterator start, octet_iterator end) + { + octet_iterator result = start; + while (result != end) { + internal::utf_error err_code = internal::validate_next(result, end); + if (err_code != internal::OK) + return result; + } + return result; + } + + template + bool is_valid(octet_iterator start, octet_iterator end) + { + return (find_invalid(start, end) == end); + } + + template + bool is_bom (octet_iterator it) + { + return ( + (internal::mask8(*it++)) == bom[0] && + (internal::mask8(*it++)) == bom[1] && + (internal::mask8(*it)) == bom[2] + ); + } + template + octet_iterator append(uint32_t cp, octet_iterator result) + { + if (!internal::is_code_point_valid(cp)) + throw invalid_code_point(cp); + + if (cp < 0x80) // one octet + *(result++) = static_cast(cp); + else if (cp < 0x800) { // two octets + *(result++) = static_cast((cp >> 6) | 0xc0); + *(result++) = static_cast((cp & 0x3f) | 0x80); + } + else if (cp < 0x10000) { // three octets + *(result++) = static_cast((cp >> 12) | 0xe0); + *(result++) = static_cast((cp >> 6) & 0x3f | 0x80); + *(result++) = static_cast((cp & 0x3f) | 0x80); + } + else if (cp <= internal::CODE_POINT_MAX) 
{ // four octets + *(result++) = static_cast((cp >> 18) | 0xf0); + *(result++) = static_cast((cp >> 12)& 0x3f | 0x80); + *(result++) = static_cast((cp >> 6) & 0x3f | 0x80); + *(result++) = static_cast((cp & 0x3f) | 0x80); + } + else + throw invalid_code_point(cp); + + return result; + } + + template + uint32_t next(octet_iterator& it, octet_iterator end) + { + uint32_t cp = 0; + internal::utf_error err_code = internal::validate_next(it, end, &cp); + switch (err_code) { + case internal::OK : + break; + case internal::NOT_ENOUGH_ROOM : + throw not_enough_room(); + case internal::INVALID_LEAD : + case internal::INCOMPLETE_SEQUENCE : + case internal::OVERLONG_SEQUENCE : + throw invalid_utf8(*it); + case internal::INVALID_CODE_POINT : + throw invalid_code_point(cp); + } + return cp; + } + + template + uint32_t prior(octet_iterator& it, octet_iterator start) + { + octet_iterator end = it; + while (internal::is_trail(*(--it))) + if (it < start) + throw invalid_utf8(*it); // error - no lead byte in the sequence + octet_iterator temp = it; + return next(temp, end); + } + + template + void advance (octet_iterator& it, distance_type n, octet_iterator end) + { + for (distance_type i = 0; i < n; ++i) + next(it, end); + } + + template + typename std::iterator_traits::difference_type + distance (octet_iterator first, octet_iterator last) + { + typename std::iterator_traits::difference_type dist; + for (dist = 0; first < last; ++dist) + next(first, last); + return dist; + } + + template + octet_iterator utf16to8 (u16bit_iterator start, u16bit_iterator end, octet_iterator result) + { + while (start != end) { + uint32_t cp = internal::mask16(*start++); + // Take care of surrogate pairs first + if (internal::is_surrogate(cp)) { + if (start != end) { + uint32_t trail_surrogate = internal::mask16(*start++); + if (trail_surrogate >= internal::TRAIL_SURROGATE_MIN && trail_surrogate <= internal::TRAIL_SURROGATE_MAX) + cp = (cp << 10) + trail_surrogate + internal::SURROGATE_OFFSET; + else 
+ throw invalid_utf16(static_cast(trail_surrogate)); + } + else + throw invalid_utf16(static_cast(*start)); + + } + result = append(cp, result); + } + return result; + } + + template + u16bit_iterator utf8to16 (octet_iterator start, octet_iterator end, u16bit_iterator result) + { + while (start != end) { + uint32_t cp = next(start, end); + if (cp > 0xffff) { //make a surrogate pair + *result++ = static_cast((cp >> 10) + internal::LEAD_OFFSET); + *result++ = static_cast((cp & 0x3ff) + internal::TRAIL_SURROGATE_MIN); + } + else + *result++ = static_cast(cp); + } + return result; + } + + template + octet_iterator utf32to8 (u32bit_iterator start, u32bit_iterator end, octet_iterator result) + { + while (start != end) + result = append(*(start++), result); + + return result; + } + + template + u32bit_iterator utf8to32 (octet_iterator start, octet_iterator end, u32bit_iterator result) + { + while (start < end) + (*result++) = next(start, end); + + return result; + } + + namespace unchecked + { + template + octet_iterator append(uint32_t cp, octet_iterator result) + { + if (cp < 0x80) // one octet + *(result++) = static_cast(cp); + else if (cp < 0x800) { // two octets + *(result++) = static_cast((cp >> 6) | 0xc0); + *(result++) = static_cast((cp & 0x3f) | 0x80); + } + else if (cp < 0x10000) { // three octets + *(result++) = static_cast((cp >> 12) | 0xe0); + *(result++) = static_cast((cp >> 6) & 0x3f | 0x80); + *(result++) = static_cast((cp & 0x3f) | 0x80); + } + else { // four octets + *(result++) = static_cast((cp >> 18) | 0xf0); + *(result++) = static_cast((cp >> 12)& 0x3f | 0x80); + *(result++) = static_cast((cp >> 6) & 0x3f | 0x80); + *(result++) = static_cast((cp & 0x3f) | 0x80); + } + return result; + } + template + uint32_t next(octet_iterator& it) + { + uint32_t cp = internal::mask8(*it); + typename std::iterator_traits::difference_type length = utf8::internal::sequence_length(it); + switch (length) { + case 1: + break; + case 2: + it++; + cp = ((cp << 6) & 0x7ff) 
+ ((*it) & 0x3f); + break; + case 3: + ++it; + cp = ((cp << 12) & 0xffff) + ((internal::mask8(*it) << 6) & 0xfff); + ++it; + cp += (*it) & 0x3f; + break; + case 4: + ++it; + cp = ((cp << 18) & 0x1fffff) + ((internal::mask8(*it) << 12) & 0x3ffff); + ++it; + cp += (internal::mask8(*it) << 6) & 0xfff; + ++it; + cp += (*it) & 0x3f; + break; + } + ++it; + return cp; + } + + template + uint32_t prior(octet_iterator& it) + { + while (internal::is_trail(*(--it))) ; + octet_iterator temp = it; + return next(temp); + } + + template + void advance (octet_iterator& it, distance_type n) + { + for (distance_type i = 0; i < n; ++i) + next(it); + } + + template + typename std::iterator_traits::difference_type + distance (octet_iterator first, octet_iterator last) + { + typename std::iterator_traits::difference_type dist; + for (dist = 0; first < last; ++dist) + next(first); + return dist; + } + + template + octet_iterator utf16to8 (u16bit_iterator start, u16bit_iterator end, octet_iterator result) + { + while (start != end) { + uint32_t cp = internal::mask16(*start++); + // Take care of surrogate pairs first + if (internal::is_surrogate(cp)) { + uint32_t trail_surrogate = internal::mask16(*start++); + cp = (cp << 10) + trail_surrogate + internal::SURROGATE_OFFSET; + } + result = append(cp, result); + } + return result; + } + + template + u16bit_iterator utf8to16 (octet_iterator start, octet_iterator end, u16bit_iterator result) + { + while (start != end) { + uint32_t cp = next(start); + if (cp > 0xffff) { //make a surrogate pair + *result++ = static_cast((cp >> 10) + internal::LEAD_OFFSET); + *result++ = static_cast((cp & 0x3ff) + internal::TRAIL_SURROGATE_MIN); + } + else + *result++ = static_cast(cp); + } + return result; + } + + template + octet_iterator utf32to8 (u32bit_iterator start, u32bit_iterator end, octet_iterator result) + { + while (start != end) + result = append(*(start++), result); + + return result; + } + + template + u32bit_iterator utf8to32 (octet_iterator 
start, octet_iterator end, u32bit_iterator result) + { + while (start < end) + (*result++) = next(start); + + return result; + } + + } // namespace utf8::unchecked +} // namespace utf8 +} // namespace boost + +#endif // header guard diff --git a/boost/libs/utf8/Jamfile b/boost/libs/utf8/Jamfile new file mode 100644 index 0000000..46d8e73 --- /dev/null +++ b/boost/libs/utf8/Jamfile @@ -0,0 +1,17 @@ +# Boost UTF8 Library test Jamfile + +# Copyright Nemanja Trifunovic 2006 + +# Use, modification, and distribution is subject to the Boost Software +# License, Version 1.0. (See accompanying file LICENSE_1_0.txt or copy at +# http://www.boost.org/LICENSE_1_0.txt) + + +# bring in rules for testing +import testing ; + +{ + test-suite "utf8" + : [ run test.cpp ] + ; +} diff --git a/boost/libs/utf8/index.html b/boost/libs/utf8/index.html new file mode 100644 index 0000000..fe6dee3 --- /dev/null +++ b/boost/libs/utf8/index.html @@ -0,0 +1,1102 @@ + + + + + + + + + Boost UTF8 + + + + +
+

+ Table of Contents +

+ +
+

+ Introduction +

+

+ Many C++ developers miss an easy and portable way of handling Unicode encoded + strings. C++ Standard is currently Unicode agnostic, and while some work is being + done to introduce Unicode to the next incarnation called C++0x, for the moment + nothing of the sort is available. In the meantime, developers use 3rd party + libraries like ICU, OS specific capabilities, or simply roll out their own + solutions. +

+

+ In order to easily handle UTF-8 encoded Unicode strings, I have come up with a set + of template functions. For anybody used to working with STL algorithms, they should be + easy and natural to use. The code is freely available for any purpose - check out + the license at the beginning of the utf8.hpp file. If you run into + bugs or performance issues, please let me know and I'll do my best to address them. +

+

+ The purpose of this article is not to offer an introduction to Unicode in general, + and UTF-8 in particular. If you are not familiar with Unicode, be sure to check out + Unicode Home Page or some other source of + information for Unicode. Also, it is not my aim to advocate the use of UTF-8 + encoded strings in C++ programs; if you want to handle UTF-8 encoded strings from + C++, I am sure you have good reasons for it. +

+

+ Examples of use +

+

+ To illustrate the use of Boost UTF8 library, we shall open a file containing UTF-8 + encoded text, check whether it starts with a byte order mark, read each line into a + std::string, check it for validity, convert the text to UTF-16, and + back to UTF-8: +

+
+#include <fstream>
+#include <iostream>
+#include <string>
+#include <vector>
+
+#include <boost/utf8.hpp>
+
+using namespace std;
+using namespace boost;
+
+// Sample driver: checks a UTF-8 text file line by line and round-trips each
+// line through UTF-16.
+//
+// argv[1] is the path of the file to read. For every line the program reports
+// the first invalid octet (if any), prints the length in code points of the
+// valid prefix, converts that prefix to UTF-16 and back to UTF-8, and reports
+// an error if the round trip does not reproduce the original octets.
+// Returns 0 in every case, including usage and file-open errors.
+int main(int argc, char** argv)
+{
+    if (argc != 2) {
+        cout << "\nUsage: docsample filename\n";
+        return 0;
+    }
+    const char* test_file_path = argv[1];
+    // Open the test file (must be UTF-8 encoded)
+    ifstream fs8(test_file_path);
+    if (!fs8.is_open()) {
+        cout << "Could not open " << test_file_path << endl;
+        return 0;
+    }
+    // Read the first line of the file
+    unsigned line_count = 1;
+    string line;
+    if (!getline(fs8, line))
+        return 0;
+    // Look for utf-8 byte-order mark at the beginning
+    if (line.size() > 2) {
+        if (utf8::is_bom(line.c_str()))
+            cout << "There is a byte order mark at the beginning of the file\n";
+    }
+    // Play with all the lines in the file
+    do {
+        // check for invalid utf-8 (for a simple yes/no check, there is also utf8::is_valid function)
+        string::iterator end_it = utf8::find_invalid(line.begin(), line.end());
+        if (end_it != line.end()) {
+            cout << "Invalid UTF-8 encoding detected at line " << line_count << "\n";
+            cout << "This part is fine: " << string(line.begin(), end_it) << "\n";
+        }
+        // Get the line length (at least for the valid part)
+        int length = utf8::distance(line.begin(), end_it);
+        cout << "Length of line " << line_count << " is " << length <<  "\n";
+        // Convert it to utf-16
+        vector<unsigned short> utf16line;
+        utf8::utf8to16(line.begin(), end_it, back_inserter(utf16line));
+        // And back to utf-8
+        string utf8line;
+        utf8::utf16to8(utf16line.begin(), utf16line.end(), back_inserter(utf8line));
+        // Confirm that the conversion went OK:
+        if (utf8line != string(line.begin(), end_it))
+            cout << "Error in UTF-16 conversion at line: " << line_count << "\n";
+        line_count++;
+    // Loop on the result of getline, not on eof(): a final line with no trailing
+    // newline is read successfully but sets eofbit, so testing eof() would
+    // silently skip it.
+    } while (getline(fs8, line));
+    return 0;
+}
+
+

+ In the previous code sample, we have seen the use of the following functions from + utf8 namespace: first we used is_bom function to detect + UTF-8 byte order mark at the beginning of the file; then for each line we performed + a detection of invalid UTF-8 sequences with find_invalid; the number + of characters (more precisely - the number of Unicode code points) in each line was + determined with a use of utf8::distance; finally, we have converted + each line to UTF-16 encoding with utf8to16 and back to UTF-8 with + utf16to8. +

+

+ Reference +

+

+ Functions From utf8 Namespace +

+

+ utf8::append +

+

+ Encodes a 32 bit code point as a UTF-8 sequence of octets and appends the sequence + to a UTF-8 string. +

+
+template <typename octet_iterator>
+octet_iterator append(uint32_t cp, octet_iterator result);
+   
+
+

+ cp: A 32 bit integer representing a code point to append to the + sequence.
+ result: An output iterator to the place in the sequence where to + append the code point.
+ Return value: An iterator pointing to the place + after the newly appended sequence. +

+

+ Example of use: +

+
+unsigned char u[5] = {0,0,0,0,0};
+unsigned char* end = append(0x0448, u);
+assert (u[0] == 0xd1 && u[1] == 0x88 && u[2] == 0 && u[3] == 0 && u[4] == 0);
+
+

+ Note that append does not allocate any memory - it is the burden of + the caller to make sure there is enough memory allocated for the operation. To make + things more interesting, append can add anywhere between 1 and 4 + octets to the sequence. In practice, you would most often want to use + std::back_inserter to ensure that the necessary memory is allocated. +

+

+ In case of an invalid code point, a utf8::invalid_code_point exception + is thrown. +

+

+ utf8::next +

+

+ Given the iterator to the beginning of the UTF-8 sequence, it returns the code + point and moves the iterator to the next position. +

+
+template <typename octet_iterator> 
+uint32_t next(octet_iterator& it, octet_iterator end);
+   
+
+

+ it: a reference to an iterator pointing to the beginning of an UTF-8 + encoded code point. After the function returns, it is incremented to point to the + beginning of the next code point.
+ end: end of the UTF-8 sequence to be processed. If it + gets equal to end during the extraction of a code point, an + utf8::not_enough_room exception is thrown.
+ Return value: the 32 bit representation of the + processed UTF-8 code point. +

+

+ Example of use: +

+
+char* twochars = "\xe6\x97\xa5\xd1\x88";
+char* w = twochars;
+int cp = next(w, twochars + 6);
+assert (cp == 0x65e5);
+assert (w == twochars + 3);
+
+

+ This function is typically used to iterate through a UTF-8 encoded string. +

+

+ In case of an invalid UTF-8 sequence, a utf8::invalid_utf8 exception is + thrown. +

+

+ utf8::prior +

+

+ Given a reference to an iterator pointing to an octet in a UTF-8 sequence, it + decreases the iterator until it hits the beginning of the previous UTF-8 encoded + code point and returns the 32 bit representation of the code point. +

+
+template <typename octet_iterator> 
+uint32_t prior(octet_iterator& it, octet_iterator start);
+   
+
+

+ it: a reference pointing to an octet within a UTF-8 encoded string. + After the function returns, it is decremented to point to the beginning of the + previous code point.
+ start: an iterator to the beginning of the sequence where the search + for the beginning of a code point is performed. It is a + safety measure to prevent passing the beginning of the string in the search for a + UTF-8 lead octet.
+ Return value: the 32 bit representation of the + previous code point. +

+

+ Example of use: +

+
+char* twochars = "\xe6\x97\xa5\xd1\x88";
+char* w = twochars + 3;
+int cp = prior (w, twochars);
+assert (cp == 0x65e5);
+assert (w == twochars);
+
+

+ This function has two purposes: one is to iterate backwards through a UTF-8 + encoded string. Note that it is usually a better idea to iterate forward instead, + since utf8::next is faster. The second purpose is to find the beginning + of a UTF-8 sequence if we have a random position within a string. +

+

+ it will typically point to the beginning of + a code point, and start will point to the + beginning of the string to ensure we don't go backwards too far. it is + decreased until it points to a lead UTF-8 octet, and then the UTF-8 sequence + beginning with that octet is decoded to a 32 bit representation and returned. +

+

+ In case start is reached before a UTF-8 lead octet is hit, or if an + invalid UTF-8 sequence is started by the lead octet, an invalid_utf8 + exception is thrown. +

+

+ utf8::advance +

+

+ Advances an iterator by the specified number of code points within an UTF-8 + sequence. +

+
+template <typename octet_iterator, typename distance_type> 
+void advance (octet_iterator& it, distance_type n, octet_iterator end);
+   
+
+

+ it: a reference to an iterator pointing to the beginning of an UTF-8 + encoded code point. After the function returns, it is incremented to point to the + nth following code point.
+ n: a positive integer that shows how many code points we want to + advance.
+ end: end of the UTF-8 sequence to be processed. If it + gets equal to end during the extraction of a code point, an + utf8::not_enough_room exception is thrown.
+

+

+ Example of use: +

+
+char* twochars = "\xe6\x97\xa5\xd1\x88";
+char* w = twochars;
+advance (w, 2, twochars + 6);
+assert (w == twochars + 5);
+
+

+ This function works only "forward". In case of a negative n, there is + no effect. +

+

+ In case of an invalid UTF-8 sequence, a utf8::invalid_utf8 exception is thrown; + in case of an invalid code point, a utf8::invalid_code_point exception + is thrown. +

+

+ utf8::distance +

+

+ Given the iterators to two UTF-8 encoded code points in a sequence, returns the + number of code points between them. +

+
+template <typename octet_iterator> 
+typename std::iterator_traits<octet_iterator>::difference_type distance (octet_iterator first, octet_iterator last);
+   
+
+

+ first: an iterator to a beginning of a UTF-8 encoded code point.
+ last: an iterator to a "post-end" of the last UTF-8 encoded code + point in the sequence we are trying to determine the length. It can be the + beginning of a new code point, or not.
+ Return value: the distance between the iterators, + in code points. +

+

+ Example of use: +

+
+char* twochars = "\xe6\x97\xa5\xd1\x88";
+size_t dist = utf8::distance(twochars, twochars + 5);
+assert (dist == 2);
+
+

+ This function is used to find the length (in code points) of a UTF-8 encoded + string. The reason it is called distance, rather than, say, + length is mainly because developers are used to length being an + O(1) function. Computing the length of a UTF-8 string is a linear operation, and + it looked better to model it after the std::distance algorithm. +

+

+ In case of an invalid UTF-8 sequence, a utf8::invalid_utf8 exception is + thrown. If last does not point to the past-the-end of a UTF-8 sequence, + a utf8::not_enough_room exception is thrown. +

+

+ utf8::utf16to8 +

+

+ Converts a UTF-16 encoded string to UTF-8. +

+
+template <typename u16bit_iterator, typename octet_iterator>
+octet_iterator utf16to8 (u16bit_iterator start, u16bit_iterator end, octet_iterator result);
+   
+
+

+ start: an iterator pointing to the beginning of the UTF-16 encoded + string to convert.
+ end: an iterator pointing to past-the-end of the UTF-16 encoded + string to convert.
+ result: an output iterator to the place in the UTF-8 string where to + append the result of conversion.
+ Return value: An iterator pointing to the place + after the appended UTF-8 string. +

+

+ Example of use: +

+
+unsigned short utf16string[] = {0x41, 0x0448, 0x65e5, 0xd834, 0xdd1e};
+vector<unsigned char> utf8result;
+utf16to8(utf16string, utf16string + 5, back_inserter(utf8result));
+assert (utf8result.size() == 10);    
+
+

+ In case of invalid UTF-16 sequence, a utf8::invalid_utf16 exception is + thrown. +

+

+ utf8::utf8to16 +

+

+ Converts an UTF-8 encoded string to UTF-16 +

+
+template <typename u16bit_iterator, typename octet_iterator>
+u16bit_iterator utf8to16 (octet_iterator start, octet_iterator end, u16bit_iterator result);
+   
+
+

+ start: an iterator pointing to the beginning of the UTF-8 encoded + string to convert.<br /> end: an iterator pointing to + past-the-end of the UTF-8 encoded string to convert.
+ result: an output iterator to the place in the UTF-16 string where to + append the result of conversion.
+ Return value: An iterator pointing to the place + after the appended UTF-16 string. +

+

+ Example of use: +

+
+char utf8_with_surrogates[] = "\xe6\x97\xa5\xd1\x88\xf0\x9d\x84\x9e";
+vector <unsigned short> utf16result;
+utf8to16(utf8_with_surrogates, utf8_with_surrogates + 9, back_inserter(utf16result));
+assert (utf16result.size() == 4);
+assert (utf16result[2] == 0xd834);
+assert (utf16result[3] == 0xdd1e);
+
+

+ In case of an invalid UTF-8 sequence, a utf8::invalid_utf8 exception is + thrown. If end does not point to the past-the-end of a UTF-8 sequence, a + utf8::not_enough_room exception is thrown. +

+

+ utf8::utf32to8 +

+

+ Converts a UTF-32 encoded string to UTF-8. +

+
+template <typename octet_iterator, typename u32bit_iterator>
+octet_iterator utf32to8 (u32bit_iterator start, u32bit_iterator end, octet_iterator result);
+   
+
+

+ start: an iterator pointing to the beginning of the UTF-32 encoded + string to convert.
+ end: an iterator pointing to past-the-end of the UTF-32 encoded + string to convert.
+ result: an output iterator to the place in the UTF-8 string where to + append the result of conversion.
+ Return value: An iterator pointing to the place + after the appended UTF-8 string. +

+

+ Example of use: +

+
+int utf32string[] = {0x448, 0x65E5, 0x10346, 0};
+vector<unsigned char> utf8result;
+utf32to8(utf32string, utf32string + 3, back_inserter(utf8result));
+assert (utf8result.size() == 9);
+
+

+ In case of invalid UTF-32 string, a utf8::invalid_code_point exception + is thrown. +

+

+ utf8::utf8to32 +

+

+ Converts a UTF-8 encoded string to UTF-32. +

+
+template <typename octet_iterator, typename u32bit_iterator>
+u32bit_iterator utf8to32 (octet_iterator start, octet_iterator end, u32bit_iterator result);
+   
+
+

+ start: an iterator pointing to the beginning of the UTF-8 encoded + string to convert.
+ end: an iterator pointing to past-the-end of the UTF-8 encoded string + to convert.
+ result: an output iterator to the place in the UTF-32 string where to + append the result of conversion.
+ Return value: An iterator pointing to the place + after the appended UTF-32 string. +

+

+ Example of use: +

+
+char* twochars = "\xe6\x97\xa5\xd1\x88";
+vector<int> utf32result;
+utf8to32(twochars, twochars + 5, back_inserter(utf32result));
+assert (utf32result.size() == 2);
+
+

+ In case of an invalid UTF-8 sequence, a utf8::invalid_utf8 exception is + thrown. If end does not point to the past-the-end of a UTF-8 sequence, a + utf8::not_enough_room exception is thrown. +

+

+ utf8::find_invalid +

+

+ Detects an invalid sequence within a UTF-8 string. +

+
+template <typename octet_iterator> 
+octet_iterator find_invalid(octet_iterator start, octet_iterator end);
+
+

+ start: an iterator pointing to the beginning of the UTF-8 string to + test for validity.
+ end: an iterator pointing to past-the-end of the UTF-8 string to test + for validity.
+ Return value: an iterator pointing to the first + invalid octet in the UTF-8 string. In case none were found, equals + end. +

+

+ Example of use: +

+
+char utf_invalid[] = "\xe6\x97\xa5\xd1\x88\xfa";
+char* invalid = find_invalid(utf_invalid, utf_invalid + 6);
+assert (invalid == utf_invalid + 5);
+
+

+ This function is typically used to make sure a UTF-8 string is valid before + processing it with other functions. It is especially important to call it before + doing any of the unchecked operations on it. +

+

+ utf8::is_valid +

+

+ Checks whether a sequence of octets is a valid UTF-8 string. +

+
+template <typename octet_iterator> 
+bool is_valid(octet_iterator start, octet_iterator end);
+   
+
+

+ start: an iterator pointing to the beginning of the UTF-8 string to + test for validity.
+ end: an iterator pointing to past-the-end of the UTF-8 string to test + for validity.
+ Return value: true if the sequence + is a valid UTF-8 string; false if not. +

+ Example of use: +
+char utf_invalid[] = "\xe6\x97\xa5\xd1\x88\xfa";
+bool bvalid = is_valid(utf_invalid, utf_invalid + 6);
+assert (bvalid == false);
+
+

+ is_valid is a shorthand for find_invalid(start, end) == + end;. You may want to use it to make sure that a byte sequence is a valid + UTF-8 string without the need to know where it fails if it is not valid. +

+

+ utf8::is_bom +

+

+ Checks whether a sequence of three octets is a UTF-8 byte order mark (BOM) +

+
+template <typename octet_iterator> 
+bool is_bom (octet_iterator it);
+
+

+ it: beginning of the 3-octet sequence to check
+ Return value: true if the sequence + is UTF-8 byte order mark; false if not. +

+

+ Example of use: +

+
+unsigned char byte_order_mark[] = {0xef, 0xbb, 0xbf};
+bool bbom = is_bom(byte_order_mark);
+assert (bbom == true);
+
+

+ The typical use of this function is to check the first three bytes of a file. If + they form the UTF-8 BOM, we want to skip them before processing the actual UTF-8 + encoded text. +

+

+ Functions From utf8::unchecked Namespace +

+

+ utf8::unchecked::append +

+

+ Encodes a 32 bit code point as a UTF-8 sequence of octets and appends the sequence + to a UTF-8 string. +

+
+template <typename octet_iterator>
+octet_iterator append(uint32_t cp, octet_iterator result);
+   
+
+

+ cp: A 32 bit integer representing a code point to append to the + sequence.
+ result: An output iterator to the place in the sequence where to + append the code point.
+ Return value: An iterator pointing to the place + after the newly appended sequence. +

+

+ Example of use: +

+
+unsigned char u[5] = {0,0,0,0,0};
+unsigned char* end = unchecked::append(0x0448, u);
+assert (u[0] == 0xd1 && u[1] == 0x88 && u[2] == 0 && u[3] == 0 && u[4] == 0);
+
+

+ This is a faster but less safe version of utf8::append. It does not + check for validity of the supplied code point, and may produce an invalid UTF-8 + sequence. +

+

+ utf8::unchecked::next +

+

+ Given the iterator to the beginning of a UTF-8 sequence, it returns the code point + and moves the iterator to the next position. +

+
+template <typename octet_iterator>
+uint32_t next(octet_iterator& it);
+   
+
+

+ it: a reference to an iterator pointing to the beginning of a UTF-8 + encoded code point. After the function returns, it is incremented to point to the + beginning of the next code point.
+ Return value: the 32 bit representation of the + processed UTF-8 code point. +

+

+ Example of use: +

+
+char* twochars = "\xe6\x97\xa5\xd1\x88";
+char* w = twochars;
+int cp = unchecked::next(w);
+assert (cp == 0x65e5);
+assert (w == twochars + 3);
+
+

+ This is a faster but less safe version of utf8::next. It does not + check for validity of the supplied UTF-8 sequence. +

+

+ utf8::unchecked::prior +

+

+ Given a reference to an iterator pointing to an octet in a UTF-8 sequence, it + decreases the iterator until it hits the beginning of the previous UTF-8 encoded + code point and returns the 32 bit representation of the code point. +

+
+template <typename octet_iterator>
+uint32_t prior(octet_iterator& it);
+   
+
+

+ it: a reference pointing to an octet within a UTF-8 encoded string. + After the function returns, it is decremented to point to the beginning of the + previous code point.
+ Return value: the 32 bit representation of the + previous code point. +

+

+ Example of use: +

+
+char* twochars = "\xe6\x97\xa5\xd1\x88";
+char* w = twochars + 3;
+int cp = unchecked::prior (w);
+assert (cp == 0x65e5);
+assert (w == twochars);
+
+

+ This is a faster but less safe version of utf8::prior. It does not + check for validity of the supplied UTF-8 sequence and offers no boundary checking. +

+

+ utf8::unchecked::advance +

+

+ Advances an iterator by the specified number of code points within a UTF-8 + sequence. +

+
+template <typename octet_iterator, typename distance_type>
+void advance (octet_iterator& it, distance_type n);
+   
+
+

+ it: a reference to an iterator pointing to the beginning of a UTF-8 + encoded code point. After the function returns, it is incremented to point to the + nth following code point.
+ n: a positive integer that shows how many code points we want to + advance.
+

+

+ Example of use: +

+
+char* twochars = "\xe6\x97\xa5\xd1\x88";
+char* w = twochars;
+unchecked::advance (w, 2);
+assert (w == twochars + 5);
+
+

+ This function works only "forward". In case of a negative n, there is + no effect. +

+

+ This is a faster but less safe version of utf8::advance. It does not + check for validity of the supplied UTF-8 sequence and offers no boundary checking. +

+

+ utf8::unchecked::distance +

+

+ Given the iterators to two UTF-8 encoded code points in a sequence, returns the + number of code points between them. +

+
+template <typename octet_iterator>
+typename std::iterator_traits<octet_iterator>::difference_type distance (octet_iterator first, octet_iterator last);
+
+

+ first: an iterator to a beginning of a UTF-8 encoded code point.
+ last: an iterator to a "post-end" of the last UTF-8 encoded code + point in the sequence whose length we are trying to determine. It can be the + beginning of a new code point, or not.
+ Return value: the distance between the iterators, + in code points. +

+

+ Example of use: +

+
+char* twochars = "\xe6\x97\xa5\xd1\x88";
+size_t dist = utf8::unchecked::distance(twochars, twochars + 5);
+assert (dist == 2);
+
+

+ This is a faster but less safe version of utf8::distance. It does not + check for validity of the supplied UTF-8 sequence. +

+

+ utf8::unchecked::utf16to8 +

+

+ Converts a UTF-16 encoded string to UTF-8. +

+
+template <typename u16bit_iterator, typename octet_iterator>
+octet_iterator utf16to8 (u16bit_iterator start, u16bit_iterator end, octet_iterator result);
+   
+
+

+ start: an iterator pointing to the beginning of the UTF-16 encoded + string to convert.
+ end: an iterator pointing to past-the-end of the UTF-16 encoded + string to convert.
+ result: an output iterator to the place in the UTF-8 string where to + append the result of conversion.
+ Return value: An iterator pointing to the place + after the appended UTF-8 string. +

+

+ Example of use: +

+
+unsigned short utf16string[] = {0x41, 0x0448, 0x65e5, 0xd834, 0xdd1e};
+vector<unsigned char> utf8result;
+unchecked::utf16to8(utf16string, utf16string + 5, back_inserter(utf8result));
+assert (utf8result.size() == 10);    
+
+

+ This is a faster but less safe version of utf8::utf16to8. It does not + check for validity of the supplied UTF-16 sequence. +

+

+ utf8::unchecked::utf8to16 +

+

+ Converts a UTF-8 encoded string to UTF-16. +

+
+template <typename u16bit_iterator, typename octet_iterator>
+u16bit_iterator utf8to16 (octet_iterator start, octet_iterator end, u16bit_iterator result);
+   
+
+

+ start: an iterator pointing to the beginning of the UTF-8 encoded + string to convert. <br /> end: an iterator pointing to + past-the-end of the UTF-8 encoded string to convert.
+ result: an output iterator to the place in the UTF-16 string where to + append the result of conversion.
+ Return value: An iterator pointing to the place + after the appended UTF-16 string. +

+

+ Example of use: +

+
+char utf8_with_surrogates[] = "\xe6\x97\xa5\xd1\x88\xf0\x9d\x84\x9e";
+vector <unsigned short> utf16result;
+unchecked::utf8to16(utf8_with_surrogates, utf8_with_surrogates + 9, back_inserter(utf16result));
+assert (utf16result.size() == 4);
+assert (utf16result[2] == 0xd834);
+assert (utf16result[3] == 0xdd1e);
+
+

+ This is a faster but less safe version of utf8::utf8to16. It does not + check for validity of the supplied UTF-8 sequence. +

+

+ utf8::unchecked::utf32to8 +

+

+ Converts a UTF-32 encoded string to UTF-8. +

+
+template <typename octet_iterator, typename u32bit_iterator>
+octet_iterator utf32to8 (u32bit_iterator start, u32bit_iterator end, octet_iterator result);
+   
+
+

+ start: an iterator pointing to the beginning of the UTF-32 encoded + string to convert.
+ end: an iterator pointing to past-the-end of the UTF-32 encoded + string to convert.
+ result: an output iterator to the place in the UTF-8 string where to + append the result of conversion.
+ Return value: An iterator pointing to the place + after the appended UTF-8 string. +

+

+ Example of use: +

+
+int utf32string[] = {0x448, 0x65e5, 0x10346, 0};
+vector<unsigned char> utf8result;
+unchecked::utf32to8(utf32string, utf32string + 3, back_inserter(utf8result));
+assert (utf8result.size() == 9);
+
+

+ This is a faster but less safe version of utf8::utf32to8. It does not + check for validity of the supplied UTF-32 sequence. +

+

+ utf8::unchecked::utf8to32 +

+

+ Converts a UTF-8 encoded string to UTF-32. +

+
+template <typename octet_iterator, typename u32bit_iterator>
+u32bit_iterator utf8to32 (octet_iterator start, octet_iterator end, u32bit_iterator result);
+   
+
+

+ start: an iterator pointing to the beginning of the UTF-8 encoded + string to convert.
+ end: an iterator pointing to past-the-end of the UTF-8 encoded string + to convert.
+ result: an output iterator to the place in the UTF-32 string where to + append the result of conversion.
+ Return value: An iterator pointing to the place + after the appended UTF-32 string. +

+

+ Example of use: +

+
+char* twochars = "\xe6\x97\xa5\xd1\x88";
+vector<int> utf32result;
+unchecked::utf8to32(twochars, twochars + 5, back_inserter(utf32result));
+assert (utf32result.size() == 2);
+
+

+ This is a faster but less safe version of utf8::utf8to32. It does not + check for validity of the supplied UTF-8 sequence. +

+

+ Points of interest +

+

+ Design goals and decisions +

+

+ The library was designed to be: +

+
    +
  1. + Generic: for better or worse, there are many C++ string classes out there, and + the library should work with as many of them as possible. +
  2. +
  3. + Portable: the library should be portable both across different platforms and + compilers. The only non-portable code is a small section that declares unsigned + integers of different sizes: three typedefs. They can be changed by the users of + the library if they don't match their platform. The default setting should work + for Windows (both 32 and 64 bit), and most 32 bit and 64 bit Unix derivatives. +
  4. +
  5. + Lightweight: follow the "pay only for what you use" guideline. +
  6. +
  7. + Unintrusive: avoid forcing any particular design or even programming style on the + user. This is a library, not a framework. +
  8. +
+

+ Alternatives +

+

+ In case you want to look into other means of working with UTF-8 strings from C++, + here is the list of solutions I am aware of: +

+
    +
  1. + ICU Library. It is very powerful, + complete, feature-rich, mature, and widely used. Also big, intrusive, + non-generic, and doesn't play well with the Standard Library. I definitely + recommend looking at ICU even if you don't plan to use it. +
  2. +
  3. + Glib::ustring. + A class specifically made to work with UTF-8 strings, and also feel like + std::string. If you prefer to have yet another string class in your + code, it may be worth a look. Be aware of the licensing issues, though. +
  4. +
  5. + Platform dependent solutions: Windows and POSIX have functions to convert strings + from one encoding to another. That is only a subset of what my library offers, + but if that is all you need it may be good enough, especially given the fact that + these functions are mature and tested in production. +
  6. +
+

+ Conclusion +

+

+ Until Unicode becomes officially recognized by the C++ Standard Library, we need to + use other means to work with UTF-8 strings. Template functions I describe in this + article may be a good step in this direction. +

+

+ References +

+
    +
  1. + The Unicode Consortium. +
  2. +
  3. + ICU Library. +
  4. +
  5. + UTF-8 at Wikipedia +
  6. +
  7. + UTF-8 and Unicode FAQ for + Unix/Linux +
  8. +
+ + diff --git a/boost/libs/utf8/test.cpp b/boost/libs/utf8/test.cpp new file mode 100644 index 0000000..f9fc1a6 --- /dev/null +++ b/boost/libs/utf8/test.cpp @@ -0,0 +1,220 @@ +#include +#include +#include "boost/utf8.hpp" +#include +using namespace boost::utf8; +using namespace std; + +int main() +{ + //append + unsigned char u[5] = {0,0,0,0,0}; + + unsigned char* end = append(0x0448, u); + BOOST_TEST (u[0] == 0xd1 && u[1] == 0x88 && u[2] == 0 && u[3] == 0 && u[4] == 0); + + end = append(0x65e5, u); + BOOST_TEST (u[0] == 0xe6 && u[1] == 0x97 && u[2] == 0xa5 && u[3] == 0 && u[4] == 0); + + end = append(0x3044, u); + BOOST_TEST (u[0] == 0xe3 && u[1] == 0x81 && u[2] == 0x84 && u[3] == 0 && u[4] == 0); + + end = append(0x10346, u); + BOOST_TEST (u[0] == 0xf0 && u[1] == 0x90 && u[2] == 0x8d && u[3] == 0x86 && u[4] == 0); + + //next + char* twochars = "\xe6\x97\xa5\xd1\x88"; + char* w = twochars; + int cp = next(w, twochars + 6); + BOOST_TEST (cp == 0x65e5); + BOOST_TEST (w == twochars + 3); + + char* threechars = "\xf0\x90\x8d\x86\xe6\x97\xa5\xd1\x88"; + w = threechars; + cp = next(w, threechars + 9); + BOOST_TEST (cp == 0x10346); + BOOST_TEST (w == threechars + 4); + cp = next(w, threechars + 9); + BOOST_TEST (cp == 0x65e5); + BOOST_TEST (w == threechars + 7); + cp = next(w, threechars + 9); + BOOST_TEST (cp == 0x0448); + BOOST_TEST (w == threechars + 9); + + + //prior + w = twochars + 3; + cp = prior (w, twochars - 1); + BOOST_TEST (cp == 0x65e5); + BOOST_TEST (w == twochars); + + w = threechars + 9; + cp = prior(w, threechars - 1); + BOOST_TEST (cp == 0x0448); + BOOST_TEST (w == threechars + 7); + cp = prior(w, threechars -1); + BOOST_TEST (cp == 0x65e5); + BOOST_TEST (w == threechars + 4); + cp = prior(w, threechars - 1); + BOOST_TEST (cp == 0x10346); + BOOST_TEST (w == threechars); + + // advance + w = twochars; + advance (w, 2, twochars + 6); + BOOST_TEST (w == twochars + 5); + + // distance + size_t dist = boost::utf8::distance(twochars, twochars + 5); + 
BOOST_TEST (dist == 2); + + // utf32to8 + int utf32string[] = {0x448, 0x65E5, 0x10346, 0}; + vector utf8result; + utf32to8(utf32string, utf32string + 3, back_inserter(utf8result)); + BOOST_TEST (utf8result.size() == 9); + // try it with the return value; + char* utf8_end = utf32to8(utf32string, utf32string + 3, &utf8result[0]); + BOOST_TEST (utf8_end == &utf8result[0] + 9); + + //utf8to32 + vector utf32result; + utf8to32(twochars, twochars + 5, back_inserter(utf32result)); + BOOST_TEST (utf32result.size() == 2); + // try it with the return value; + int* utf32_end = utf8to32(twochars, twochars + 5, &utf32result[0]); + BOOST_TEST (utf32_end == &utf32result[0] + 2); + + //utf16to8 + unsigned short utf16string[] = {0x41, 0x0448, 0x65e5, 0xd834, 0xdd1e}; + utf8result.clear(); + utf16to8(utf16string, utf16string + 5, back_inserter(utf8result)); + BOOST_TEST (utf8result.size() == 10); + // try it with the return value; + utf8_end = utf16to8 (utf16string, utf16string + 5, &utf8result[0]); + BOOST_TEST (utf8_end == &utf8result[0] + 10); + + //utf8to16 + char utf8_with_surrogates[] = "\xe6\x97\xa5\xd1\x88\xf0\x9d\x84\x9e"; + vector utf16result; + utf8to16(utf8_with_surrogates, utf8_with_surrogates + 9, back_inserter(utf16result)); + BOOST_TEST (utf16result.size() == 4); + BOOST_TEST (utf16result[2] == 0xd834); + BOOST_TEST (utf16result[3] == 0xdd1e); + // try it with the return value; + unsigned short* utf16_end = utf8to16 (utf8_with_surrogates, utf8_with_surrogates + 9, &utf16result[0]); + BOOST_TEST (utf16_end == &utf16result[0] + 4); + + //find_invalid + char utf_invalid[] = "\xe6\x97\xa5\xd1\x88\xfa"; + char* invalid = find_invalid(utf_invalid, utf_invalid + 6); + BOOST_TEST (invalid == utf_invalid + 5); + + //is_valid + bool bvalid = is_valid(utf_invalid, utf_invalid + 6); + BOOST_TEST (bvalid == false); + bvalid = is_valid(utf8_with_surrogates, utf8_with_surrogates + 9); + BOOST_TEST (bvalid == true); + + //is_bom + unsigned char byte_order_mark[] = {0xef, 0xbb, 0xbf}; 
+ bool bbom = is_bom(byte_order_mark); + BOOST_TEST (bbom == true); + + ////////////////////////////////////////////////////////// + //// Unchecked variants + ////////////////////////////////////////////////////////// + + //append + memset(u, 0, 5); + end = unchecked::append(0x0448, u); + BOOST_TEST (u[0] == 0xd1 && u[1] == 0x88 && u[2] == 0 && u[3] == 0 && u[4] == 0); + + end = unchecked::append(0x65e5, u); + BOOST_TEST (u[0] == 0xe6 && u[1] == 0x97 && u[2] == 0xa5 && u[3] == 0 && u[4] == 0); + + end = unchecked::append(0x10346, u); + BOOST_TEST (u[0] == 0xf0 && u[1] == 0x90 && u[2] == 0x8d && u[3] == 0x86 && u[4] == 0); + + //next + w = twochars; + cp = unchecked::next(w); + BOOST_TEST (cp == 0x65e5); + BOOST_TEST (w == twochars + 3); + + w = threechars; + cp = unchecked::next(w); + BOOST_TEST (cp == 0x10346); + BOOST_TEST (w == threechars + 4); + cp = unchecked::next(w); + BOOST_TEST (cp == 0x65e5); + BOOST_TEST (w == threechars + 7); + cp = unchecked::next(w); + BOOST_TEST (cp == 0x0448); + BOOST_TEST (w == threechars + 9); + + + //prior + w = twochars + 3; + cp = unchecked::prior (w); + BOOST_TEST (cp == 0x65e5); + BOOST_TEST (w == twochars); + + w = threechars + 9; + cp = unchecked::prior(w); + BOOST_TEST (cp == 0x0448); + BOOST_TEST (w == threechars + 7); + cp = unchecked::prior(w); + BOOST_TEST (cp == 0x65e5); + BOOST_TEST (w == threechars + 4); + cp = unchecked::prior(w); + BOOST_TEST (cp == 0x10346); + BOOST_TEST (w == threechars); + + // advance + w = twochars; + unchecked::advance (w, 2); + BOOST_TEST (w == twochars + 5); + + // distance + dist = unchecked::distance(twochars, twochars + 5); + BOOST_TEST (dist == 2); + + // utf32to8 + utf8result.clear(); + unchecked::utf32to8(utf32string, utf32string + 3, back_inserter(utf8result)); + BOOST_TEST (utf8result.size() == 9); + // try it with the return value; + utf8_end = utf32to8(utf32string, utf32string + 3, &utf8result[0]); + BOOST_TEST(utf8_end == &utf8result[0] + 9); + + //utf8to32 + 
utf32result.clear(); + unchecked::utf8to32(twochars, twochars + 5, back_inserter(utf32result)); + BOOST_TEST (utf32result.size() == 2); + // try it with the return value; + utf32_end = utf8to32(twochars, twochars + 5, &utf32result[0]); + BOOST_TEST (utf32_end == &utf32result[0] + 2); + + //utf16to8 + utf8result.clear(); + unchecked::utf16to8(utf16string, utf16string + 5, back_inserter(utf8result)); + BOOST_TEST (utf8result.size() == 10); + // try it with the return value; + utf8_end = utf16to8 (utf16string, utf16string + 5, &utf8result[0]); + BOOST_TEST (utf8_end == &utf8result[0] + 10); + + //utf8to16 + utf16result.clear(); + unchecked::utf8to16(utf8_with_surrogates, utf8_with_surrogates + 9, back_inserter(utf16result)); + BOOST_TEST (utf16result.size() == 4); + BOOST_TEST (utf16result[2] == 0xd834); + BOOST_TEST (utf16result[3] == 0xdd1e); + // try it with the return value; + utf16_end = utf8to16 (utf8_with_surrogates, utf8_with_surrogates + 9, &utf16result[0]); + BOOST_TEST (utf16_end == &utf16result[0] + 4); + + return boost::report_errors(); +} + + diff --git a/v1_0/doc/utf8cpp.html b/v1_0/doc/utf8cpp.html index 052dad2..caf5253 100644 --- a/v1_0/doc/utf8cpp.html +++ b/v1_0/doc/utf8cpp.html @@ -133,7 +133,7 @@ } // Play with all the lines in the file do { - // check for invalid utf-8 (for a simple yes/no check, there is also utf8::is_valid function) + // check for invalid utf-8 (for a simple yes/no check, there is also utf8::is_valid function) string::iterator end_it = utf8::find_invalid(line.begin(), line.end()); if (end_it != line.end()) { cout << ::difference_type octet_differece_type; - octet_differece_type length = sequence_length(it); + typedef typename std::iterator_traits::difference_type octet_difference_type; + octet_difference_type length = sequence_length(it); // "Shortcut" for ASCII characters if (length == 1) { @@ -215,7 +215,7 @@ namespace internal } // Is the code point valid? 
if (!is_code_point_valid(cp)) { - for (octet_differece_type i = 0; i < length - 1; ++i) + for (octet_difference_type i = 0; i < length - 1; ++i) --it; return INVALID_CODE_POINT; } @@ -225,21 +225,21 @@ namespace internal if (cp < 0x80) { if (length != 1) { - for (octet_differece_type i = 0; i < length - 1; ++i) + for (octet_difference_type i = 0; i < length - 1; ++i) --it; return OVERLONG_SEQUENCE; } } else if (cp < 0x800) { if (length != 2) { - for (octet_differece_type i = 0; i < length - 1; ++i) + for (octet_difference_type i = 0; i < length - 1; ++i) --it; return OVERLONG_SEQUENCE; } } else if (cp < 0x10000) { if (length != 3) { - for (octet_differece_type i = 0; i < length - 1; ++i) + for (octet_difference_type i = 0; i < length - 1; ++i) --it; return OVERLONG_SEQUENCE; } diff --git a/v2_0/doc/utf8cpp.html b/v2_0/doc/utf8cpp.html index 1228c49..a177186 100644 --- a/v2_0/doc/utf8cpp.html +++ b/v2_0/doc/utf8cpp.html @@ -80,8 +80,7 @@ In order to easily handle UTF-8 encoded Unicode strings, I have come up with a set of template functions. For anybody used to work with STL algorithms, they should be easy and natural to use. The code is freely available for any purpose - check out - the license at the beginning of the utf8.h file. Be aware, though, that while I did - some testing, this library has not been used in production yet. If you run into + the license at the beginning of the utf8.h file. If you run into bugs or performance issues, please let me know and I'll do my best to address them.

@@ -134,7 +133,7 @@ } // Play with all the lines in the file do { - // check for invalid utf-8 (for a simple yes/no check, there is also utf8::is_valid function) + // check for invalid utf-8 (for a simple yes/no check, there is also utf8::is_valid function) string::iterator end_it = utf8::find_invalid(line.begin(), line.end()); if (end_it != line.end()) { cout << 3); thrown.

- utf8::previous + utf8::prior +

+

+ Given a reference to an iterator pointing to an octet in a UTF-8 sequence, it + decreases the iterator until it hits the beginning of the previous UTF-8 encoded + code point and returns the 32 bit representation of the code point. +

+
+template <typename octet_iterator> 
+uint32_t prior(octet_iterator& it, octet_iterator start);
+   
+
+

+ it: a reference pointing to an octet within a UTF-8 encoded string. + After the function returns, it is decremented to point to the beginning of the + previous code point.
+ start: an iterator to the beginning of the sequence where the search + for the beginning of a code point is performed. It is a + safety measure to prevent passing the beginning of the string in the search for a + UTF-8 lead octet.
+ Return value: the 32 bit representation of the + previous code point. +

+

+ Example of use: +

+
+char* twochars = "\xe6\x97\xa5\xd1\x88";
+char* w = twochars + 3;
+int cp = prior (w, twochars);
+assert (cp == 0x65e5);
+assert (w == twochars);
+
+

+ This function has two purposes: one is to iterate backwards through a UTF-8 + encoded string. Note that it is usually a better idea to iterate forward instead, + since utf8::next is faster. The second purpose is to find the beginning + of a UTF-8 sequence if we have a random position within a string. +

+

+ it will typically point to the beginning of + a code point, and start will point to the + beginning of the string to ensure we don't go backwards too far. it is + decreased until it points to a lead UTF-8 octet, and then the UTF-8 sequence + beginning with that octet is decoded to a 32 bit representation and returned. +

+

+ In case start is reached before a UTF-8 lead octet is hit, or if an + invalid UTF-8 sequence is started by the lead octet, an invalid_utf8 + exception is thrown. +

+

+ utf8::previous (deprecated, see utf8::prior)

Given a reference to an iterator pointing to an octet in a UTF-8 seqence, it @@ -310,8 +364,14 @@ assert (cp == 0x65e5); assert (w == twochars);

- The primary purpose of this function is to iterate backwards through a UTF-8 - encoded string. Therefore, it will typically point to the beginning of + utf8::previous is deprecated, and utf8::prior should + be used instead, although the existing code can continue using this function. + The problem is the parameter pass_start that points to the position + just before the beginning of the sequence. Standard containers don't have the + concept of "pass start" and the function can not be used with their iterators. +

+

+ it will typically point to the beginning of a code point, and pass_start will point to the octet just before the beginning of the string to ensure we don't go backwards too far. it is decreased until it points to a lead UTF-8 octet, and then the UTF-8 sequence @@ -760,7 +820,7 @@ assert (u[0] == 0);

- This is a quicker but less safe version of utf8::append. It does not + This is a faster but less safe version of utf8::append. It does not check for validity of the supplied code point, and may produce an invalid UTF-8 sequence.

@@ -796,11 +856,47 @@ assert (cp == 0x65e5); assert (w == twochars + 3);

- This is a quicker but less safe version of utf8::next. It does not + This is a faster but less safe version of utf8::next. It does not check for validity of the supplied UTF-8 sequence.

- utf8::unchecked::previous + utf8::unchecked::prior +

+

+ Given a reference to an iterator pointing to an octet in a UTF-8 sequence, it + decreases the iterator until it hits the beginning of the previous UTF-8 encoded + code point and returns the 32 bit representation of the code point. +

+
+template <typename octet_iterator>
+uint32_t prior(octet_iterator& it);
+   
+
+

+ it: a reference pointing to an octet within a UTF-8 encoded string. + After the function returns, it is decremented to point to the beginning of the + previous code point.
+ Return value: the 32 bit representation of the + previous code point. +

+

+ Example of use: +

+
+char* twochars = "\xe6\x97\xa5\xd1\x88";
+char* w = twochars + 3;
+int cp = unchecked::prior (w);
+assert (cp == 0x65e5);
+assert (w == twochars);
+
+

+ This is a faster but less safe version of utf8::prior. It does not + check for validity of the supplied UTF-8 sequence and offers no boundary checking. +

+

+ utf8::unchecked::previous (deprecated, see utf8::unchecked::prior)

Given a reference to an iterator pointing to an octet in a UTF-8 seqence, it @@ -832,7 +928,13 @@ assert (cp == 0x65e5); assert (w == twochars);

- This is a quicker but less safe version of utf8::previous. It does not + The reason this function is deprecated is just the consistency with the "checked" + versions, where prior should be used instead of previous. + In fact, unchecked::previous behaves exactly the same as + unchecked::prior +

+

+ This is a faster but less safe version of utf8::previous. It does not check for validity of the supplied UTF-8 sequence and offers no boundary checking.

@@ -870,7 +972,7 @@ assert (w == twochars + 5); no effect.

- This is a quicker but less safe version of utf8::advance. It does not + This is a faster but less safe version of utf8::advance. It does not check for validity of the supplied UTF-8 sequence and offers no boundary checking.

@@ -905,7 +1007,7 @@ size_t dist = utf8::unchecked::distance(twochars, twochars + 2);

- This is a quicker but less safe version of utf8::distance. It does not + This is a faster but less safe version of utf8::distance. It does not check for validity of the supplied UTF-8 sequence.

@@ -945,7 +1047,7 @@ unchecked::utf16to8(utf16string, utf16string + 10);

- This is a quicker but less safe version of utf8::utf16to8. It does not + This is a faster but less safe version of utf8::utf16to8. It does not check for validity of the supplied UTF-16 sequence.

@@ -985,7 +1087,7 @@ assert (utf16result[3] == 0xdd1e);

- This is a quicker but less safe version of utf8::utf8to16. It does not + This is a faster but less safe version of utf8::utf8to16. It does not check for validity of the supplied UTF-8 sequence.

@@ -1024,7 +1126,7 @@ utf32to8(utf32string, utf32string + 9);

- This is a quicker but less safe version of utf8::utf32to8. It does not + This is a faster but less safe version of utf8::utf32to8. It does not check for validity of the supplied UTF-32 sequence.

@@ -1061,7 +1163,7 @@ unchecked::utf8to32(twochars, twochars + 2);

- This is a quicker but less safe version of utf8::utf8to32. It does not + This is a faster but less safe version of utf8::utf8to32. It does not check for validity of the supplied UTF-8 sequence.

diff --git a/v2_0/source/utf8/core.h b/v2_0/source/utf8/core.h index 76491f6..11e7322 100644 --- a/v2_0/source/utf8/core.h +++ b/v2_0/source/utf8/core.h @@ -107,8 +107,8 @@ namespace internal { uint32_t cp = mask8(*it); // Check the lead octet - typedef typename std::iterator_traits::difference_type octet_differece_type; - octet_differece_type length = sequence_length(it); + typedef typename std::iterator_traits::difference_type octet_difference_type; + octet_difference_type length = sequence_length(it); // "Shortcut" for ASCII characters if (length == 1) { @@ -182,7 +182,7 @@ namespace internal } // Is the code point valid? if (!is_code_point_valid(cp)) { - for (octet_differece_type i = 0; i < length - 1; ++i) + for (octet_difference_type i = 0; i < length - 1; ++i) --it; return INVALID_CODE_POINT; } @@ -192,21 +192,21 @@ namespace internal if (cp < 0x80) { if (length != 1) { - for (octet_differece_type i = 0; i < length - 1; ++i) + for (octet_difference_type i = 0; i < length - 1; ++i) --it; return OVERLONG_SEQUENCE; } } else if (cp < 0x800) { if (length != 2) { - for (octet_differece_type i = 0; i < length - 1; ++i) + for (octet_difference_type i = 0; i < length - 1; ++i) --it; return OVERLONG_SEQUENCE; } } else if (cp < 0x10000) { if (length != 3) { - for (octet_differece_type i = 0; i < length - 1; ++i) + for (octet_difference_type i = 0; i < length - 1; ++i) --it; return OVERLONG_SEQUENCE; }