From da02583fea896d51cd05d5a2610ca118a7d2c85c Mon Sep 17 00:00:00 2001 From: ntrifunovic Date: Mon, 31 Jul 2006 22:04:46 +0000 Subject: [PATCH] Fix for the bug [ 1531740 ] utf8::append does not work correctly for some code points. git-svn-id: http://svn.code.sf.net/p/utfcpp/code@32 a809a056-fc17-0410-9590-b4f493f8b08e --- source/utf8.h | 42 ++++----- .../r1_0Beta2/basic_functionality.h | 11 +++ .../regression_tests/reg_tests_driver.cpp | 11 +++ test_drivers/smoke_test/test.cpp | 3 + test_drivers/utf8reader/utf8reader.cpp | 92 ++++++++++++++++++- 5 files changed, 136 insertions(+), 23 deletions(-) create mode 100644 test_drivers/regression_tests/r1_0Beta2/basic_functionality.h diff --git a/source/utf8.h b/source/utf8.h index 729d151..ad14908 100644 --- a/source/utf8.h +++ b/source/utf8.h @@ -263,25 +263,25 @@ namespace internal template octet_iterator append(uint32_t cp, octet_iterator result) { + if (!internal::is_code_point_valid(cp)) + throw invalid_code_point(cp); + if (cp < 0x80) // one octet *(result++) = static_cast(cp); else if (cp < 0x800) { // two octets - if (!internal::is_code_point_valid(cp)) - throw invalid_code_point(cp); - - *(result++) = static_cast((cp >> 6) | 0xc0); - *(result++) = static_cast((cp & 0x3f) | 0x80); + *(result++) = static_cast((cp >> 6) | 0xc0); + *(result++) = static_cast((cp & 0x3f) | 0x80); } else if (cp < 0x10000) { // three octets - *(result++) = static_cast((cp >> 12) | 0xe0); - *(result++) = static_cast((cp >> 6) | 0x80); - *(result++) = static_cast((cp & 0x3f) | 0x80); + *(result++) = static_cast((cp >> 12) | 0xe0); + *(result++) = static_cast((cp >> 6) & 0x3f | 0x80); + *(result++) = static_cast((cp & 0x3f) | 0x80); } else if (cp <= internal::CODE_POINT_MAX) { // four octets - *(result++) = static_cast((cp >> 18) | 0xf0); - *(result++) = static_cast((cp >> 12) | 0x80); - *(result++) = static_cast((cp >> 6) | 0x80); - *(result++) = static_cast((cp & 0x3f) | 0x80); + *(result++) = static_cast((cp >> 18) | 0xf0); + *(result++) = static_cast((cp >> 12)& 0x3f | 0x80); + *(result++) = static_cast((cp >> 6) & 0x3f | 0x80); + *(result++) = static_cast((cp & 0x3f) | 0x80); } else throw invalid_code_point(cp); @@ -396,19 +396,19 @@ namespace internal if (cp < 0x80) // one octet *(result++) = static_cast(cp); else if (cp < 0x800) { // two octets - *(result++) = static_cast((cp >> 6) | 0xc0); - *(result++) = static_cast((cp & 0x3f) | 0x80); + *(result++) = static_cast((cp >> 6) | 0xc0); + *(result++) = static_cast((cp & 0x3f) | 0x80); } else if (cp < 0x10000) { // three octets - *(result++) = static_cast((cp >> 12) | 0xe0); - *(result++) = static_cast((cp >> 6) | 0x80); - *(result++) = static_cast((cp & 0x3f) | 0x80); + *(result++) = static_cast((cp >> 12) | 0xe0); + *(result++) = static_cast((cp >> 6) & 0x3f | 0x80); + *(result++) = static_cast((cp & 0x3f) | 0x80); } else { // four octets - *(result++) = static_cast((cp >> 18) | 0xf0); - *(result++) = static_cast((cp >> 12) | 0x80); - *(result++) = static_cast((cp >> 6) | 0x80); - *(result++) = static_cast((cp & 0x3f) | 0x80); + *(result++) = static_cast((cp >> 18) | 0xf0); + *(result++) = static_cast((cp >> 12)& 0x3f | 0x80); + *(result++) = static_cast((cp >> 6) & 0x3f | 0x80); + *(result++) = static_cast((cp & 0x3f) | 0x80); } return result; } diff --git a/test_drivers/regression_tests/r1_0Beta2/basic_functionality.h b/test_drivers/regression_tests/r1_0Beta2/basic_functionality.h new file mode 100644 index 0000000..e5b65c5 --- /dev/null +++ b/test_drivers/regression_tests/r1_0Beta2/basic_functionality.h @@ -0,0 +1,11 @@ +#include "../../../source/utf8.h" +using namespace utf8; + +// [ 1531740 ] utf8::append does not work correctly for some code points. +void id_1531740() +{ + unsigned cp_u3044 = 0x3044U; + unsigned char u3044[] = {0x0, 0x0, 0x0, 0x0}; + append(cp_u3044, u3044); + check (u3044[0] == 0xe3 && u3044[1] == 0x81 && u3044[2] == 0x84 && u3044[3] == 0); +} diff --git a/test_drivers/regression_tests/reg_tests_driver.cpp b/test_drivers/regression_tests/reg_tests_driver.cpp index c7cb697..c6c7fe5 100644 --- a/test_drivers/regression_tests/reg_tests_driver.cpp +++ b/test_drivers/regression_tests/reg_tests_driver.cpp @@ -10,15 +10,26 @@ inline void check_impl (bool condition, const char* file, int line) #define check(c) check_impl(c, __FILE__, __LINE__); +// Release 1.0 Beta 1 #include "r1_0Beta1/invalidutf8.h" #include "r1_0Beta1/basic_functionality.h" +// Release 1.0 Beta 2 +#include "r1_0Beta2/basic_functionality.h" + + int main() { +// Release 1.0 Beta 1 //r1_0Beta1/invalidutf8.h id_1524459(); id_1525236(); id_1528369(); //r1_0Beta1/basic_functionality.h id_1528544(); + +// Release 1.0 Beta 2 +//r1_0Beta2/basic_functionality.h + id_1531740(); + } diff --git a/test_drivers/smoke_test/test.cpp b/test_drivers/smoke_test/test.cpp index aee6b0c..c0eb8d3 100644 --- a/test_drivers/smoke_test/test.cpp +++ b/test_drivers/smoke_test/test.cpp @@ -17,6 +17,9 @@ int main() end = append(0x65e5, u); assert (u[0] == 0xe6 && u[1] == 0x97 && u[2] == 0xa5 && u[3] == 0 && u[4] == 0); + end = append(0x3044, u); + assert (u[0] == 0xe3 && u[1] == 0x81 && u[2] == 0x84 && u[3] == 0 && u[4] == 0); + end = append(0x10346, u); assert (u[0] == 0xf0 && u[1] == 0x90 && u[2] == 0x8d && u[3] == 0x86 && u[4] == 0); diff --git a/test_drivers/utf8reader/utf8reader.cpp b/test_drivers/utf8reader/utf8reader.cpp index e575f39..632b24c 100644 --- a/test_drivers/utf8reader/utf8reader.cpp +++ b/test_drivers/utf8reader/utf8reader.cpp @@ -29,8 +29,6 @@ int main(int argc, char** argv) cout << "Could not open utf16.txt" << endl; return 0; } - const unsigned short utf16_bom = 0xfeff; - fs16.write(reinterpret_cast(&utf16_bom), sizeof(unsigned short)); // Read it line by line unsigned int line_count = 0; @@ -53,6 +51,96 @@ int main(int argc, char** argv) utf8to16(line_start, line_end, back_inserter(utf16_line)); utf16_line.push_back('\n'); fs16.write(reinterpret_cast(&utf16_line[0]), utf16_line.size() * sizeof (unsigned short)); + utf16_line.pop_back(); // get rid of '\n' + + // Back to utf-8 and compare it to the original line. + string back_to_utf8; + utf16to8(utf16_line.begin(), utf16_line.end(), back_inserter(back_to_utf8)); + if (back_to_utf8.compare(string(line_start, line_end)) != 0) + cout << "Line " << line_count << ": Conversion to UTF-16 and back failed" << '\n'; + + // Now, convert it to utf-32, back to utf-8 and compare + vector utf32_line; + utf8to32(line_start, line_end, back_inserter(utf32_line)); + back_to_utf8.clear(); + utf32to8(utf32_line.begin(), utf32_line.end(), back_inserter(back_to_utf8)); + if (back_to_utf8.compare(string(line_start, line_end)) != 0) + cout << "Line " << line_count << ": Conversion to UTF-32 and back failed" << '\n'; + + // Now, iterate and back + unsigned char_count = 0; + string::iterator it = line_start; + while (it != line_end) { + unsigned cp = next(it, line_end); + char_count++; + } + if (char_count != utf32_line.size()) + cout << "Line " << line_count << ": Error in iterating with next - wrong number of characters" << '\n'; + + string::iterator adv_it = line_start; + utf8::advance(adv_it, char_count, line_end); + if (adv_it != line_end) + cout << "Line " << line_count << ": Error in advance function" << '\n'; + + if (utf8::distance(line_start, line_end) != char_count) + cout << "Line " << line_count << ": Error in distance function" << '\n'; + + while (it != line_start) { + unsigned cp = previous(it, line.rend().base()); + char_count--; + } + if (char_count != 0) + cout << "Line " << line_count << ": Error in iterating with previous - wrong number of characters" << '\n'; + + //======================== Now, the unchecked versions ====================== + // Convert it to utf-16 and compare to the checked version + vector utf16_line_unchecked; + unchecked::utf8to16(line_start, line_end, back_inserter(utf16_line_unchecked)); + + if (utf16_line != utf16_line_unchecked) + cout << "Line " << line_count << ": Error in unchecked::utf8to16" << '\n'; + + // Back to utf-8 and compare it to the original line. + back_to_utf8.clear(); + unchecked::utf16to8(utf16_line_unchecked.begin(), utf16_line_unchecked.end(), back_inserter(back_to_utf8)); + if (back_to_utf8.compare(string(line_start, line_end)) != 0) + cout << "Line " << line_count << ": Unchecked conversion to UTF-16 and back failed" << '\n'; + + // Now, convert it to utf-32, back to utf-8 and compare + vector utf32_line_unchecked; + unchecked::utf8to32(line_start, line_end, back_inserter(utf32_line_unchecked)); + if (utf32_line != utf32_line_unchecked) + cout << "Line " << line_count << ": Error in unchecked::utf8to32" << '\n'; + + back_to_utf8.clear(); + unchecked::utf32to8(utf32_line.begin(), utf32_line.end(), back_inserter(back_to_utf8)); + if (back_to_utf8.compare(string(line_start, line_end)) != 0) + cout << "Line " << line_count << ": Unchecked conversion to UTF-32 and back failed" << '\n'; + + // Now, iterate and back + char_count = 0; + it = line_start; + while (it != line_end) { + unsigned cp = unchecked::next(it); + char_count++; + } + if (char_count != utf32_line.size()) + cout << "Line " << line_count << ": Error in iterating with unchecked::next - wrong number of characters" << '\n'; + + adv_it = line_start; + utf8::unchecked::advance(adv_it, char_count); + if (adv_it != line_end) + cout << "Line " << line_count << ": Error in unchecked::advance function" << '\n'; + + if (utf8::unchecked::distance(line_start, line_end) != char_count) + cout << "Line " << line_count << ": Error in unchecked::distance function" << '\n'; + + while (it != line_start) { + unsigned cp = unchecked::previous(it); + char_count--; + } + if (char_count != 0) + cout << "Line " << line_count << ": Error in iterating with unchecked::previous - wrong number of characters" << '\n'; } }