From da02583fea896d51cd05d5a2610ca118a7d2c85c Mon Sep 17 00:00:00 2001
From: ntrifunovic <ntrifunovic@a809a056-fc17-0410-9590-b4f493f8b08e>
Date: Mon, 31 Jul 2006 22:04:46 +0000
Subject: [PATCH] Fix for the bug [ 1531740 ] utf8::append does not work
 correctly for some code points.

git-svn-id: http://svn.code.sf.net/p/utfcpp/code@32 a809a056-fc17-0410-9590-b4f493f8b08e
---
 source/utf8.h                                 | 42 ++++-----
 .../r1_0Beta2/basic_functionality.h           | 11 +++
 .../regression_tests/reg_tests_driver.cpp     | 11 +++
 test_drivers/smoke_test/test.cpp              |  3 +
 test_drivers/utf8reader/utf8reader.cpp        | 92 ++++++++++++++++++-
 5 files changed, 136 insertions(+), 23 deletions(-)
 create mode 100644 test_drivers/regression_tests/r1_0Beta2/basic_functionality.h
diff --git a/source/utf8.h b/source/utf8.h
index 729d151..ad14908 100644
--- a/source/utf8.h
+++ b/source/utf8.h
@@ -263,25 +263,25 @@ namespace internal
     template <typename octet_iterator>
     octet_iterator append(uint32_t cp, octet_iterator result)
     {
+        if (!internal::is_code_point_valid(cp)) 
+            throw invalid_code_point(cp);
+
         if (cp < 0x80)                        // one octet
             *(result++) = static_cast<uint8_t>(cp);  
         else if (cp < 0x800) {                // two octets
-            if (!internal::is_code_point_valid(cp)) 
-                throw invalid_code_point(cp);
-
-            *(result++) = static_cast<uint8_t>((cp >> 6)   | 0xc0);
-            *(result++) = static_cast<uint8_t>((cp & 0x3f) | 0x80);
+            *(result++) = static_cast<uint8_t>((cp >> 6)            | 0xc0);
+            *(result++) = static_cast<uint8_t>((cp & 0x3f)          | 0x80);
         }
         else if (cp < 0x10000) {              // three octets
-            *(result++) = static_cast<uint8_t>((cp >> 12)  | 0xe0);
-            *(result++) = static_cast<uint8_t>((cp >> 6)   | 0x80);
-            *(result++) = static_cast<uint8_t>((cp & 0x3f) | 0x80);
+            *(result++) = static_cast<uint8_t>((cp >> 12)           | 0xe0);
+            *(result++) = static_cast<uint8_t>((cp >> 6) & 0x3f     | 0x80);
+            *(result++) = static_cast<uint8_t>((cp & 0x3f)          | 0x80);
         }
         else if (cp <= internal::CODE_POINT_MAX) {      // four octets
-            *(result++) = static_cast<uint8_t>((cp >> 18)  | 0xf0);
-            *(result++) = static_cast<uint8_t>((cp >> 12)  | 0x80);
-            *(result++) = static_cast<uint8_t>((cp >> 6)   | 0x80);
-            *(result++) = static_cast<uint8_t>((cp & 0x3f) | 0x80);
+            *(result++) = static_cast<uint8_t>((cp >> 18)           | 0xf0);
+            *(result++) = static_cast<uint8_t>((cp >> 12)& 0x3f     | 0x80);
+            *(result++) = static_cast<uint8_t>((cp >> 6) & 0x3f     | 0x80);
+            *(result++) = static_cast<uint8_t>((cp & 0x3f)          | 0x80);
         }
         else
             throw invalid_code_point(cp);
@@ -396,19 +396,19 @@ namespace internal
             if (cp < 0x80)                        // one octet
                 *(result++) = static_cast<uint8_t>(cp);  
             else if (cp < 0x800) {                // two octets
-                *(result++) = static_cast<uint8_t>((cp >> 6)   | 0xc0);
-                *(result++) = static_cast<uint8_t>((cp & 0x3f) | 0x80);
+                *(result++) = static_cast<uint8_t>((cp >> 6)          | 0xc0);
+                *(result++) = static_cast<uint8_t>((cp & 0x3f)        | 0x80);
             }
             else if (cp < 0x10000) {              // three octets
-                *(result++) = static_cast<uint8_t>((cp >> 12)  | 0xe0);
-                *(result++) = static_cast<uint8_t>((cp >> 6)   | 0x80);
-                *(result++) = static_cast<uint8_t>((cp & 0x3f) | 0x80);
+                *(result++) = static_cast<uint8_t>((cp >> 12)         | 0xe0);
+                *(result++) = static_cast<uint8_t>((cp >> 6) & 0x3f   | 0x80);
+                *(result++) = static_cast<uint8_t>((cp & 0x3f)        | 0x80);
             }
             else {                                // four octets
-                *(result++) = static_cast<uint8_t>((cp >> 18)  | 0xf0);
-                *(result++) = static_cast<uint8_t>((cp >> 12)  | 0x80);
-                *(result++) = static_cast<uint8_t>((cp >> 6)   | 0x80);
-                *(result++) = static_cast<uint8_t>((cp & 0x3f) | 0x80);
+                *(result++) = static_cast<uint8_t>((cp >> 18)         | 0xf0);
+                *(result++) = static_cast<uint8_t>((cp >> 12)& 0x3f   | 0x80);
+                *(result++) = static_cast<uint8_t>((cp >> 6) & 0x3f   | 0x80);
+                *(result++) = static_cast<uint8_t>((cp & 0x3f)        | 0x80);
             }
             return result;
         }
diff --git a/test_drivers/regression_tests/r1_0Beta2/basic_functionality.h b/test_drivers/regression_tests/r1_0Beta2/basic_functionality.h
new file mode 100644
index 0000000..e5b65c5
--- /dev/null
+++ b/test_drivers/regression_tests/r1_0Beta2/basic_functionality.h
@@ -0,0 +1,11 @@
+#include "../../../source/utf8.h"
+using namespace utf8;
+
+// Regression test for bug [ 1531740 ]: utf8::append does not work correctly for some code points.
+void id_1531740()
+{
+    unsigned cp_u3044 = 0x3044U;  // three-octet code point; its middle octet needs the 0x3f mask
+    unsigned char u3044[] = {0x0, 0x0, 0x0, 0x0};  // zero-filled so a stray fourth octet is detectable
+    append(cp_u3044, u3044);
+    check (u3044[0] == 0xe3 && u3044[1] == 0x81 && u3044[2] == 0x84 && u3044[3] == 0);  // expect E3 81 84, last slot untouched
+}
diff --git a/test_drivers/regression_tests/reg_tests_driver.cpp b/test_drivers/regression_tests/reg_tests_driver.cpp
index c7cb697..c6c7fe5 100644
--- a/test_drivers/regression_tests/reg_tests_driver.cpp
+++ b/test_drivers/regression_tests/reg_tests_driver.cpp
@@ -10,15 +10,26 @@ inline void check_impl (bool condition, const char* file, int line)
 
 #define check(c) check_impl(c, __FILE__, __LINE__);
 
+// Release 1.0 Beta 1
 #include "r1_0Beta1/invalidutf8.h"
 #include "r1_0Beta1/basic_functionality.h"
 
+// Release 1.0 Beta 2
+#include "r1_0Beta2/basic_functionality.h"
+
+
 int main()
 {
+// Release 1.0 Beta 1 -- regression tests, grouped by the header that defines them
 //r1_0Beta1/invalidutf8.h
   id_1524459();
   id_1525236();  
   id_1528369();
 //r1_0Beta1/basic_functionality.h
   id_1528544();
+
+// Release 1.0 Beta 2 -- regression tests, grouped by the header that defines them
+//r1_0Beta2/basic_functionality.h
+  id_1531740();  // bug [ 1531740 ]: utf8::append produced wrong octets for some code points
+
 }
diff --git a/test_drivers/smoke_test/test.cpp b/test_drivers/smoke_test/test.cpp
index aee6b0c..c0eb8d3 100644
--- a/test_drivers/smoke_test/test.cpp
+++ b/test_drivers/smoke_test/test.cpp
@@ -17,6 +17,9 @@ int main()
     end = append(0x65e5, u);
     assert (u[0] == 0xe6 && u[1] == 0x97 && u[2] == 0xa5 && u[3] == 0 && u[4] == 0);
 
+    end = append(0x3044, u);
+    assert (u[0] == 0xe3 && u[1] == 0x81 && u[2] == 0x84 && u[3] == 0 && u[4] == 0);
+
     end = append(0x10346, u);
     assert (u[0] == 0xf0 && u[1] == 0x90 && u[2] == 0x8d && u[3] == 0x86 && u[4] == 0);
 
diff --git a/test_drivers/utf8reader/utf8reader.cpp b/test_drivers/utf8reader/utf8reader.cpp
index e575f39..632b24c 100644
--- a/test_drivers/utf8reader/utf8reader.cpp
+++ b/test_drivers/utf8reader/utf8reader.cpp
@@ -29,8 +29,6 @@ int main(int argc, char** argv)
         cout << "Could not open utf16.txt" << endl;
         return 0;
     }  
-    const unsigned short utf16_bom = 0xfeff;
-    fs16.write(reinterpret_cast<const char*>(&utf16_bom), sizeof(unsigned short));
 
     // Read it line by line
     unsigned int line_count = 0;
@@ -53,6 +51,96 @@ int main(int argc, char** argv)
         utf8to16(line_start, line_end, back_inserter(utf16_line));
         utf16_line.push_back('\n');
         fs16.write(reinterpret_cast<const char*>(&utf16_line[0]), utf16_line.size() * sizeof (unsigned short));
+        utf16_line.pop_back(); // get rid of '\n'
+
+        // Back to utf-8 and compare it to the original line.
+        string back_to_utf8;
+        utf16to8(utf16_line.begin(), utf16_line.end(), back_inserter(back_to_utf8));
+        if (back_to_utf8.compare(string(line_start, line_end)) != 0) 
+            cout << "Line " << line_count << ": Conversion to UTF-16 and back failed" << '\n';
+
+        // Now, convert it to utf-32, back to utf-8 and compare
+        vector <unsigned> utf32_line;
+        utf8to32(line_start, line_end, back_inserter(utf32_line));
+        back_to_utf8.clear();
+        utf32to8(utf32_line.begin(), utf32_line.end(), back_inserter(back_to_utf8));
+        if (back_to_utf8.compare(string(line_start, line_end)) != 0) 
+            cout << "Line " << line_count << ": Conversion to UTF-32 and back failed" << '\n';
+
+        // Now, iterate and back
+        unsigned char_count = 0;
+        string::iterator it = line_start;
+        while (it != line_end) {
+            unsigned cp = next(it, line_end);
+            char_count++;
+        }
+        if (char_count != utf32_line.size())
+            cout << "Line " << line_count << ": Error in iterating with next - wrong number of characters" << '\n';
+
+        string::iterator adv_it = line_start;
+        utf8::advance(adv_it, char_count, line_end);
+        if (adv_it != line_end)
+            cout << "Line " << line_count << ": Error in advance function" << '\n';
+
+        if (utf8::distance(line_start, line_end) != char_count)
+            cout << "Line " << line_count << ": Error in distance function" << '\n';
+
+        while (it != line_start) {
+            unsigned cp = previous(it, line.rend().base());
+            char_count--;
+        }
+        if (char_count != 0)
+            cout << "Line " << line_count << ": Error in iterating with previous - wrong number of characters" << '\n';
+
+        //======================== Now, the unchecked versions ======================
+        // Convert it to utf-16 and compare to the checked version
+        vector<unsigned short> utf16_line_unchecked;
+        unchecked::utf8to16(line_start, line_end, back_inserter(utf16_line_unchecked));
+
+        if (utf16_line != utf16_line_unchecked)
+            cout << "Line " << line_count << ": Error in unchecked::utf8to16" << '\n';
+
+        // Back to utf-8 and compare it to the original line.
+        back_to_utf8.clear();
+        unchecked::utf16to8(utf16_line_unchecked.begin(), utf16_line_unchecked.end(), back_inserter(back_to_utf8));
+        if (back_to_utf8.compare(string(line_start, line_end)) != 0) 
+            cout << "Line " << line_count << ": Unchecked conversion to UTF-16 and back failed" << '\n';
+
+        // Now, convert it to utf-32, back to utf-8 and compare
+        vector <unsigned> utf32_line_unchecked;
+        unchecked::utf8to32(line_start, line_end, back_inserter(utf32_line_unchecked));
+        if (utf32_line != utf32_line_unchecked)
+            cout << "Line " << line_count << ": Error in unchecked::utf8to32" << '\n';
+
+        back_to_utf8.clear();
+        unchecked::utf32to8(utf32_line.begin(), utf32_line.end(), back_inserter(back_to_utf8));
+        if (back_to_utf8.compare(string(line_start, line_end)) != 0) 
+            cout << "Line " << line_count << ": Unchecked conversion to UTF-32 and back failed" << '\n';
+
+        // Now, iterate and back
+        char_count = 0;
+        it = line_start;
+        while (it != line_end) {
+            unsigned cp = unchecked::next(it);
+            char_count++;
+        }
+        if (char_count != utf32_line.size())
+            cout << "Line " << line_count << ": Error in iterating with unchecked::next - wrong number of characters" << '\n';
+
+        adv_it = line_start;
+        utf8::unchecked::advance(adv_it, char_count);
+        if (adv_it != line_end)
+            cout << "Line " << line_count << ": Error in unchecked::advance function" << '\n';
+
+        if (utf8::unchecked::distance(line_start, line_end) != char_count)
+            cout << "Line " << line_count << ": Error in unchecked::distance function" << '\n';
+
+        while (it != line_start) {
+            unsigned cp = unchecked::previous(it);
+            char_count--;
+        }
+        if (char_count != 0)
+            cout << "Line " << line_count << ": Error in iterating with unchecked::previous - wrong number of characters" << '\n';
 
     }
 }