Fix for bug [ 1531740 ]: utf8::append does not work correctly for some code points.
git-svn-id: http://svn.code.sf.net/p/utfcpp/code@32 a809a056-fc17-0410-9590-b4f493f8b08e
This commit is contained in:
parent
daa8483afa
commit
da02583fea
5 changed files with 136 additions and 23 deletions
|
@ -263,25 +263,25 @@ namespace internal
|
||||||
template <typename octet_iterator>
|
template <typename octet_iterator>
|
||||||
octet_iterator append(uint32_t cp, octet_iterator result)
|
octet_iterator append(uint32_t cp, octet_iterator result)
|
||||||
{
|
{
|
||||||
|
if (!internal::is_code_point_valid(cp))
|
||||||
|
throw invalid_code_point(cp);
|
||||||
|
|
||||||
if (cp < 0x80) // one octet
|
if (cp < 0x80) // one octet
|
||||||
*(result++) = static_cast<uint8_t>(cp);
|
*(result++) = static_cast<uint8_t>(cp);
|
||||||
else if (cp < 0x800) { // two octets
|
else if (cp < 0x800) { // two octets
|
||||||
if (!internal::is_code_point_valid(cp))
|
*(result++) = static_cast<uint8_t>((cp >> 6) | 0xc0);
|
||||||
throw invalid_code_point(cp);
|
*(result++) = static_cast<uint8_t>((cp & 0x3f) | 0x80);
|
||||||
|
|
||||||
*(result++) = static_cast<uint8_t>((cp >> 6) | 0xc0);
|
|
||||||
*(result++) = static_cast<uint8_t>((cp & 0x3f) | 0x80);
|
|
||||||
}
|
}
|
||||||
else if (cp < 0x10000) { // three octets
|
else if (cp < 0x10000) { // three octets
|
||||||
*(result++) = static_cast<uint8_t>((cp >> 12) | 0xe0);
|
*(result++) = static_cast<uint8_t>((cp >> 12) | 0xe0);
|
||||||
*(result++) = static_cast<uint8_t>((cp >> 6) | 0x80);
|
*(result++) = static_cast<uint8_t>((cp >> 6) & 0x3f | 0x80);
|
||||||
*(result++) = static_cast<uint8_t>((cp & 0x3f) | 0x80);
|
*(result++) = static_cast<uint8_t>((cp & 0x3f) | 0x80);
|
||||||
}
|
}
|
||||||
else if (cp <= internal::CODE_POINT_MAX) { // four octets
|
else if (cp <= internal::CODE_POINT_MAX) { // four octets
|
||||||
*(result++) = static_cast<uint8_t>((cp >> 18) | 0xf0);
|
*(result++) = static_cast<uint8_t>((cp >> 18) | 0xf0);
|
||||||
*(result++) = static_cast<uint8_t>((cp >> 12) | 0x80);
|
*(result++) = static_cast<uint8_t>((cp >> 12)& 0x3f | 0x80);
|
||||||
*(result++) = static_cast<uint8_t>((cp >> 6) | 0x80);
|
*(result++) = static_cast<uint8_t>((cp >> 6) & 0x3f | 0x80);
|
||||||
*(result++) = static_cast<uint8_t>((cp & 0x3f) | 0x80);
|
*(result++) = static_cast<uint8_t>((cp & 0x3f) | 0x80);
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
throw invalid_code_point(cp);
|
throw invalid_code_point(cp);
|
||||||
|
@ -396,19 +396,19 @@ namespace internal
|
||||||
if (cp < 0x80) // one octet
|
if (cp < 0x80) // one octet
|
||||||
*(result++) = static_cast<uint8_t>(cp);
|
*(result++) = static_cast<uint8_t>(cp);
|
||||||
else if (cp < 0x800) { // two octets
|
else if (cp < 0x800) { // two octets
|
||||||
*(result++) = static_cast<uint8_t>((cp >> 6) | 0xc0);
|
*(result++) = static_cast<uint8_t>((cp >> 6) | 0xc0);
|
||||||
*(result++) = static_cast<uint8_t>((cp & 0x3f) | 0x80);
|
*(result++) = static_cast<uint8_t>((cp & 0x3f) | 0x80);
|
||||||
}
|
}
|
||||||
else if (cp < 0x10000) { // three octets
|
else if (cp < 0x10000) { // three octets
|
||||||
*(result++) = static_cast<uint8_t>((cp >> 12) | 0xe0);
|
*(result++) = static_cast<uint8_t>((cp >> 12) | 0xe0);
|
||||||
*(result++) = static_cast<uint8_t>((cp >> 6) | 0x80);
|
*(result++) = static_cast<uint8_t>((cp >> 6) & 0x3f | 0x80);
|
||||||
*(result++) = static_cast<uint8_t>((cp & 0x3f) | 0x80);
|
*(result++) = static_cast<uint8_t>((cp & 0x3f) | 0x80);
|
||||||
}
|
}
|
||||||
else { // four octets
|
else { // four octets
|
||||||
*(result++) = static_cast<uint8_t>((cp >> 18) | 0xf0);
|
*(result++) = static_cast<uint8_t>((cp >> 18) | 0xf0);
|
||||||
*(result++) = static_cast<uint8_t>((cp >> 12) | 0x80);
|
*(result++) = static_cast<uint8_t>((cp >> 12)& 0x3f | 0x80);
|
||||||
*(result++) = static_cast<uint8_t>((cp >> 6) | 0x80);
|
*(result++) = static_cast<uint8_t>((cp >> 6) & 0x3f | 0x80);
|
||||||
*(result++) = static_cast<uint8_t>((cp & 0x3f) | 0x80);
|
*(result++) = static_cast<uint8_t>((cp & 0x3f) | 0x80);
|
||||||
}
|
}
|
||||||
return result;
|
return result;
|
||||||
}
|
}
|
||||||
|
|
|
@ -0,0 +1,11 @@
|
||||||
|
#include "../../../source/utf8.h"
|
||||||
|
using namespace utf8;
|
||||||
|
|
||||||
|
// [ 1531740 ] utf8::append does not work correctly for some code points.
|
||||||
|
void id_1531740()
|
||||||
|
{
|
||||||
|
unsigned cp_u3044 = 0x3044U;
|
||||||
|
unsigned char u3044[] = {0x0, 0x0, 0x0, 0x0};
|
||||||
|
append(cp_u3044, u3044);
|
||||||
|
check (u3044[0] == 0xe3 && u3044[1] == 0x81 && u3044[2] == 0x84 && u3044[3] == 0);
|
||||||
|
}
|
|
@ -10,15 +10,26 @@ inline void check_impl (bool condition, const char* file, int line)
|
||||||
|
|
||||||
#define check(c) check_impl(c, __FILE__, __LINE__);
|
#define check(c) check_impl(c, __FILE__, __LINE__);
|
||||||
|
|
||||||
|
// Release 1.0 Beta 1
|
||||||
#include "r1_0Beta1/invalidutf8.h"
|
#include "r1_0Beta1/invalidutf8.h"
|
||||||
#include "r1_0Beta1/basic_functionality.h"
|
#include "r1_0Beta1/basic_functionality.h"
|
||||||
|
|
||||||
|
// Release 1.0 Beta 2
|
||||||
|
#include "r1_0Beta2/basic_functionality.h"
|
||||||
|
|
||||||
|
|
||||||
int main()
|
int main()
|
||||||
{
|
{
|
||||||
|
// Release 1.0 Beta 1
|
||||||
//r1_0Beta1/invalidutf8.h
|
//r1_0Beta1/invalidutf8.h
|
||||||
id_1524459();
|
id_1524459();
|
||||||
id_1525236();
|
id_1525236();
|
||||||
id_1528369();
|
id_1528369();
|
||||||
//r1_0Beta1/basic_functionality.h
|
//r1_0Beta1/basic_functionality.h
|
||||||
id_1528544();
|
id_1528544();
|
||||||
|
|
||||||
|
// Release 1.0 Beta 2
|
||||||
|
//r1_0Beta2/basic_functionality.h
|
||||||
|
id_1531740();
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
|
@ -17,6 +17,9 @@ int main()
|
||||||
end = append(0x65e5, u);
|
end = append(0x65e5, u);
|
||||||
assert (u[0] == 0xe6 && u[1] == 0x97 && u[2] == 0xa5 && u[3] == 0 && u[4] == 0);
|
assert (u[0] == 0xe6 && u[1] == 0x97 && u[2] == 0xa5 && u[3] == 0 && u[4] == 0);
|
||||||
|
|
||||||
|
end = append(0x3044, u);
|
||||||
|
assert (u[0] == 0xe3 && u[1] == 0x81 && u[2] == 0x84 && u[3] == 0 && u[4] == 0);
|
||||||
|
|
||||||
end = append(0x10346, u);
|
end = append(0x10346, u);
|
||||||
assert (u[0] == 0xf0 && u[1] == 0x90 && u[2] == 0x8d && u[3] == 0x86 && u[4] == 0);
|
assert (u[0] == 0xf0 && u[1] == 0x90 && u[2] == 0x8d && u[3] == 0x86 && u[4] == 0);
|
||||||
|
|
||||||
|
|
|
@ -29,8 +29,6 @@ int main(int argc, char** argv)
|
||||||
cout << "Could not open utf16.txt" << endl;
|
cout << "Could not open utf16.txt" << endl;
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
const unsigned short utf16_bom = 0xfeff;
|
|
||||||
fs16.write(reinterpret_cast<const char*>(&utf16_bom), sizeof(unsigned short));
|
|
||||||
|
|
||||||
// Read it line by line
|
// Read it line by line
|
||||||
unsigned int line_count = 0;
|
unsigned int line_count = 0;
|
||||||
|
@ -53,6 +51,96 @@ int main(int argc, char** argv)
|
||||||
utf8to16(line_start, line_end, back_inserter(utf16_line));
|
utf8to16(line_start, line_end, back_inserter(utf16_line));
|
||||||
utf16_line.push_back('\n');
|
utf16_line.push_back('\n');
|
||||||
fs16.write(reinterpret_cast<const char*>(&utf16_line[0]), utf16_line.size() * sizeof (unsigned short));
|
fs16.write(reinterpret_cast<const char*>(&utf16_line[0]), utf16_line.size() * sizeof (unsigned short));
|
||||||
|
utf16_line.pop_back(); // get rid of '\n'
|
||||||
|
|
||||||
|
// Back to utf-8 and compare it to the original line.
|
||||||
|
string back_to_utf8;
|
||||||
|
utf16to8(utf16_line.begin(), utf16_line.end(), back_inserter(back_to_utf8));
|
||||||
|
if (back_to_utf8.compare(string(line_start, line_end)) != 0)
|
||||||
|
cout << "Line " << line_count << ": Conversion to UTF-16 and back failed" << '\n';
|
||||||
|
|
||||||
|
// Now, convert it to utf-32, back to utf-8 and compare
|
||||||
|
vector <unsigned> utf32_line;
|
||||||
|
utf8to32(line_start, line_end, back_inserter(utf32_line));
|
||||||
|
back_to_utf8.clear();
|
||||||
|
utf32to8(utf32_line.begin(), utf32_line.end(), back_inserter(back_to_utf8));
|
||||||
|
if (back_to_utf8.compare(string(line_start, line_end)) != 0)
|
||||||
|
cout << "Line " << line_count << ": Conversion to UTF-32 and back failed" << '\n';
|
||||||
|
|
||||||
|
// Now, iterate and back
|
||||||
|
unsigned char_count = 0;
|
||||||
|
string::iterator it = line_start;
|
||||||
|
while (it != line_end) {
|
||||||
|
unsigned cp = next(it, line_end);
|
||||||
|
char_count++;
|
||||||
|
}
|
||||||
|
if (char_count != utf32_line.size())
|
||||||
|
cout << "Line " << line_count << ": Error in iterating with next - wrong number of characters" << '\n';
|
||||||
|
|
||||||
|
string::iterator adv_it = line_start;
|
||||||
|
utf8::advance(adv_it, char_count, line_end);
|
||||||
|
if (adv_it != line_end)
|
||||||
|
cout << "Line " << line_count << ": Error in advance function" << '\n';
|
||||||
|
|
||||||
|
if (utf8::distance(line_start, line_end) != char_count)
|
||||||
|
cout << "Line " << line_count << ": Error in distance function" << '\n';
|
||||||
|
|
||||||
|
while (it != line_start) {
|
||||||
|
unsigned cp = previous(it, line.rend().base());
|
||||||
|
char_count--;
|
||||||
|
}
|
||||||
|
if (char_count != 0)
|
||||||
|
cout << "Line " << line_count << ": Error in iterating with previous - wrong number of characters" << '\n';
|
||||||
|
|
||||||
|
//======================== Now, the unchecked versions ======================
|
||||||
|
// Convert it to utf-16 and compare to the checked version
|
||||||
|
vector<unsigned short> utf16_line_unchecked;
|
||||||
|
unchecked::utf8to16(line_start, line_end, back_inserter(utf16_line_unchecked));
|
||||||
|
|
||||||
|
if (utf16_line != utf16_line_unchecked)
|
||||||
|
cout << "Line " << line_count << ": Error in unchecked::utf8to16" << '\n';
|
||||||
|
|
||||||
|
// Back to utf-8 and compare it to the original line.
|
||||||
|
back_to_utf8.clear();
|
||||||
|
unchecked::utf16to8(utf16_line_unchecked.begin(), utf16_line_unchecked.end(), back_inserter(back_to_utf8));
|
||||||
|
if (back_to_utf8.compare(string(line_start, line_end)) != 0)
|
||||||
|
cout << "Line " << line_count << ": Unchecked conversion to UTF-16 and back failed" << '\n';
|
||||||
|
|
||||||
|
// Now, convert it to utf-32, back to utf-8 and compare
|
||||||
|
vector <unsigned> utf32_line_unchecked;
|
||||||
|
unchecked::utf8to32(line_start, line_end, back_inserter(utf32_line_unchecked));
|
||||||
|
if (utf32_line != utf32_line_unchecked)
|
||||||
|
cout << "Line " << line_count << ": Error in unchecked::utf8to32" << '\n';
|
||||||
|
|
||||||
|
back_to_utf8.clear();
|
||||||
|
unchecked::utf32to8(utf32_line.begin(), utf32_line.end(), back_inserter(back_to_utf8));
|
||||||
|
if (back_to_utf8.compare(string(line_start, line_end)) != 0)
|
||||||
|
cout << "Line " << line_count << ": Unchecked conversion to UTF-32 and back failed" << '\n';
|
||||||
|
|
||||||
|
// Now, iterate and back
|
||||||
|
char_count = 0;
|
||||||
|
it = line_start;
|
||||||
|
while (it != line_end) {
|
||||||
|
unsigned cp = unchecked::next(it);
|
||||||
|
char_count++;
|
||||||
|
}
|
||||||
|
if (char_count != utf32_line.size())
|
||||||
|
cout << "Line " << line_count << ": Error in iterating with unchecked::next - wrong number of characters" << '\n';
|
||||||
|
|
||||||
|
adv_it = line_start;
|
||||||
|
utf8::unchecked::advance(adv_it, char_count);
|
||||||
|
if (adv_it != line_end)
|
||||||
|
cout << "Line " << line_count << ": Error in unchecked::advance function" << '\n';
|
||||||
|
|
||||||
|
if (utf8::unchecked::distance(line_start, line_end) != char_count)
|
||||||
|
cout << "Line " << line_count << ": Error in unchecked::distance function" << '\n';
|
||||||
|
|
||||||
|
while (it != line_start) {
|
||||||
|
unsigned cp = unchecked::previous(it);
|
||||||
|
char_count--;
|
||||||
|
}
|
||||||
|
if (char_count != 0)
|
||||||
|
cout << "Line " << line_count << ": Error in iterating with unchecked::previous - wrong number of characters" << '\n';
|
||||||
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
Loading…
Add table
Reference in a new issue