Removing bidirectional restrictions for the octet_iterator
git-svn-id: http://svn.code.sf.net/p/utfcpp/code@87 a809a056-fc17-0410-9590-b4f493f8b08e
This commit is contained in:
parent
dacd49dde9
commit
5748eeff08
4 changed files with 27 additions and 27 deletions
|
@ -104,7 +104,8 @@ namespace internal
|
|||
|
||||
template <typename octet_iterator>
|
||||
utf_error validate_next(octet_iterator& it, octet_iterator end, uint32_t* code_point)
|
||||
{
|
||||
{
|
||||
octet_iterator original_it = it;
|
||||
uint32_t cp = mask8(*it);
|
||||
// Check the lead octet
|
||||
typedef typename std::iterator_traits<octet_iterator>::difference_type octet_difference_type;
|
||||
|
@ -112,7 +113,7 @@ namespace internal
|
|||
|
||||
// "Shortcut" for ASCII characters
|
||||
if (length == 1) {
|
||||
if (end - it > 0) {
|
||||
if (std::distance(it, end) > 0) {
|
||||
if (code_point)
|
||||
*code_point = cp;
|
||||
++it;
|
||||
|
@ -136,7 +137,7 @@ namespace internal
|
|||
cp = ((cp << 6) & 0x7ff) + ((*it) & 0x3f);
|
||||
}
|
||||
else {
|
||||
--it;
|
||||
it = original_it;
|
||||
return INCOMPLETE_SEQUENCE;
|
||||
}
|
||||
break;
|
||||
|
@ -147,12 +148,12 @@ namespace internal
|
|||
cp += (*it) & 0x3f;
|
||||
}
|
||||
else {
|
||||
std::advance(it, -2);
|
||||
it = original_it;
|
||||
return INCOMPLETE_SEQUENCE;
|
||||
}
|
||||
}
|
||||
else {
|
||||
--it;
|
||||
it = original_it;
|
||||
return INCOMPLETE_SEQUENCE;
|
||||
}
|
||||
break;
|
||||
|
@ -165,17 +166,17 @@ namespace internal
|
|||
cp += (*it) & 0x3f;
|
||||
}
|
||||
else {
|
||||
std::advance(it, -3);
|
||||
it = original_it;
|
||||
return INCOMPLETE_SEQUENCE;
|
||||
}
|
||||
}
|
||||
else {
|
||||
std::advance(it, -2);
|
||||
it = original_it;
|
||||
return INCOMPLETE_SEQUENCE;
|
||||
}
|
||||
}
|
||||
else {
|
||||
--it;
|
||||
it = original_it;
|
||||
return INCOMPLETE_SEQUENCE;
|
||||
}
|
||||
break;
|
||||
|
@ -183,7 +184,7 @@ namespace internal
|
|||
// Is the code point valid?
|
||||
if (!is_code_point_valid(cp)) {
|
||||
for (octet_difference_type i = 0; i < length - 1; ++i)
|
||||
--it;
|
||||
it = original_it;
|
||||
return INVALID_CODE_POINT;
|
||||
}
|
||||
|
||||
|
@ -192,19 +193,19 @@ namespace internal
|
|||
|
||||
if (cp < 0x80) {
|
||||
if (length != 1) {
|
||||
std::advance(it, -(length-1));
|
||||
it = original_it;
|
||||
return OVERLONG_SEQUENCE;
|
||||
}
|
||||
}
|
||||
else if (cp < 0x800) {
|
||||
if (length != 2) {
|
||||
std::advance(it, -(length-1));
|
||||
it = original_it;
|
||||
return OVERLONG_SEQUENCE;
|
||||
}
|
||||
}
|
||||
else if (cp < 0x10000) {
|
||||
if (length != 3) {
|
||||
std::advance(it, -(length-1));
|
||||
it = original_it;
|
||||
return OVERLONG_SEQUENCE;
|
||||
}
|
||||
}
|
||||
|
|
|
@ -7,8 +7,8 @@ using namespace utf8;
|
|||
#include <algorithm>
|
||||
using namespace std;
|
||||
|
||||
const char* TEST_FILE_PATH = "../../test_data/negative/utf8_invalid.txt";
|
||||
const unsigned INVALID_LINES[] = { 75, 76, 82, 83, 84, 85, 93, 102, 103, 105, 106, 107, 108, 109, 110, 114, 115, 116, 117, 124, 125, 130, 135, 140, 145, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 169, 175, 176, 177, 207, 208, 209, 210, 211, 220, 221, 222, 223, 224, 232, 233, 234, 235, 236, 247, 248, 249, 250, 251, 252, 253, 257, 258, 259, 260, 261, 262, 263, 264, 268, 269};
|
||||
const char* TEST_FILE_PATH = "../../../test_data/negative/utf8_invalid.txt";
|
||||
const unsigned INVALID_LINES[] = { 75, 76, 82, 83, 84, 85, 93, 102, 103, 105, 106, 107, 108, 109, 110, 114, 115, 116, 117, 124, 125, 130, 135, 140, 145, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 169, 175, 176, 177, 207, 208, 209, 210, 211, 220, 221, 222, 223, 224, 232, 233, 234, 235, 236, 247, 248, 249, 250, 251, 252, 253, 257, 258, 259, 260, 261, 262, 263, 264, 268, 269};
|
||||
const unsigned* INVALID_LINES_END = INVALID_LINES + sizeof(INVALID_LINES)/sizeof(unsigned);
|
||||
|
||||
int main()
|
||||
|
@ -25,14 +25,14 @@ int main()
|
|||
char byte;
|
||||
while (!fs8.eof()) {
|
||||
string line;
|
||||
while ((byte = static_cast<char>(fs8.get())) != '\n' && !fs8.eof())
|
||||
while ((byte = static_cast<char>(fs8.get())) != '\n' && !fs8.eof())
|
||||
line.push_back(byte);
|
||||
|
||||
line_count++;
|
||||
// Print out lines that contain invalid UTF-8
|
||||
if (!is_valid(line.begin(), line.end())) {
|
||||
const unsigned* u = find(INVALID_LINES, INVALID_LINES_END, line_count);
|
||||
if (u == INVALID_LINES_END)
|
||||
if (u == INVALID_LINES_END)
|
||||
cout << "Unexpected invalid utf-8 at line " << line_count << '\n';
|
||||
|
||||
// try fixing it:
|
||||
|
|
|
@ -8,8 +8,8 @@ struct timer {
|
|||
using namespace std;
|
||||
end = clock();
|
||||
unsigned milliseconds = (end - start)*1000 / CLOCKS_PER_SEC;
|
||||
report << "Spent " << milliseconds << "ms here\n";
|
||||
}
|
||||
report << "Spent " << milliseconds << "ms here\n";
|
||||
}
|
||||
|
||||
std::clock_t start;
|
||||
std::clock_t end;
|
||||
|
@ -17,5 +17,5 @@ struct timer {
|
|||
|
||||
private:
|
||||
// just to suppress a VC++ 8.0 warning
|
||||
timer& operator = (const timer&) {};
|
||||
timer& operator = (const timer&);
|
||||
};
|
||||
|
|
|
@ -53,30 +53,29 @@ int main(int argc, char** argv)
|
|||
timer t(cout);
|
||||
utf8::unchecked::utf8to16(buf, buf + length, utf16buf);
|
||||
}
|
||||
|
||||
// the UTF-16 result will not be larger than this (I hope :) )
|
||||
wchar_t* utf16iconvbuf = new wchar_t[wlength];
|
||||
{
|
||||
memset (utf16iconvbuf, 0 , wlength * sizeof(wchar_t));
|
||||
// win32
|
||||
cout << "win32: ";
|
||||
|
||||
|
||||
{
|
||||
timer t(cout);
|
||||
MultiByteToWideChar(CP_UTF8, MB_ERR_INVALID_CHARS, buf, length, utf16iconvbuf, int(wlength));
|
||||
}
|
||||
|
||||
|
||||
}
|
||||
|
||||
// just check the correctness while we are here:
|
||||
if (!equal(utf16buf, utf16buf + wlength, utf16iconvbuf))
|
||||
if (!equal(utf16buf, utf16buf + wlength, utf16iconvbuf))
|
||||
cout << "Different result!!!";
|
||||
|
||||
|
||||
// the other way around
|
||||
cout << "UTF16 to UTF-8\n";
|
||||
{
|
||||
//win32
|
||||
memset(buf, 0, length);
|
||||
memset(buf, 0, length);
|
||||
cout<< "win32: ";
|
||||
|
||||
{
|
||||
|
@ -92,14 +91,14 @@ int main(int argc, char** argv)
|
|||
timer t(cout);
|
||||
utf8::unchecked::utf16to8(utf16buf, utf16buf + wlength, buf);
|
||||
}
|
||||
|
||||
|
||||
{
|
||||
memset (buf, 0 , length);
|
||||
cout << "utf16to8: ";
|
||||
timer t(cout);
|
||||
utf8::utf16to8(utf16buf, utf16buf + wlength, buf);
|
||||
}
|
||||
|
||||
|
||||
delete [] buf;
|
||||
delete [] utf16buf;
|
||||
}
|
||||
|
|
Loading…
Reference in a new issue