First check in for branch 3.x - playing with utf8::append

git-svn-id: http://svn.code.sf.net/p/utfcpp/code@140 a809a056-fc17-0410-9590-b4f493f8b08e
This commit is contained in:
King_DuckZ 2014-06-01 02:22:29 +02:00
parent 8ea47e8799
commit 740e7e75b7
3 changed files with 142 additions and 3 deletions

View file

@ -1,4 +1,4 @@
// Copyright 2006 Nemanja Trifunovic
// Copyright 2006-2013 Nemanja Trifunovic
/*
Permission is hereby granted, free of charge, to any person or organization
@ -28,7 +28,120 @@ DEALINGS IN THE SOFTWARE.
#ifndef UTF8_FOR_CPP_2675DCD0_9480_4c0c_B92A_CC14C027B731
#define UTF8_FOR_CPP_2675DCD0_9480_4c0c_B92A_CC14C027B731
#include "utf8/checked.h"
#include "utf8/unchecked.h"
// By default, utf8 cpp requires C++ Standard Library strings and exceptions
// The following macros can be used to change the default behavior
// #define UTF_CPP_NO_STD_STRING
// #define UTF_CPP_NO_EXCEPTIONS
#ifndef UTF_CPP_NO_EXCEPTIONS
#include <stdexcept>
#ifndef UTF_CPP_NO_STD_STRING
#include <string>
#include <iterator>
#endif // #ifndef UTF_CPP_NO_STD_STRING
#endif // #ifndef UTF_CPP_NO_EXCEPTIONS
namespace utf8
{
    /// Error codes - used internally, and reported to the caller through the
    /// `utf_error&` overloads when exceptions are disabled.
    enum class utf_error {UTF8_OK, NOT_ENOUGH_ROOM, INVALID_LEAD,
                          INCOMPLETE_SEQUENCE, OVERLONG_SEQUENCE, INVALID_CODE_POINT};

#ifndef UTF_CPP_NO_EXCEPTIONS
    /// Base for the exceptions that may be thrown from the library.
    class exception : public ::std::exception {
    };

    /// Thrown when a value that is not a valid Unicode code point is passed
    /// to a throwing library function. Carries the offending value.
    class invalid_code_point : public exception {
        char32_t cp;
    public:
        invalid_code_point(char32_t cp) : cp(cp) {}
        const char* what() const noexcept override { return "Invalid code point"; }
        /// The rejected code point value.
        char32_t code_point() const {return cp;}
    };
#endif // #ifndef UTF_CPP_NO_EXCEPTIONS

    // Helper code - not intended to be directly called by the library users. May be changed at any time
    namespace internal
    {
        // Unicode constants
        // Leading (high) surrogates: 0xd800 - 0xdbff
        // Trailing (low) surrogates: 0xdc00 - 0xdfff
        const char32_t LEAD_SURROGATE_MIN  = 0x0000d800;
        const char32_t LEAD_SURROGATE_MAX  = 0x0000dbff;
        const char32_t TRAIL_SURROGATE_MIN = 0x0000dc00;
        const char32_t TRAIL_SURROGATE_MAX = 0x0000dfff;

        // Maximum valid value for a Unicode code point
        const char32_t CODE_POINT_MAX      = 0x0010ffff;

        /// True if cp lies anywhere in the surrogate range 0xd800 - 0xdfff.
        /// The lead and trail ranges are contiguous, so one range test covers both.
        inline bool is_surrogate(char32_t cp)
        {
            return (cp >= LEAD_SURROGATE_MIN && cp <= TRAIL_SURROGATE_MAX);
        }

        /// True if cp is not above CODE_POINT_MAX and is not a surrogate.
        inline bool is_code_point_valid(char32_t cp)
        {
            return (cp <= CODE_POINT_MAX && !utf8::internal::is_surrogate(cp));
        }
    } // namespace internal

    /// The library API - functions intended to be called by the users

    /// Encodes cp as UTF-8 and writes the resulting 1-4 octets through result.
    ///
    /// @param cp      code point to encode
    /// @param result  output iterator that receives the octets
    /// @param error   set to utf_error::INVALID_CODE_POINT if cp is rejected;
    ///                it is NOT modified on success, so initialize it to
    ///                utf_error::UTF8_OK before the call
    /// @return iterator past the last octet written; the unchanged result
    ///         (nothing written) if cp is rejected
    template <typename octet_iterator>
    octet_iterator append(char32_t cp, octet_iterator result, utf_error& error)
    {
        if (!utf8::internal::is_code_point_valid(cp)) {
            error = utf8::utf_error::INVALID_CODE_POINT;
            return result;
        }

        if (cp < 0x80)                        // one octet
            *(result++) = static_cast<char>(cp);
        else if (cp < 0x800) {                // two octets
            *(result++) = static_cast<char>((cp >> 6)           | 0xc0);
            *(result++) = static_cast<char>((cp & 0x3f)         | 0x80);
        }
        else if (cp < 0x10000) {              // three octets
            *(result++) = static_cast<char>((cp >> 12)          | 0xe0);
            *(result++) = static_cast<char>(((cp >> 6) & 0x3f)  | 0x80);
            *(result++) = static_cast<char>((cp & 0x3f)         | 0x80);
        }
        else {                                // four octets
            *(result++) = static_cast<char>((cp >> 18)          | 0xf0);
            *(result++) = static_cast<char>(((cp >> 12) & 0x3f) | 0x80);
            *(result++) = static_cast<char>(((cp >> 6) & 0x3f)  | 0x80);
            *(result++) = static_cast<char>((cp & 0x3f)         | 0x80);
        }
        return result;
    }

#ifndef UTF_CPP_NO_EXCEPTIONS
    /// Encodes cp as UTF-8 and writes the resulting octets through result.
    ///
    /// @param cp      code point to encode
    /// @param result  output iterator that receives the octets
    /// @return iterator past the last octet written
    /// @throws utf8::invalid_code_point if cp is not a valid code point
    template <typename octet_iterator>
    octet_iterator append(char32_t cp, octet_iterator result)
    {
        utf8::utf_error err {utf8::utf_error::UTF8_OK};
        // Keep the iterator returned by the worker: `result` is passed by value,
        // so the local copy here is not advanced by the call.
        octet_iterator end = utf8::append(cp, result, err);
        if (err != utf8::utf_error::UTF8_OK)
            throw utf8::invalid_code_point(cp);
        return end;
    }

#ifndef UTF_CPP_NO_STD_STRING
    /// Appends the UTF-8 encoding of cp to the end of str.
    ///
    /// @throws utf8::invalid_code_point if cp is not a valid code point
    inline void append(char32_t cp, std::string& str)
    {
        utf8::append(cp, std::back_inserter(str));
    }
#endif // #ifndef UTF_CPP_NO_STD_STRING
#endif // #ifndef UTF_CPP_NO_EXCEPTIONS
} // namespace utf8
#endif // header guard

6
tests/Makefile Normal file
View file

@ -0,0 +1,6 @@
# Compiler and flags for the unit tests. C++11 is needed because src/utf8.h
# uses char32_t, enum class and noexcept.
CC = g++
CFLAGS = -g -Wall --std=c++11

# smoketest names an action, not a file: never treat a stray file called
# "smoketest" as an up-to-date target.
.PHONY: smoketest

# Build the unit test executable and run it immediately.
smoketest: unit.cpp ../src/utf8.h
	$(CC) $(CFLAGS) unit.cpp -ounit
	./unit

20
tests/unit.cpp Normal file
View file

@ -0,0 +1,20 @@
#include <assert.h>
#include "../src/utf8.h"
using namespace std;
int main()
{
// append
{
string s;
utf8::append(U'\U00000448', s);
assert (s.length() == 2 && s[0] == '\xd1' && s[1] == '\x88');
s.erase();
utf8::append(U'\U000065e5', s);
assert (s.length() == 3 && s[0] == '\xe6' && s[1] == '\x97' && s[2] == '\xa5');
}
}