Compare commits
3 commits
Author | SHA1 | Date | |
---|---|---|---|
|
b65aeffc11 | ||
740e7e75b7 | |||
8ea47e8799 |
7 changed files with 172 additions and 34 deletions
|
@ -1,34 +0,0 @@
|
||||||
// Copyright 2006 Nemanja Trifunovic
|
|
||||||
|
|
||||||
/*
|
|
||||||
Permission is hereby granted, free of charge, to any person or organization
|
|
||||||
obtaining a copy of the software and accompanying documentation covered by
|
|
||||||
this license (the "Software") to use, reproduce, display, distribute,
|
|
||||||
execute, and transmit the Software, and to prepare derivative works of the
|
|
||||||
Software, and to permit third-parties to whom the Software is furnished to
|
|
||||||
do so, all subject to the following:
|
|
||||||
|
|
||||||
The copyright notices in the Software and this entire statement, including
|
|
||||||
the above license grant, this restriction and the following disclaimer,
|
|
||||||
must be included in all copies of the Software, in whole or in part, and
|
|
||||||
all derivative works of the Software, unless such copies or derivative
|
|
||||||
works are solely in the form of machine-executable object code generated by
|
|
||||||
a source language processor.
|
|
||||||
|
|
||||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
||||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
||||||
FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT
|
|
||||||
SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE
|
|
||||||
FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE,
|
|
||||||
ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
|
|
||||||
DEALINGS IN THE SOFTWARE.
|
|
||||||
*/
|
|
||||||
|
|
||||||
|
|
||||||
#ifndef UTF8_FOR_CPP_2675DCD0_9480_4c0c_B92A_CC14C027B731
|
|
||||||
#define UTF8_FOR_CPP_2675DCD0_9480_4c0c_B92A_CC14C027B731
|
|
||||||
|
|
||||||
#include "utf8/checked.h"
|
|
||||||
#include "utf8/unchecked.h"
|
|
||||||
|
|
||||||
#endif // header guard
|
|
147
src/utf8.h
Normal file
147
src/utf8.h
Normal file
|
@ -0,0 +1,147 @@
|
||||||
|
// Copyright 2006-2013 Nemanja Trifunovic
|
||||||
|
|
||||||
|
/*
|
||||||
|
Permission is hereby granted, free of charge, to any person or organization
|
||||||
|
obtaining a copy of the software and accompanying documentation covered by
|
||||||
|
this license (the "Software") to use, reproduce, display, distribute,
|
||||||
|
execute, and transmit the Software, and to prepare derivative works of the
|
||||||
|
Software, and to permit third-parties to whom the Software is furnished to
|
||||||
|
do so, all subject to the following:
|
||||||
|
|
||||||
|
The copyright notices in the Software and this entire statement, including
|
||||||
|
the above license grant, this restriction and the following disclaimer,
|
||||||
|
must be included in all copies of the Software, in whole or in part, and
|
||||||
|
all derivative works of the Software, unless such copies or derivative
|
||||||
|
works are solely in the form of machine-executable object code generated by
|
||||||
|
a source language processor.
|
||||||
|
|
||||||
|
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||||
|
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||||
|
FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT
|
||||||
|
SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE
|
||||||
|
FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE,
|
||||||
|
ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
|
||||||
|
DEALINGS IN THE SOFTWARE.
|
||||||
|
*/
|
||||||
|
|
||||||
|
|
||||||
|
#ifndef UTF8_FOR_CPP_2675DCD0_9480_4c0c_B92A_CC14C027B731
|
||||||
|
#define UTF8_FOR_CPP_2675DCD0_9480_4c0c_B92A_CC14C027B731
|
||||||
|
|
||||||
|
// By default, utf8 cpp requires C++ Standard Library strings and exceptions
|
||||||
|
// The following macros can be used to change the default behavior
|
||||||
|
|
||||||
|
// #define UTF_CPP_NO_STD_STRING
|
||||||
|
// #define UTF_CPP_NO_EXCEPTIONS
|
||||||
|
|
||||||
|
#ifndef UTF_CPP_NO_EXCEPTIONS
|
||||||
|
#include <stdexcept>
|
||||||
|
|
||||||
|
#ifndef UTF_CPP_NO_STD_STRING
|
||||||
|
#include <string>
|
||||||
|
#include <iterator>
|
||||||
|
|
||||||
|
#endif // #ifndef UTF_CPP_NO_STD_STRING
|
||||||
|
#endif // #ifndef UTF_CPP_NO_EXCEPTIONS
|
||||||
|
|
||||||
|
namespace utf8
{

    // Error codes - used internally and reported through the `error`
    // out-parameter of the non-throwing overloads, which are the only overloads
    // compiled when UTF_CPP_NO_EXCEPTIONS is defined.
    enum class utf_error {UTF8_OK, NOT_ENOUGH_ROOM, INVALID_LEAD,
                          INCOMPLETE_SEQUENCE, OVERLONG_SEQUENCE, INVALID_CODE_POINT};

#ifndef UTF_CPP_NO_EXCEPTIONS
    // Base for the exceptions that may be thrown from the library.
    // Catch this type to handle every library-specific exception at once.
    class exception : public ::std::exception {
    };

    // Exceptions that may be thrown from the library functions.

    // Thrown when a value that is not a valid Unicode scalar value
    // (a surrogate, or anything above U+10FFFF) is passed for encoding.
    class invalid_code_point : public exception {
        char32_t cp;    // the offending value, kept for diagnostics
    public:
        invalid_code_point(char32_t cp) : cp(cp) {}
        const char* what() const noexcept override { return "Invalid code point"; }
        // Returns the value that was rejected.
        char32_t code_point() const noexcept { return cp; }
    };

#endif // #ifndef UTF_CPP_NO_EXCEPTIONS

// Helper code - not intended to be directly called by the library users. May be changed at any time
namespace internal
{
    // Unicode constants
    // Leading (high) surrogates: 0xd800 - 0xdbff
    // Trailing (low) surrogates: 0xdc00 - 0xdfff
    constexpr char32_t LEAD_SURROGATE_MIN  = 0x0000d800;
    constexpr char32_t LEAD_SURROGATE_MAX  = 0x0000dbff;
    constexpr char32_t TRAIL_SURROGATE_MIN = 0x0000dc00;
    constexpr char32_t TRAIL_SURROGATE_MAX = 0x0000dfff;

    // Maximum valid value for a Unicode code point
    constexpr char32_t CODE_POINT_MAX      = 0x0010ffff;

    // True when cp lies anywhere in the surrogate range 0xd800 - 0xdfff.
    // The lead and trail ranges are adjacent, so one range check covers both.
    inline bool is_surrogate(char32_t cp)
    {
        return (cp >= LEAD_SURROGATE_MIN && cp <= TRAIL_SURROGATE_MAX);
    }

    // True when cp may be encoded: not above CODE_POINT_MAX and not a surrogate.
    inline bool is_code_point_valid(char32_t cp)
    {
        return (cp <= CODE_POINT_MAX && !utf8::internal::is_surrogate(cp));
    }
} // namespace internal

    /// The library API - functions intended to be called by the users

    // Encodes cp as UTF-8 and writes the resulting one to four octets through
    // result (non-throwing overload).
    //
    //   cp     - code point to encode
    //   result - output iterator that receives the octets
    //   error  - set to INVALID_CODE_POINT when cp is a surrogate or exceeds
    //            0x10ffff; it is NOT written on success, so initialise it to
    //            UTF8_OK before the call
    //
    // Returns the iterator past the last octet written. When cp is rejected
    // nothing is written and result is returned unchanged.
    template <typename octet_iterator>
    octet_iterator append(char32_t cp, octet_iterator result, utf_error& error)
    {
        if (!utf8::internal::is_code_point_valid(cp)) {
            error = utf8::utf_error::INVALID_CODE_POINT;
            return result;
        }

        if (cp < 0x80)                        // one octet
            *(result++) = static_cast<char>(cp);
        else if (cp < 0x800) {                // two octets
            *(result++) = static_cast<char>((cp >> 6)            | 0xc0);
            *(result++) = static_cast<char>((cp & 0x3f)          | 0x80);
        }
        else if (cp < 0x10000) {              // three octets
            *(result++) = static_cast<char>((cp >> 12)           | 0xe0);
            *(result++) = static_cast<char>(((cp >> 6) & 0x3f)   | 0x80);
            *(result++) = static_cast<char>((cp & 0x3f)          | 0x80);
        }
        else {                                // four octets
            *(result++) = static_cast<char>((cp >> 18)           | 0xf0);
            *(result++) = static_cast<char>(((cp >> 12) & 0x3f)  | 0x80);
            *(result++) = static_cast<char>(((cp >> 6) & 0x3f)   | 0x80);
            *(result++) = static_cast<char>((cp & 0x3f)          | 0x80);
        }
        return result;
    }

#ifndef UTF_CPP_NO_EXCEPTIONS
    // Encodes cp as UTF-8 and writes the octets through result (throwing overload).
    //
    // Returns the iterator past the last octet written.
    // Throws utf8::invalid_code_point when cp is a surrogate or exceeds 0x10ffff;
    // nothing is written in that case.
    template <typename octet_iterator>
    octet_iterator append(char32_t cp, octet_iterator result)
    {
        utf8::utf_error err {utf8::utf_error::UTF8_OK};
        // Keep the iterator handed back by the worker: `result` was passed by
        // value, so it still designates the position *before* the new octets.
        octet_iterator end = utf8::append(cp, result, err);
        if (err != utf8::utf_error::UTF8_OK)
            throw utf8::invalid_code_point(cp);
        return end;
    }

#ifndef UTF_CPP_NO_STD_STRING
    // Appends the UTF-8 encoding of cp to str.
    // Throws utf8::invalid_code_point when cp cannot be encoded; str is left
    // unmodified in that case.
    inline void append(char32_t cp, std::string& str)
    {
        utf8::append(cp, std::back_inserter(str));
    }
#endif // #ifndef UTF_CPP_NO_STD_STRING
#endif // #ifndef UTF_CPP_NO_EXCEPTIONS

} // namespace utf8
|
||||||
|
|
||||||
|
#endif // header guard
|
||||||
|
|
6
tests/Makefile
Normal file
6
tests/Makefile
Normal file
|
@ -0,0 +1,6 @@
|
||||||
|
# Builds and runs the Boost.Test unit tests for ../src/utf8.h.
# CXX/CXXFLAGS are the conventional make variables for a C++ compiler;
# CC/CFLAGS are reserved for C.
CXX = g++
CXXFLAGS = -g -Wall -std=c++11

# `smoketest` names an action, not a file: declare it phony so that a stray
# file called "smoketest" can never stop the tests from running.
.PHONY: smoketest
smoketest: unit
	./unit

# The test binary is rebuilt only when the test source or the header changes.
unit: unit.cpp ../src/utf8.h
	$(CXX) $(CXXFLAGS) unit.cpp -o unit -lboost_unit_test_framework
|
19
tests/unit.cpp
Normal file
19
tests/unit.cpp
Normal file
|
@ -0,0 +1,19 @@
|
||||||
|
#define BOOST_TEST_DYN_LINK
|
||||||
|
#define BOOST_TEST_MODULE UTF8_CPP_UNIT
|
||||||
|
#include <boost/test/unit_test.hpp>
|
||||||
|
|
||||||
|
#include "../src/utf8.h"
|
||||||
|
using namespace std;
|
||||||
|
|
||||||
|
// Exercises utf8::append(char32_t, std::string&) with one two-octet and one
// three-octet code point and compares the produced octets with the expected
// UTF-8 sequences.
BOOST_AUTO_TEST_CASE(append)
{
    string encoded;

    // U+0448 (Cyrillic small letter sha) -> D1 88
    BOOST_CHECK_NO_THROW(utf8::append(U'\U00000448', encoded));
    BOOST_CHECK(encoded == string("\xd1\x88"));

    encoded.clear();

    // U+65E5 (CJK ideograph "sun/day") -> E6 97 A5
    BOOST_CHECK_NO_THROW(utf8::append(U'\U000065e5', encoded));
    BOOST_CHECK(encoded == string("\xe6\x97\xa5"));
}
|
||||||
|
|
||||||
|
|
Loading…
Reference in a new issue