diff --git a/v3_0/src/utf8.h b/v3_0/src/utf8.h new file mode 100644 index 0000000..6749c57 --- /dev/null +++ b/v3_0/src/utf8.h @@ -0,0 +1,147 @@ +// Copyright 2006-2013 Nemanja Trifunovic + +/* +Permission is hereby granted, free of charge, to any person or organization +obtaining a copy of the software and accompanying documentation covered by +this license (the "Software") to use, reproduce, display, distribute, +execute, and transmit the Software, and to prepare derivative works of the +Software, and to permit third-parties to whom the Software is furnished to +do so, all subject to the following: + +The copyright notices in the Software and this entire statement, including +the above license grant, this restriction and the following disclaimer, +must be included in all copies of the Software, in whole or in part, and +all derivative works of the Software, unless such copies or derivative +works are solely in the form of machine-executable object code generated by +a source language processor. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT +SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE +FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE, +ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +DEALINGS IN THE SOFTWARE. 
+*/
+
+
+#ifndef UTF8_FOR_CPP_2675DCD0_9480_4c0c_B92A_CC14C027B731
+#define UTF8_FOR_CPP_2675DCD0_9480_4c0c_B92A_CC14C027B731
+
+// By default, utf8 cpp requires C++ Standard Library strings and exceptions
+// The following macros can be used to change the default behavior
+
+// #define UTF_CPP_NO_STD_STRING
+// #define UTF_CPP_NO_EXCEPTIONS
+
+#ifndef UTF_CPP_NO_EXCEPTIONS
+#include <exception>
+
+#ifndef UTF_CPP_NO_STD_STRING
+#include <string>
+#include <iterator>
+
+#endif // #ifndef UTF_CPP_NO_STD_STRING
+#endif // #ifndef UTF_CPP_NO_EXCEPTIONS
+
+namespace utf8
+{
+
+// Error codes - used internally and if exceptions disabled
+enum class utf_error {UTF8_OK, NOT_ENOUGH_ROOM, INVALID_LEAD,
+                      INCOMPLETE_SEQUENCE, OVERLONG_SEQUENCE, INVALID_CODE_POINT};
+
+#ifndef UTF_CPP_NO_EXCEPTIONS
+// Base for the exceptions that may be thrown from the library
+class exception : public ::std::exception {
+};
+// Exceptions that may be thrown from the library functions.
+// invalid_code_point: thrown by the throwing append() overloads; code_point() returns the rejected value.
+class invalid_code_point : public exception {
+    char32_t cp;
+public:
+    invalid_code_point(char32_t cp) : cp(cp) {}
+    const char* what() const noexcept override { return "Invalid code point"; }
+    char32_t code_point() const {return cp;}
+};
+
+#endif // #ifndef UTF_CPP_NO_EXCEPTIONS
+
+// Helper code - not intended to be directly called by the library users. May be changed at any time
+namespace internal
+{
+    // Unicode constants
+    // Leading (high) surrogates: 0xd800 - 0xdbff
+    // Trailing (low) surrogates: 0xdc00 - 0xdfff
+    const char32_t LEAD_SURROGATE_MIN  = 0x0000d800;
+    const char32_t LEAD_SURROGATE_MAX  = 0x0000dbff;
+    const char32_t TRAIL_SURROGATE_MIN = 0x0000dc00;
+    const char32_t TRAIL_SURROGATE_MAX = 0x0000dfff;
+
+    // Maximum valid value for a Unicode code point
+    const char32_t CODE_POINT_MAX      = 0x0010ffff;
+
+    // True if cp lies anywhere in the surrogate range 0xd800 - 0xdfff (lead or trail).
+    inline bool is_surrogate(char32_t cp)
+    {
+        return (cp >= LEAD_SURROGATE_MIN && cp <= TRAIL_SURROGATE_MAX);
+    }
+    // True if cp does not exceed CODE_POINT_MAX and is not a surrogate.
+    inline bool is_code_point_valid(char32_t cp)
+    {
+        return (cp <= CODE_POINT_MAX && !utf8::internal::is_surrogate(cp));
+    }
+} // namespace internal
+
+    /// The library API - functions intended to be called by the users
+    /// append (error-code form): writes the 1 to 4 octet UTF-8 encoding of cp through result and returns the advanced iterator; error is written only on failure.
+    template <typename octet_iterator>
+    octet_iterator append(char32_t cp, octet_iterator result, utf_error& error)
+    {
+        if (!utf8::internal::is_code_point_valid(cp)) {
+            error = utf8::utf_error::INVALID_CODE_POINT;
+            return result;                    // nothing written, iterator not advanced
+        }
+
+        if (cp < 0x80)                        // one octet
+            *(result++) = static_cast<char>(cp);
+        else if (cp < 0x800) {                // two octets
+            *(result++) = static_cast<char>((cp >> 6)            | 0xc0);
+            *(result++) = static_cast<char>((cp & 0x3f)          | 0x80);
+        }
+        else if (cp < 0x10000) {              // three octets
+            *(result++) = static_cast<char>((cp >> 12)           | 0xe0);
+            *(result++) = static_cast<char>(((cp >> 6) & 0x3f)   | 0x80);
+            *(result++) = static_cast<char>((cp & 0x3f)          | 0x80);
+        }
+        else {                                // four octets
+            *(result++) = static_cast<char>((cp >> 18)           | 0xf0);
+            *(result++) = static_cast<char>(((cp >> 12) & 0x3f)  | 0x80);
+            *(result++) = static_cast<char>(((cp >> 6) & 0x3f)   | 0x80);
+            *(result++) = static_cast<char>((cp & 0x3f)          | 0x80);
+        }
+        return result;
+    }
+    /// append (throwing form): same encoding and return value, but throws utf8::invalid_code_point for an invalid cp.
+#ifndef UTF_CPP_NO_EXCEPTIONS
+    template <typename octet_iterator>
+    octet_iterator append(char32_t cp, octet_iterator result)
+    {
+        utf8::utf_error err {utf8::utf_error::UTF8_OK};
+        result = utf8::append(cp, result, err);   // keep the advanced iterator; the argument copy still marks the start
+        if (err != utf8::utf_error::UTF8_OK)
+            throw utf8::invalid_code_point(cp);
+        return result;
+    }
+#ifndef UTF_CPP_NO_STD_STRING
+    inline void append(char32_t cp, std::string& str)   // appends the encoding of cp to str; throws like the overload above
+    {
+        utf8::append(cp, std::back_inserter(str));
+    }
+#endif // #ifndef UTF_CPP_NO_STD_STRING
+#endif // #ifndef UTF_CPP_NO_EXCEPTIONS
+
+} // namespace utf8
+
+#endif // header guard
+
diff --git a/v3_0/tests/Makefile b/v3_0/tests/Makefile
new file mode 100644
index 0000000..ab48a9b
--- /dev/null
+++ b/v3_0/tests/Makefile
@@ -0,0 +1,7 @@
+# Smoke test: compiles unit.cpp against ../src/utf8.h in C++11 mode and runs the resulting binary.
+CC = g++
+CFLAGS = -g -Wall --std=c++11
+
+smoketest: unit.cpp ../src/utf8.h
+	$(CC) $(CFLAGS) unit.cpp -ounit
+	./unit
diff --git a/v3_0/tests/unit.cpp b/v3_0/tests/unit.cpp
new file mode 100644
index 0000000..5bbfaf8
--- /dev/null
+++ b/v3_0/tests/unit.cpp
@@ -0,0 +1,36 @@
+#include <cassert>
+#include "../src/utf8.h"
+using namespace std;
+
+int main()
+{
+// append
+    {
+        string s;
+        utf8::append(U'\U00000448', s);
+        assert (s.length() == 2 && s[0] == '\xd1' && s[1] == '\x88');
+
+        s.erase();
+        utf8::append(U'\U000065e5', s);
+        assert (s.length() == 3 && s[0] == '\xe6' && s[1] == '\x97' && s[2] == '\xa5');
+    }
+
+// append through a raw pointer: the returned iterator must be one past the last octet written
+    {
+        char buf[4] = {0, 0, 0, 0};
+        char* end = utf8::append(U'\U000065e5', buf);
+        assert (end == buf + 3 && buf[0] == '\xe6' && buf[1] == '\x97' && buf[2] == '\xa5');
+    }
+
+// append rejects surrogates: throws, reports the offending value, writes nothing
+    {
+        string s;
+        bool thrown = false;
+        try { utf8::append(static_cast<char32_t>(0xd800), s); }
+        catch (const utf8::invalid_code_point& e) { thrown = (e.code_point() == 0xd800); }
+        assert (thrown && s.empty());
+    }
+
+}
+
+