First check in for branch 3.x - playing with utf8::append

git-svn-id: http://svn.code.sf.net/p/utfcpp/code@140 a809a056-fc17-0410-9590-b4f493f8b08e
2014-06-01 02:22:29 +02:00 · 2014-06-01 02:22:29 +02:00 · 740e7e75b7
commit 740e7e75b7
parent 8ea47e8799
3 changed files with 142 additions and 3 deletions
--- a/src/utf8.h
+++ b/src/utf8.h
@ -1,4 +1,4 @@
-// Copyright 2006 Nemanja Trifunovic
+// Copyright 2006-2013 Nemanja Trifunovic

 /*
 Permission is hereby granted, free of charge, to any person or organization
@ -28,7 +28,120 @@ DEALINGS IN THE SOFTWARE.
 #ifndef UTF8_FOR_CPP_2675DCD0_9480_4c0c_B92A_CC14C027B731
 #define UTF8_FOR_CPP_2675DCD0_9480_4c0c_B92A_CC14C027B731

-#include "utf8/checked.h"
-#include "utf8/unchecked.h"
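+// By default, utf8 cpp requires the C++ Standard Library strings and exceptions.
+// The following macros can be defined before this header is included to change
+// the default behavior. Note that the std::string support is nested inside the
+// exceptions section, so UTF_CPP_NO_EXCEPTIONS disables both.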
+
+// #define UTF_CPP_NO_STD_STRING
+// #define UTF_CPP_NO_EXCEPTIONS
+
+#ifndef UTF_CPP_NO_EXCEPTIONS
+#include <stdexcept>
+
+#ifndef UTF_CPP_NO_STD_STRING
+#include <string>
+#include <iterator>
+
+#endif // #ifndef UTF_CPP_NO_STD_STRING
+#endif // #ifndef UTF_CPP_NO_EXCEPTIONS
+
+namespace utf8
+{
+
+// Status codes reported by the library. They are used internally and are the
+// way errors are reported when exceptions are disabled.
+// The order of the enumerators is significant: it fixes their numeric values.
+enum class utf_error {
+    UTF8_OK,                // no error; callers initialize their status to this
+    NOT_ENOUGH_ROOM,
+    INVALID_LEAD,
+    INCOMPLETE_SEQUENCE,
+    OVERLONG_SEQUENCE,
+    INVALID_CODE_POINT      // set by append() when the code point is rejected
+};
+
+#ifndef UTF_CPP_NO_EXCEPTIONS
+// Common base class for the exceptions thrown by the library. It adds
+// nothing to ::std::exception; it only gives the library's exceptions a
+// shared type of their own.
+class exception : public ::std::exception
+{
+};
+// Exceptions that may be thrown from the library functions.
+
+// Thrown by append() when it is asked to encode a value that is not a valid
+// Unicode code point. The rejected value is kept and can be read back with
+// code_point().
+class invalid_code_point : public exception {
+    char32_t cp_;   // the value that was rejected
+public:
+    // explicit: a bare char32_t must never convert silently into an exception
+    explicit invalid_code_point(char32_t cp) : cp_(cp) {}
+    // override lets the compiler check this really replaces std::exception::what()
+    const char* what() const noexcept override { return "Invalid code point"; }
+    // Returns the value that was rejected
+    char32_t code_point() const noexcept { return cp_; }
+};
+
+#endif // #ifndef UTF_CPP_NO_EXCEPTIONS
+
+// Implementation details. Library users are not meant to call anything in
+// this namespace directly, and its contents may change at any time.
+namespace internal
+{
+    // Boundaries of the surrogate ranges:
+    //   leading (high) surrogates:  0xd800 - 0xdbff
+    //   trailing (low) surrogates:  0xdc00 - 0xdfff
+    const char32_t LEAD_SURROGATE_MIN  = 0xd800;
+    const char32_t LEAD_SURROGATE_MAX  = 0xdbff;
+    const char32_t TRAIL_SURROGATE_MIN = 0xdc00;
+    const char32_t TRAIL_SURROGATE_MAX = 0xdfff;
+
+    // Largest value that is still a valid Unicode code point
+    const char32_t CODE_POINT_MAX      = 0x10ffff;
+
+
+    // Tells whether cp is a surrogate of either kind. The two ranges are
+    // adjacent, so one check from the lowest leading to the highest trailing
+    // value covers both.
+    inline bool is_surrogate(char32_t cp)
+    {
+        if (cp < LEAD_SURROGATE_MIN)
+            return false;
+        return cp <= TRAIL_SURROGATE_MAX;
+    }
+
+    // Tells whether cp is acceptable as a code point: it must not exceed
+    // CODE_POINT_MAX and must not be a surrogate.
+    inline bool is_code_point_valid(char32_t cp)
+    {
+        if (cp > CODE_POINT_MAX)
+            return false;
+        return !utf8::internal::is_surrogate(cp);
+    }
+} // namespace internal
+
+    /// The library API - functions intended to be called by the users
+
+    // Encodes the code point cp as UTF-8 and writes the resulting octets
+    // (one to four of them) through result.
+    //
+    //   cp     - the code point to encode
+    //   result - output iterator that receives the octets
+    //   error  - set to INVALID_CODE_POINT when cp is rejected. It is left
+    //            untouched on success, so the caller must initialize it.
+    //
+    // Returns the iterator advanced past the octets written. When cp is
+    // rejected nothing is written and result is returned as it was passed in.
+    template <typename octet_iterator>
+    octet_iterator append(char32_t cp, octet_iterator result, utf_error& error)
+    {
+        // Refuse invalid values before anything is written
+        if (!utf8::internal::is_code_point_valid(cp)) {
+            error = utf8::utf_error::INVALID_CODE_POINT;
+            return result;
+        }
+
+        // Select the sequence length: how many continuation octets follow the
+        // lead octet, and which marker bits the lead octet carries (none for
+        // a one-octet sequence).
+        unsigned continuation_count = 0;
+        char32_t lead_marker = 0x00;
+        if (cp >= 0x10000) {                  // four octets
+            continuation_count = 3;
+            lead_marker = 0xf0;
+        }
+        else if (cp >= 0x800) {               // three octets
+            continuation_count = 2;
+            lead_marker = 0xe0;
+        }
+        else if (cp >= 0x80) {                // two octets
+            continuation_count = 1;
+            lead_marker = 0xc0;
+        }
+
+        // Lead octet: marker bits plus whatever lies above the continuation payloads
+        *result++ = static_cast<char>((cp >> (6 * continuation_count)) | lead_marker);
+
+        // Continuation octets: six payload bits each, most significant group first
+        while (continuation_count > 0) {
+            --continuation_count;
+            *result++ = static_cast<char>(((cp >> (6 * continuation_count)) & 0x3f) | 0x80);
+        }
+        return result;
+    }
+
+#ifndef UTF_CPP_NO_EXCEPTIONS
+    // Encodes the code point cp as UTF-8 and writes the octets through result.
+    //
+    //   cp     - the code point to encode
+    //   result - output iterator that receives the octets
+    //
+    // Returns the iterator advanced past the octets written, so that the
+    // caller can continue writing right after them.
+    // Throws utf8::invalid_code_point when cp is rejected by the error-code
+    // overload; nothing is written in that case.
+    template <typename octet_iterator>
+    octet_iterator append(char32_t cp, octet_iterator result)
+    {
+        utf8::utf_error err {utf8::utf_error::UTF8_OK};
+        // Keep the iterator the encoder hands back: result is passed by value,
+        // so our own copy has not moved and returning it would point the
+        // caller at the start of the sequence instead of past its end.
+        const octet_iterator past_written = utf8::append(cp, result, err);
+        if (err != utf8::utf_error::UTF8_OK)
+            throw utf8::invalid_code_point(cp);
+        return past_written;
+    }
+#ifndef UTF_CPP_NO_STD_STRING
+    // Appends the UTF-8 encoding of the code point cp to the end of str.
+    // Forwards to the throwing iterator overload, so an invalid cp results
+    // in utf8::invalid_code_point.
+    inline void append(char32_t cp, std::string& str)
+    {
+        std::back_insert_iterator<std::string> tail(str);
+        utf8::append(cp, tail);
+    }
+#endif // #ifndef UTF_CPP_NO_STD_STRING
+#endif // #ifndef UTF_CPP_NO_EXCEPTIONS
+
+} // namespace utf8

 #endif // header guard
+
--- a/tests/Makefile
+++ b/tests/Makefile
@ -0,0 +1,6 @@
+# Compiler and flags used to build the unit test (C++11, warnings, debug info).
+CC = g++
+CFLAGS = -std=c++11 -Wall -g
+
+# Builds the test binary "unit" and runs it. No file named "smoketest" is
+# produced, so the recipe runs on every invocation.
+smoketest: unit.cpp ../src/utf8.h
+	$(CC) $(CFLAGS) -o unit unit.cpp
+	./unit
--- a/tests/unit.cpp
+++ b/tests/unit.cpp
@ -0,0 +1,20 @@
+#include <assert.h>
+#include "../src/utf8.h"
+using namespace std;
+
+// Smoke test for utf8::append on std::string: encodes one code point that
+// needs two octets and one that needs three, and checks the exact octets.
+int main()
+{
+    // U+0448 -> two octets
+    {
+        std::string two_octets;
+        utf8::append(U'\U00000448', two_octets);
+        assert (two_octets == "\xd1\x88");
+    }
+
+    // U+65E5 -> three octets
+    {
+        std::string three_octets;
+        utf8::append(U'\U000065e5', three_octets);
+        assert (three_octets == "\xe6\x97\xa5");
+    }
+}
+
+