From 3df044a66339a24ecaba654f0eab59836c7eda31 Mon Sep 17 00:00:00 2001
From: ntrifunovic
+ UTF8-CPP: UTF-8 with C++ in a Portable Way
+
@@ -77,8 +80,8 @@
solutions.
- In order to easily handle UTF-8 encoded Unicode strings, I have come up with a set - of template functions. For anybody used to work with STL algorithms, they should be + In order to easily handle UTF-8 encoded Unicode strings, I have come up with a small + generic library. For anybody used to working with STL algorithms and iterators, it should be easy and natural to use. The code is freely available for any purpose - check out the license at the beginning of the utf8.h file. If you run into bugs or performance issues, please let me know and I'll do my best to address them. @@ -105,6 +108,7 @@ #include <iostream> #include <string> #include <vector> +#include "utf8.h" using namespace std; int main() { @@ -779,6 +783,85 @@ assert (bbom == true); they form the UTF-8 BOM, we want to skip them before processing the actual UTF-8 encoded text.
++ Adapts the underlying octet iterator to iterate over the sequence of code points, + rather than raw octets. +
++template <typename octet_iterator> +class iterator; ++ +
iterator();
octet_iterator
is
+ constructed with its default constructor.
+ explicit iterator (const octet_iterator& octet_it,
+ const octet_iterator& range_start,
+ const octet_iterator& range_end);
octet_iterator
with octet_it
+ and sets the range in which the iterator is considered valid.
+ octet_iterator base () const;
octet_iterator
.
+ uint32_t operator * () const;
octet_iterator
is pointing to and returns the code point.
+ bool operator == (const iterator& rhs)
+ const;
bool operator != (const iterator& rhs)
+ const;
iterator& operator ++ ();
iterator operator ++ (int);
iterator& operator -- ();
iterator operator -- (int);
+ Example of use: +
++char* threechars = "\xf0\x90\x8d\x86\xe6\x97\xa5\xd1\x88"; +utf8::iterator<char*> it(threechars, threechars, threechars + 9); +utf8::iterator<char*> it2 = it; +assert (it2 == it); +assert (*it == 0x10346); +assert (*(++it) == 0x65e5); +assert ((*it++) == 0x65e5); +assert (*it == 0x0448); +assert (it != it2); +utf8::iterator<char*> endit (threechars + 9, threechars, threechars + 9); +assert (++it == endit); +assert (*(--it) == 0x0448); +assert ((*it--) == 0x0448); +assert (*it == 0x65e5); +assert (--it == utf8::iterator<char*>(threechars, threechars, threechars + 9)); +assert (*it == 0x10346); ++
+ The purpose of the utf8::iterator
adapter is to enable easy iteration as well as the use of STL
+ algorithms with UTF-8 encoded strings. Increment and decrement operators are implemented in terms of
+ utf8::next()
and utf8::prior()
functions.
+
+ Note that the utf8::iterator
adapter is a checked iterator. It operates on the range specified in
+ the constructor; any attempt to go out of that range will result in an exception. Even the comparison operators
+ require both iterator objects to be constructed against the same range - otherwise an exception is thrown. Typically,
+ the range will be determined by sequence container functions begin
and end
, i.e.:
+
+std::string s = "example";
+utf8::iterator<std::string::iterator> i (s.begin(), s.begin(), s.end());
+
utf8::utf8to32
. It does not
check for validity of the supplied UTF-8 sequence.
+ + Adapts the underlying octet iterator to iterate over the sequence of code points, + rather than raw octets. +
++template <typename octet_iterator> +class iterator; ++ +
iterator();
octet_iterator
is
+ constructed with its default constructor.
+ explicit iterator (const octet_iterator& octet_it);
+
octet_iterator
with octet_it
+ octet_iterator base () const;
octet_iterator
.
+ uint32_t operator * () const;
octet_iterator
is pointing to and returns the code point.
+ bool operator == (const iterator& rhs)
+ const;
bool operator != (const iterator& rhs)
+ const;
iterator& operator ++ ();
iterator operator ++ (int);
iterator& operator -- ();
iterator operator -- (int);
+ Example of use: +
++char* threechars = "\xf0\x90\x8d\x86\xe6\x97\xa5\xd1\x88"; +utf8::unchecked::iterator<char*> un_it(threechars); +utf8::unchecked::iterator<char*> un_it2 = un_it; +assert (un_it2 == un_it); +assert (*un_it == 0x10346); +assert (*(++un_it) == 0x65e5); +assert ((*un_it++) == 0x65e5); +assert (*un_it == 0x0448); +assert (un_it != un_it2); +utf8::unchecked::iterator<char*> un_endit (threechars + 9); +assert (++un_it == un_endit); +assert (*(--un_it) == 0x0448); +assert ((*un_it--) == 0x0448); +assert (*un_it == 0x65e5); +assert (--un_it == utf8::unchecked::iterator<char*>(threechars)); +assert (*un_it == 0x10346); ++
+ This is an unchecked version of utf8::iterator
. It is faster in many cases, but offers
+ no validity or range checks.
+