From 3df044a66339a24ecaba654f0eab59836c7eda31 Mon Sep 17 00:00:00 2001 From: ntrifunovic Date: Mon, 18 Dec 2006 01:52:13 +0000 Subject: [PATCH] Added documentation for the iterator adapter git-svn-id: http://svn.code.sf.net/p/utfcpp/code@74 a809a056-fc17-0410-9590-b4f493f8b08e --- v2_0/doc/utf8cpp.html | 153 +++++++++++++++++++++++++++++++++++++++++- 1 file changed, 151 insertions(+), 2 deletions(-) diff --git a/v2_0/doc/utf8cpp.html b/v2_0/doc/utf8cpp.html index a177186..18a7c03 100644 --- a/v2_0/doc/utf8cpp.html +++ b/v2_0/doc/utf8cpp.html @@ -37,6 +37,9 @@ +

+ UTF8-CPP: UTF-8 with C++ in a Portable Way +

The Sourceforge project page

@@ -77,8 +80,8 @@ solutions.

- In order to easily handle UTF-8 encoded Unicode strings, I have come up with a set - of template functions. For anybody used to work with STL algorithms, they should be + In order to easily handle UTF-8 encoded Unicode strings, I have come up with a small + generic library. For anybody used to work with STL algorithms and iterators, it should be easy and natural to use. The code is freely available for any purpose - check out the license at the beginning of the utf8.h file. If you run into bugs or performance issues, please let me know and I'll do my best to address them. @@ -105,6 +108,7 @@ #include <iostream> #include <string> #include <vector> +#include "utf8.h" using namespace std; int main() { @@ -779,6 +783,85 @@ assert (bbom == true); they form the UTF-8 BOM, we want to skip them before processing the actual UTF-8 encoded text.

+

+ Types From utf8 Namespace +

+

+ utf8::iterator +

+

+ Adapts the underlying octet iterator to iterate over the sequence of code points, + rather than raw octets. +

+
+template <typename octet_iterator>
+class iterator;
+
+ +
Member functions
+
+
iterator();
the default constructor; the underlying octet_iterator is + constructed with its default constructor. +
explicit iterator (const octet_iterator& octet_it, + const octet_iterator& range_start, + const octet_iterator& range_end);
a constructor + that initializes the underlying octet_iterator with octet_it + and sets the range in which the iterator is considered valid. +
octet_iterator base () const;
returns the + underlying octet_iterator. +
uint32_t operator * () const;
decodes the UTF-8 sequence + the underlying octet_iterator is pointing to and returns the code point. +
bool operator == (const iterator& rhs) + const;
returns true + if the two underlying iterators are equal. +
bool operator != (const iterator& rhs) + const;
returns true + if the two underlying iterators are not equal. +
iterator& operator ++ ();
the prefix increment - moves + the iterator to the next UTF-8 encoded code point. +
iterator operator ++ (int);
+ the postfix increment - moves the iterator to the next UTF-8 encoded code point and returns the current one. +
iterator& operator -- ();
the prefix decrement - moves + the iterator to the previous UTF-8 encoded code point. +
iterator operator -- (int);
+ the postfix decrement - moves the iterator to the previous UTF-8 encoded code point and returns the current one. +
+

+ Example of use: +

+
+char* threechars = "\xf0\x90\x8d\x86\xe6\x97\xa5\xd1\x88";
+utf8::iterator<char*> it(threechars, threechars, threechars + 9);
+utf8::iterator<char*> it2 = it;
+assert (it2 == it);
+assert (*it == 0x10346);
+assert (*(++it) == 0x65e5);
+assert ((*it++) == 0x65e5);
+assert (*it == 0x0448);
+assert (it != it2);
+utf8::iterator<char*> endit (threechars + 9, threechars, threechars + 9);  
+assert (++it == endit);
+assert (*(--it) == 0x0448);
+assert ((*it--) == 0x0448);
+assert (*it == 0x65e5);
+assert (--it == utf8::iterator<char*>(threechars, threechars, threechars + 9));
+assert (*it == 0x10346);
+
+

+ The purpose of the utf8::iterator adapter is to enable easy iteration as well as the use of STL + algorithms with UTF-8 encoded strings. Increment and decrement operators are implemented in terms of + utf8::next() and utf8::prior() functions. +

+

+ Note that the utf8::iterator adapter is a checked iterator. It operates on the range specified in + the constructor; any attempt to go out of that range will result in an exception. Even the comparison operators + require both iterator objects to be constructed against the same range - otherwise an exception is thrown. Typically, + the range will be determined by sequence container functions begin and end, i.e.: +

+
+std::string s = "example";
+utf8::iterator<std::string::iterator> i (s.begin(), s.begin(), s.end());
+

Functions From utf8::unchecked Namespace

@@ -1166,6 +1249,72 @@ assert (utf32result.size() == 2); This is a faster but less safe version of utf8::utf8to32. It does not check for validity of the supplied UTF-8 sequence.

+

+ Types From utf8::unchecked Namespace +

+

+ utf8::unchecked::iterator +

+

+ Adapts the underlying octet iterator to iterate over the sequence of code points, + rather than raw octets. +

+
+template <typename octet_iterator>
+class iterator;
+
+ +
Member functions
+
+
iterator();
the default constructor; the underlying octet_iterator is + constructed with its default constructor. +
explicit iterator (const octet_iterator& octet_it); +
a constructor + that initializes the underlying octet_iterator with octet_it. +
octet_iterator base () const;
returns the + underlying octet_iterator. +
uint32_t operator * () const;
decodes the UTF-8 sequence + the underlying octet_iterator is pointing to and returns the code point. +
bool operator == (const iterator& rhs) + const;
returns true + if the two underlying iterators are equal. +
bool operator != (const iterator& rhs) + const;
returns true + if the two underlying iterators are not equal. +
iterator& operator ++ ();
the prefix increment - moves + the iterator to the next UTF-8 encoded code point. +
iterator operator ++ (int);
+ the postfix increment - moves the iterator to the next UTF-8 encoded code point and returns the current one. +
iterator& operator -- ();
the prefix decrement - moves + the iterator to the previous UTF-8 encoded code point. +
iterator operator -- (int);
+ the postfix decrement - moves the iterator to the previous UTF-8 encoded code point and returns the current one. +
+

+ Example of use: +

+
+char* threechars = "\xf0\x90\x8d\x86\xe6\x97\xa5\xd1\x88";
+utf8::unchecked::iterator<char*> un_it(threechars);
+utf8::unchecked::iterator<char*> un_it2 = un_it;
+assert (un_it2 == un_it);
+assert (*un_it == 0x10346);
+assert (*(++un_it) == 0x65e5);
+assert ((*un_it++) == 0x65e5);
+assert (*un_it == 0x0448);
+assert (un_it != un_it2);
+utf8::unchecked::iterator<char*> un_endit (threechars + 9);  
+assert (++un_it == un_endit);
+assert (*(--un_it) == 0x0448);
+assert ((*un_it--) == 0x0448);
+assert (*un_it == 0x65e5);
+assert (--un_it == utf8::unchecked::iterator<char*>(threechars));
+assert (*un_it == 0x10346);
+
+

+ This is an unchecked version of utf8::iterator. It is faster in many cases, but offers + no validity or range checks. +

Points of interest