From 4d7ad9b625424468e33ad1c9a74dacbbeab79d14 Mon Sep 17 00:00:00 2001 From: ntrifunovic Date: Sat, 27 Oct 2007 23:34:59 +0000 Subject: [PATCH] Updated the documentation and a test to include peek_next() git-svn-id: http://svn.code.sf.net/p/utfcpp/code@83 a809a056-fc17-0410-9590-b4f493f8b08e --- doc/utf8cpp.html | 76 ++++++++++++++++++++++++++ test_drivers/utf8reader/utf8reader.cpp | 8 ++- 2 files changed, 82 insertions(+), 2 deletions(-) diff --git a/doc/utf8cpp.html b/doc/utf8cpp.html index 63e9afd..4ad7e10 100644 --- a/doc/utf8cpp.html +++ b/doc/utf8cpp.html @@ -294,6 +294,46 @@ assert (w == twochars + 3);

This function is typically used to iterate through a UTF-8 encoded string.

+

+ In case of an invalid UTF-8 sequence, a utf8::invalid_utf8 exception is + thrown. +

+

+ utf8::peek_next +

+

+ Available in version 2.1 and later. +

+

+ Given the iterator to the beginning of the UTF-8 sequence, it returns the code + point for the following sequence without changing the value of the iterator. +

+
+template <typename octet_iterator> 
+uint32_t peek_next(octet_iterator it, octet_iterator end);
+   
+
+

+ it: an iterator pointing to the beginning of a UTF-8 + encoded code point.
+ end: end of the UTF-8 sequence to be processed. If it + gets equal to end during the extraction of a code point, a + utf8::not_enough_room exception is thrown.
+ Return value: the 32 bit representation of the + processed UTF-8 code point. +

+

+ Example of use: +

+
+char* twochars = "\xe6\x97\xa5\xd1\x88";
+char* w = twochars;
+int cp = peek_next(w, twochars + 6);
+assert (cp == 0x65e5);
+assert (w == twochars);
+

In case of an invalid UTF-8 seqence, a utf8::invalid_utf8 exception is thrown. @@ -1011,6 +1051,42 @@ assert (w == twochars + 3); This is a faster but less safe version of utf8::next. It does not check for validity of the supplied UTF-8 sequence.

+

+ utf8::unchecked::peek_next +

+

+ Available in version 2.1 and later. +

+

+ Given the iterator to the beginning of a UTF-8 sequence, it returns the code point for that sequence without changing the value of the iterator. +

+
+template <typename octet_iterator>
+uint32_t peek_next(octet_iterator it);
+   
+
+

+ it: an iterator pointing to the beginning of a UTF-8 + encoded code point.
+ Return value: the 32 bit representation of the + processed UTF-8 code point. +

+

+ Example of use: +

+
+char* twochars = "\xe6\x97\xa5\xd1\x88";
+char* w = twochars;
+int cp = unchecked::peek_next(w);
+assert (cp == 0x65e5);
+assert (w == twochars);
+
+

+ This is a faster but less safe version of utf8::peek_next. It does not + check for validity of the supplied UTF-8 sequence. +

utf8::unchecked::prior

diff --git a/test_drivers/utf8reader/utf8reader.cpp b/test_drivers/utf8reader/utf8reader.cpp index ca85286..c88a5ee 100644 --- a/test_drivers/utf8reader/utf8reader.cpp +++ b/test_drivers/utf8reader/utf8reader.cpp @@ -59,7 +59,9 @@ int main(int argc, char** argv) unsigned char_count = 0; string::iterator it = line_start; while (it != line_end) { - next(it, line_end); + unsigned int next_cp = peek_next(it, line_end); + if (next(it, line_end) != next_cp) + cout << "Line " << line_count << ": Error: peek_next gave a different result than next" << '\n'; char_count++; } if (char_count != utf32_line.size()) @@ -121,7 +123,9 @@ int main(int argc, char** argv) char_count = 0; it = line_start; while (it != line_end) { - unchecked::next(it); + unsigned int next_cp = unchecked::peek_next(it); + if (unchecked::next(it) != next_cp) + cout << "Line " << line_count << ": Error: unchecked::peek_next gave a different result than unchecked::next" << '\n';; char_count++; } if (char_count != utf32_line.size())