diff --git a/v2_0/doc/utf8cpp.html b/v2_0/doc/utf8cpp.html index 18a7c03..63e9afd 100644 --- a/v2_0/doc/utf8cpp.html +++ b/v2_0/doc/utf8cpp.html @@ -33,6 +33,10 @@ ul.toc { list-style-type: none; } + p.version { + font-size: small; + font-style: italic; + } --> @@ -56,6 +60,20 @@
  • Reference +
  • Points of Interest @@ -64,7 +82,7 @@ Conclusion
  • - References + Links
  • @@ -182,12 +200,15 @@

    Reference

    -

    +

    Functions From utf8 Namespace

    utf8::append

    +

    + Available in version 1.0 and later. +

    Encodes a 32 bit code point as a UTF-8 sequence of octets and appends the sequence to a UTF-8 string. @@ -236,6 +257,9 @@ assert (u[0] == utf8::next +

    + Available in version 1.0 and later. +

    Given the iterator to the beginning of the UTF-8 sequence, it returns the code point and moves the iterator to the next position. @@ -277,6 +301,9 @@ assert (w == twochars + 3);

    utf8::prior

    +

    + Available in version 1.02 and later. +

    Given a reference to an iterator pointing to an octet in a UTF-8 sequence, it decreases the iterator until it hits the beginning of the previous UTF-8 encoded @@ -330,8 +357,11 @@ assert (w == twochars); exception is thrown.

    - utf8::previous (deprecated, see utf8::prior) + utf8::previous

    +

    + Deprecated in version 1.02 and later. +

    Given a reference to an iterator pointing to an octet in a UTF-8 sequence, it decreases the iterator until it hits the beginning of the previous UTF-8 encoded @@ -389,6 +419,9 @@ assert (w == twochars);

    utf8::advance

    +

    + Available in version 1.0 and later. +

    Advances an iterator by the specified number of code points within a UTF-8 sequence. @@ -431,6 +464,9 @@ assert (w == twochars + 5);

    utf8::distance

    +

    + Available in version 1.0 and later. +

    Given the iterators to two UTF-8 encoded code points in a sequence, returns the number of code points between them. @@ -474,6 +510,9 @@ assert (dist == 2);

    utf8::utf16to8

    +

    + Available in version 1.0 and later. +

    Converts a UTF-16 encoded string to UTF-8.

    @@ -514,6 +553,9 @@ assert (utf8result.size() == 10);

    utf8::utf8to16

    +

    + Available in version 1.0 and later. +

    Converts a UTF-8 encoded string to UTF-16.

    @@ -555,6 +597,9 @@ assert (utf16result[3] == utf8::utf32to8 +

    + Available in version 1.0 and later. +

    Converts a UTF-32 encoded string to UTF-8.

    @@ -593,6 +638,9 @@ assert (utf8result.size() == 9);

    utf8::utf8to32

    +

    + Available in version 1.0 and later. +

    Converts a UTF-8 encoded string to UTF-32.

    @@ -632,6 +680,9 @@ assert (utf32result.size() == 2);

    utf8::find_invalid

    +

    + Available in version 1.0 and later. +

    Detects an invalid sequence within a UTF-8 string.

    @@ -668,6 +719,9 @@ assert (invalid == utf_invalid + 5);

    utf8::is_valid

    +

    + Available in version 1.0 and later. +

    Checks whether a sequence of octets is a valid UTF-8 string.

    @@ -701,6 +755,9 @@ assert (bvalid == false);

    utf8::replace_invalid

    +

    + Available in version 2.0 and later. +

    Replaces all invalid UTF-8 sequences within a string with a replacement marker.

    @@ -755,6 +812,9 @@ assert (std::equal(replace_invalid_result.begin(), replace_invalid_result.end(),

    utf8::is_bom

    +

    + Available in version 1.0 and later. +

    Checks whether a sequence of three octets is a UTF-8 byte order mark (BOM).

    @@ -783,12 +843,15 @@ assert (bbom == true); they form the UTF-8 BOM, we want to skip them before processing the actual UTF-8 encoded text.

    -

    +

    Types From utf8 Namespace

    utf8::iterator

    +

    + Available in version 2.0 and later. +

    Adapts the underlying octet iterator to iterate over the sequence of code points, rather than raw octets. @@ -862,12 +925,15 @@ assert (*it == 0x10346); std::string s = "example"; utf8::iterator i (s.begin(), s.begin(), s.end()); -

    +

    Functions From utf8::unchecked Namespace

    utf8::unchecked::append

    +

    + Available in version 1.0 and later. +

    Encodes a 32 bit code point as a UTF-8 sequence of octets and appends the sequence to a UTF-8 string. @@ -910,6 +976,9 @@ assert (u[0] == utf8::unchecked::next +

    + Available in version 1.0 and later. +

    Given the iterator to the beginning of a UTF-8 sequence, it returns the code point and moves the iterator to the next position. @@ -945,6 +1014,9 @@ assert (w == twochars + 3);

    utf8::unchecked::prior

    +

    + Available in version 1.02 and later. +

    Given a reference to an iterator pointing to an octet in a UTF-8 sequence, it decreases the iterator until it hits the beginning of the previous UTF-8 encoded @@ -981,6 +1053,9 @@ assert (w == twochars);

    utf8::unchecked::previous (deprecated, see utf8::unchecked::prior)

    +

    + Deprecated in version 1.02 and later. +

    Given a reference to an iterator pointing to an octet in a UTF-8 sequence, it decreases the iterator until it hits the beginning of the previous UTF-8 encoded @@ -1023,6 +1098,9 @@ assert (w == twochars);

    utf8::unchecked::advance

    +

    + Available in version 1.0 and later. +

    Advances an iterator by the specified number of code points within a UTF-8 sequence. @@ -1061,6 +1139,9 @@ assert (w == twochars + 5);

    utf8::unchecked::distance

    +

    + Available in version 1.0 and later. +

    Given the iterators to two UTF-8 encoded code points in a sequence, returns the number of code points between them. @@ -1096,6 +1177,9 @@ assert (dist == 2);

    utf8::unchecked::utf16to8

    +

    + Available in version 1.0 and later. +

    Converts a UTF-16 encoded string to UTF-8.

    @@ -1136,6 +1220,9 @@ assert (utf8result.size() == 10);

    utf8::unchecked::utf8to16

    +

    + Available in version 1.0 and later. +

    Converts a UTF-8 encoded string to UTF-16.

    @@ -1176,6 +1263,9 @@ assert (utf16result[3] == utf8::unchecked::utf32to8 +

    + Available in version 1.0 and later. +

    Converts a UTF-32 encoded string to UTF-8.

    @@ -1215,6 +1305,9 @@ assert (utf8result.size() == 9);

    utf8::unchecked::utf8to32

    +

    + Available in version 1.0 and later. +

    Converts a UTF-8 encoded string to UTF-32.

    @@ -1249,12 +1342,15 @@ assert (utf32result.size() == 2); This is a faster but less safe version of utf8::utf8to32. It does not check for validity of the supplied UTF-8 sequence.

    -

    +

    Types From utf8::unchecked Namespace

    utf8::unchecked::iterator

    +

    + Available in version 2.0 and later. +

    Adapts the underlying octet iterator to iterate over the sequence of code points, rather than raw octets. @@ -1380,8 +1476,8 @@ assert (*un_it == 0x10346); use other means to work with UTF-8 strings. Template functions I describe in this article may be a good step in this direction.

    -

    - References +

    1. diff --git a/v2_0/source/utf8/checked.h b/v2_0/source/utf8/checked.h index d79c74d..dc342ff 100644 --- a/v2_0/source/utf8/checked.h +++ b/v2_0/source/utf8/checked.h @@ -262,7 +262,7 @@ namespace utf8 it(octet_it), range_start(range_start), range_end(range_end) { if (it < range_start || it > range_end) - throw std::out_of_range("Invalid utf-8 iterator position"); + throw std::out_of_range("Invalid utf-8 iterator position"); } // the default "big three" are OK octet_iterator base () const { return it; } @@ -273,7 +273,7 @@ namespace utf8 } bool operator == (const iterator& rhs) const { - if (range_start != rhs.range_start && range_end != rhs.range_end) + if (range_start != rhs.range_start || range_end != rhs.range_end) throw std::logic_error("Comparing utf-8 iterators defined with different ranges"); return (it == rhs.it); } diff --git a/v2_0/test_drivers/negative/negative.cpp b/v2_0/test_drivers/negative/negative.cpp index 571e6c6..8c910d1 100644 --- a/v2_0/test_drivers/negative/negative.cpp +++ b/v2_0/test_drivers/negative/negative.cpp @@ -34,6 +34,12 @@ int main() const unsigned* u = find(INVALID_LINES, INVALID_LINES_END, line_count); if (u == INVALID_LINES_END) cout << "Unexpected invalid utf-8 at line " << line_count << '\n'; + + // try fixing it: + string fixed_line; + replace_invalid(line.begin(), line.end(), back_inserter(fixed_line)); + if (!is_valid(fixed_line.begin(), fixed_line.end())) + cout << "replace_invalid() resulted in an invalid utf-8 at line " << line_count << '\n'; } } }