Updated documentation. Fixed a small bug in checked.h. Added new checks to the negative tests

git-svn-id: http://svn.code.sf.net/p/utfcpp/code@78 a809a056-fc17-0410-9590-b4f493f8b08e
This commit is contained in:
ntrifunovic 2007-02-25 00:16:10 +00:00
parent cd3092c0ca
commit baf63b327a
3 changed files with 112 additions and 10 deletions

View file

@ -33,6 +33,10 @@
ul.toc { ul.toc {
list-style-type: none; list-style-type: none;
} }
p.version {
font-size: small;
font-style: italic;
}
--> -->
</style> </style>
</head> </head>
@ -56,6 +60,20 @@
</li> </li>
<li> <li>
<a href="#reference">Reference</a> <a href="#reference">Reference</a>
<ul class="toc">
<li>
<a href="#funutf8">Functions From utf8 Namespace </a>
</li>
<li>
<a href="#typesutf8">Types From utf8 Namespace </a>
</li>
<li>
<a href="#fununchecked">Functions From utf8::unchecked Namespace </a>
</li>
<li>
<a href="#typesunchecked">Types From utf8::unchecked Namespace </a>
</li>
</ul>
</li> </li>
<li> <li>
<a href="#points">Points of Interest</a> <a href="#points">Points of Interest</a>
@ -64,7 +82,7 @@
<a href="#conclusion">Conclusion</a> <a href="#conclusion">Conclusion</a>
</li> </li>
<li> <li>
<a href="#references">References</a> <a href="#links">Links</a>
</li> </li>
</ul> </ul>
</div> </div>
@ -182,12 +200,15 @@
<h2 id="reference"> <h2 id="reference">
Reference Reference
</h2> </h2>
<h3> <h3 id="funutf8">
Functions From utf8 Namespace Functions From utf8 Namespace
</h3> </h3>
<h4> <h4>
utf8::append utf8::append
</h4> </h4>
<p class="version">
Available in version 1.0 and later.
</p>
<p> <p>
Encodes a 32 bit code point as a UTF-8 sequence of octets and appends the sequence Encodes a 32 bit code point as a UTF-8 sequence of octets and appends the sequence
to a UTF-8 string. to a UTF-8 string.
@ -236,6 +257,9 @@ assert (u[<span class="literal">0</span>] == <span class=
<h4> <h4>
utf8::next utf8::next
</h4> </h4>
<p class="version">
Available in version 1.0 and later.
</p>
<p> <p>
Given the iterator to the beginning of the UTF-8 sequence, it returns the code Given the iterator to the beginning of the UTF-8 sequence, it returns the code
point and moves the iterator to the next position. point and moves the iterator to the next position.
@ -277,6 +301,9 @@ assert (w == twochars + <span class="literal">3</span>);
<h4> <h4>
utf8::prior utf8::prior
</h4> </h4>
<p class="version">
Available in version 1.02 and later.
</p>
<p> <p>
Given a reference to an iterator pointing to an octet in a UTF-8 sequence, it Given a reference to an iterator pointing to an octet in a UTF-8 sequence, it
decreases the iterator until it hits the beginning of the previous UTF-8 encoded decreases the iterator until it hits the beginning of the previous UTF-8 encoded
@ -330,8 +357,11 @@ assert (w == twochars);
exception is thrown. exception is thrown.
</p> </p>
<h4> <h4>
utf8::previous (deprecated, see utf8::prior) utf8::previous
</h4> </h4>
<p class="version">
Deprecated in version 1.02 and later.
</p>
<p> <p>
Given a reference to an iterator pointing to an octet in a UTF-8 sequence, it Given a reference to an iterator pointing to an octet in a UTF-8 sequence, it
decreases the iterator until it hits the beginning of the previous UTF-8 encoded decreases the iterator until it hits the beginning of the previous UTF-8 encoded
@ -389,6 +419,9 @@ assert (w == twochars);
<h4> <h4>
utf8::advance utf8::advance
</h4> </h4>
<p class="version">
Available in version 1.0 and later.
</p>
<p> <p>
Advances an iterator by the specified number of code points within a UTF-8 Advances an iterator by the specified number of code points within a UTF-8
sequence. sequence.
@ -431,6 +464,9 @@ assert (w == twochars + <span class="literal">5</span>);
<h4> <h4>
utf8::distance utf8::distance
</h4> </h4>
<p class="version">
Available in version 1.0 and later.
</p>
<p> <p>
Given the iterators to two UTF-8 encoded code points in a sequence, returns the Given the iterators to two UTF-8 encoded code points in a sequence, returns the
number of code points between them. number of code points between them.
@ -474,6 +510,9 @@ assert (dist == <span class="literal">2</span>);
<h4> <h4>
utf8::utf16to8 utf8::utf16to8
</h4> </h4>
<p class="version">
Available in version 1.0 and later.
</p>
<p> <p>
Converts a UTF-16 encoded string to UTF-8. Converts a UTF-16 encoded string to UTF-8.
</p> </p>
@ -514,6 +553,9 @@ assert (utf8result.size() == <span class="literal">10</span>);
<h4> <h4>
utf8::utf8to16 utf8::utf8to16
</h4> </h4>
<p class="version">
Available in version 1.0 and later.
</p>
<p> <p>
Converts a UTF-8 encoded string to UTF-16. Converts a UTF-8 encoded string to UTF-16.
</p> </p>
@ -555,6 +597,9 @@ assert (utf16result[<span class="literal">3</span>] == <span class=
<h4> <h4>
utf8::utf32to8 utf8::utf32to8
</h4> </h4>
<p class="version">
Available in version 1.0 and later.
</p>
<p> <p>
Converts a UTF-32 encoded string to UTF-8. Converts a UTF-32 encoded string to UTF-8.
</p> </p>
@ -593,6 +638,9 @@ assert (utf8result.size() == <span class="literal">9</span>);
<h4> <h4>
utf8::utf8to32 utf8::utf8to32
</h4> </h4>
<p class="version">
Available in version 1.0 and later.
</p>
<p> <p>
Converts a UTF-8 encoded string to UTF-32. Converts a UTF-8 encoded string to UTF-32.
</p> </p>
@ -632,6 +680,9 @@ assert (utf32result.size() == <span class="literal">2</span>);
<h4> <h4>
utf8::find_invalid utf8::find_invalid
</h4> </h4>
<p class="version">
Available in version 1.0 and later.
</p>
<p> <p>
Detects an invalid sequence within a UTF-8 string. Detects an invalid sequence within a UTF-8 string.
</p> </p>
@ -668,6 +719,9 @@ assert (invalid == utf_invalid + <span class="literal">5</span>);
<h4> <h4>
utf8::is_valid utf8::is_valid
</h4> </h4>
<p class="version">
Available in version 1.0 and later.
</p>
<p> <p>
Checks whether a sequence of octets is a valid UTF-8 string. Checks whether a sequence of octets is a valid UTF-8 string.
</p> </p>
@ -701,6 +755,9 @@ assert (bvalid == false);
<h4> <h4>
utf8::replace_invalid utf8::replace_invalid
</h4> </h4>
<p class="version">
Available in version 2.0 and later.
</p>
<p> <p>
Replaces all invalid UTF-8 sequences within a string with a replacement marker. Replaces all invalid UTF-8 sequences within a string with a replacement marker.
</p> </p>
@ -755,6 +812,9 @@ assert (std::equal(replace_invalid_result.begin(), replace_invalid_result.end(),
<h4> <h4>
utf8::is_bom utf8::is_bom
</h4> </h4>
<p class="version">
Available in version 1.0 and later.
</p>
<p> <p>
Checks whether a sequence of three octets is a UTF-8 byte order mark (BOM). Checks whether a sequence of three octets is a UTF-8 byte order mark (BOM).
</p> </p>
@ -783,12 +843,15 @@ assert (bbom == <span class="literal">true</span>);
they form the UTF-8 BOM, we want to skip them before processing the actual UTF-8 they form the UTF-8 BOM, we want to skip them before processing the actual UTF-8
encoded text. encoded text.
</p> </p>
<h3> <h3 id="typesutf8">
Types From utf8 Namespace Types From utf8 Namespace
</h3> </h3>
<h4> <h4>
utf8::iterator utf8::iterator
</h4> </h4>
<p class="version">
Available in version 2.0 and later.
</p>
<p> <p>
Adapts the underlying octet iterator to iterate over the sequence of code points, Adapts the underlying octet iterator to iterate over the sequence of code points,
rather than raw octets. rather than raw octets.
@ -862,12 +925,15 @@ assert (*it == <span class="literal">0x10346</span>);
std::string s = <span class="literal">"example"</span>; std::string s = <span class="literal">"example"</span>;
utf8::iterator i (s.begin(), s.begin(), s.end()); utf8::iterator i (s.begin(), s.begin(), s.end());
</pre> </pre>
<h3> <h3 id="fununchecked">
Functions From utf8::unchecked Namespace Functions From utf8::unchecked Namespace
</h3> </h3>
<h4> <h4>
utf8::unchecked::append utf8::unchecked::append
</h4> </h4>
<p class="version">
Available in version 1.0 and later.
</p>
<p> <p>
Encodes a 32 bit code point as a UTF-8 sequence of octets and appends the sequence Encodes a 32 bit code point as a UTF-8 sequence of octets and appends the sequence
to a UTF-8 string. to a UTF-8 string.
@ -910,6 +976,9 @@ assert (u[<span class="literal">0</span>] == <span class=
<h4> <h4>
utf8::unchecked::next utf8::unchecked::next
</h4> </h4>
<p class="version">
Available in version 1.0 and later.
</p>
<p> <p>
Given the iterator to the beginning of a UTF-8 sequence, it returns the code point Given the iterator to the beginning of a UTF-8 sequence, it returns the code point
and moves the iterator to the next position. and moves the iterator to the next position.
@ -945,6 +1014,9 @@ assert (w == twochars + <span class="literal">3</span>);
<h4> <h4>
utf8::unchecked::prior utf8::unchecked::prior
</h4> </h4>
<p class="version">
Available in version 1.02 and later.
</p>
<p> <p>
Given a reference to an iterator pointing to an octet in a UTF-8 sequence, it Given a reference to an iterator pointing to an octet in a UTF-8 sequence, it
decreases the iterator until it hits the beginning of the previous UTF-8 encoded decreases the iterator until it hits the beginning of the previous UTF-8 encoded
@ -981,6 +1053,9 @@ assert (w == twochars);
<h4> <h4>
utf8::unchecked::previous (deprecated, see utf8::unchecked::prior) utf8::unchecked::previous (deprecated, see utf8::unchecked::prior)
</h4> </h4>
<p class="version">
Deprecated in version 1.02 and later.
</p>
<p> <p>
Given a reference to an iterator pointing to an octet in a UTF-8 sequence, it Given a reference to an iterator pointing to an octet in a UTF-8 sequence, it
decreases the iterator until it hits the beginning of the previous UTF-8 encoded decreases the iterator until it hits the beginning of the previous UTF-8 encoded
@ -1023,6 +1098,9 @@ assert (w == twochars);
<h4> <h4>
utf8::unchecked::advance utf8::unchecked::advance
</h4> </h4>
<p class="version">
Available in version 1.0 and later.
</p>
<p> <p>
Advances an iterator by the specified number of code points within a UTF-8 Advances an iterator by the specified number of code points within a UTF-8
sequence. sequence.
@ -1061,6 +1139,9 @@ assert (w == twochars + <span class="literal">5</span>);
<h4> <h4>
utf8::unchecked::distance utf8::unchecked::distance
</h4> </h4>
<p class="version">
Available in version 1.0 and later.
</p>
<p> <p>
Given the iterators to two UTF-8 encoded code points in a sequence, returns the Given the iterators to two UTF-8 encoded code points in a sequence, returns the
number of code points between them. number of code points between them.
@ -1096,6 +1177,9 @@ assert (dist == <span class="literal">2</span>);
<h4> <h4>
utf8::unchecked::utf16to8 utf8::unchecked::utf16to8
</h4> </h4>
<p class="version">
Available in version 1.0 and later.
</p>
<p> <p>
Converts a UTF-16 encoded string to UTF-8. Converts a UTF-16 encoded string to UTF-8.
</p> </p>
@ -1136,6 +1220,9 @@ assert (utf8result.size() == <span class="literal">10</span>);
<h4> <h4>
utf8::unchecked::utf8to16 utf8::unchecked::utf8to16
</h4> </h4>
<p class="version">
Available in version 1.0 and later.
</p>
<p> <p>
Converts a UTF-8 encoded string to UTF-16. Converts a UTF-8 encoded string to UTF-16.
</p> </p>
@ -1176,6 +1263,9 @@ assert (utf16result[<span class="literal">3</span>] == <span class=
<h4> <h4>
utf8::unchecked::utf32to8 utf8::unchecked::utf32to8
</h4> </h4>
<p class="version">
Available in version 1.0 and later.
</p>
<p> <p>
Converts a UTF-32 encoded string to UTF-8. Converts a UTF-32 encoded string to UTF-8.
</p> </p>
@ -1215,6 +1305,9 @@ assert (utf8result.size() == <span class="literal">9</span>);
<h4> <h4>
utf8::unchecked::utf8to32 utf8::unchecked::utf8to32
</h4> </h4>
<p class="version">
Available in version 1.0 and later.
</p>
<p> <p>
Converts a UTF-8 encoded string to UTF-32. Converts a UTF-8 encoded string to UTF-32.
</p> </p>
@ -1249,12 +1342,15 @@ assert (utf32result.size() == <span class="literal">2</span>);
This is a faster but less safe version of <code>utf8::utf8to32</code>. It does not This is a faster but less safe version of <code>utf8::utf8to32</code>. It does not
check for validity of the supplied UTF-8 sequence. check for validity of the supplied UTF-8 sequence.
</p> </p>
<h3> <h3 id="typesunchecked">
Types From utf8::unchecked Namespace Types From utf8::unchecked Namespace
</h3> </h3>
<h4> <h4>
utf8::unchecked::iterator utf8::unchecked::iterator
</h4> </h4>
<p class="version">
Available in version 2.0 and later.
</p>
<p> <p>
Adapts the underlying octet iterator to iterate over the sequence of code points, Adapts the underlying octet iterator to iterate over the sequence of code points,
rather than raw octets. rather than raw octets.
@ -1380,8 +1476,8 @@ assert (*un_it == <span class="literal">0x10346</span>);
use other means to work with UTF-8 strings. Template functions I describe in this use other means to work with UTF-8 strings. Template functions I describe in this
article may be a good step in this direction. article may be a good step in this direction.
</p> </p>
<h2 id="references"> <h2 id="links">
References Links
</h2> </h2>
<ol> <ol>
<li> <li>

View file

@ -273,7 +273,7 @@ namespace utf8
} }
bool operator == (const iterator& rhs) const bool operator == (const iterator& rhs) const
{ {
if (range_start != rhs.range_start && range_end != rhs.range_end) if (range_start != rhs.range_start || range_end != rhs.range_end)
throw std::logic_error("Comparing utf-8 iterators defined with different ranges"); throw std::logic_error("Comparing utf-8 iterators defined with different ranges");
return (it == rhs.it); return (it == rhs.it);
} }

View file

@ -34,6 +34,12 @@ int main()
const unsigned* u = find(INVALID_LINES, INVALID_LINES_END, line_count); const unsigned* u = find(INVALID_LINES, INVALID_LINES_END, line_count);
if (u == INVALID_LINES_END) if (u == INVALID_LINES_END)
cout << "Unexpected invalid utf-8 at line " << line_count << '\n'; cout << "Unexpected invalid utf-8 at line " << line_count << '\n';
// try fixing it:
string fixed_line;
replace_invalid(line.begin(), line.end(), back_inserter(fixed_line));
if (!is_valid(fixed_line.begin(), fixed_line.end()))
cout << "replace_invalid() resulted in an invalid utf-8 at line " << line_count << '\n';
} }
} }
} }