Updated documentation. Fixed a small bug in checked.h. Added new checks to the negative tests
git-svn-id: http://svn.code.sf.net/p/utfcpp/code@78 a809a056-fc17-0410-9590-b4f493f8b08e
This commit is contained in:
parent
cd3092c0ca
commit
baf63b327a
3 changed files with 112 additions and 10 deletions
|
@ -33,6 +33,10 @@
|
|||
ul.toc {
|
||||
list-style-type: none;
|
||||
}
|
||||
p.version {
|
||||
font-size: small;
|
||||
font-style: italic;
|
||||
}
|
||||
-->
|
||||
</style>
|
||||
</head>
|
||||
|
@ -56,6 +60,20 @@
|
|||
</li>
|
||||
<li>
|
||||
<a href="#reference">Reference</a>
|
||||
<ul class="toc">
|
||||
<li>
|
||||
<a href="#funutf8">Functions From utf8 Namespace </a>
|
||||
</li>
|
||||
<li>
|
||||
<a href="#typesutf8">Types From utf8 Namespace </a>
|
||||
</li>
|
||||
<li>
|
||||
<a href="#fununchecked">Functions From utf8::unchecked Namespace </a>
|
||||
</li>
|
||||
<li>
|
||||
<a href="#typesunchecked">Types From utf8::unchecked Namespace </a>
|
||||
</li>
|
||||
</ul>
|
||||
</li>
|
||||
<li>
|
||||
<a href="#points">Points of Interest</a>
|
||||
|
@ -64,7 +82,7 @@
|
|||
<a href="#conclusion">Conclusion</a>
|
||||
</li>
|
||||
<li>
|
||||
<a href="#references">References</a>
|
||||
<a href="#links">Links</a>
|
||||
</li>
|
||||
</ul>
|
||||
</div>
|
||||
|
@ -182,12 +200,15 @@
|
|||
<h2 id="reference">
|
||||
Reference
|
||||
</h2>
|
||||
<h3>
|
||||
<h3 id="funutf8">
|
||||
Functions From utf8 Namespace
|
||||
</h3>
|
||||
<h4>
|
||||
utf8::append
|
||||
</h4>
|
||||
<p class="version">
|
||||
Available in version 1.0 and later.
|
||||
</p>
|
||||
<p>
|
||||
Encodes a 32 bit code point as a UTF-8 sequence of octets and appends the sequence
|
||||
to a UTF-8 string.
|
||||
|
@ -236,6 +257,9 @@ assert (u[<span class="literal">0</span>] == <span class=
|
|||
<h4>
|
||||
utf8::next
|
||||
</h4>
|
||||
<p class="version">
|
||||
Available in version 1.0 and later.
|
||||
</p>
|
||||
<p>
|
||||
Given the iterator to the beginning of the UTF-8 sequence, it returns the code
|
||||
point and moves the iterator to the next position.
|
||||
|
@ -277,6 +301,9 @@ assert (w == twochars + <span class="literal">3</span>);
|
|||
<h4>
|
||||
utf8::prior
|
||||
</h4>
|
||||
<p class="version">
|
||||
Available in version 1.02 and later.
|
||||
</p>
|
||||
<p>
|
||||
Given a reference to an iterator pointing to an octet in a UTF-8 sequence, it
|
||||
decreases the iterator until it hits the beginning of the previous UTF-8 encoded
|
||||
|
@ -330,8 +357,11 @@ assert (w == twochars);
|
|||
exception is thrown.
|
||||
</p>
|
||||
<h4>
|
||||
utf8::previous (deprecated, see utf8::prior)
|
||||
utf8::previous
|
||||
</h4>
|
||||
<p class="version">
|
||||
Deprecated in version 1.02 and later.
|
||||
</p>
|
||||
<p>
|
||||
Given a reference to an iterator pointing to an octet in a UTF-8 sequence, it
|
||||
decreases the iterator until it hits the beginning of the previous UTF-8 encoded
|
||||
|
@ -389,6 +419,9 @@ assert (w == twochars);
|
|||
<h4>
|
||||
utf8::advance
|
||||
</h4>
|
||||
<p class="version">
|
||||
Available in version 1.0 and later.
|
||||
</p>
|
||||
<p>
|
||||
Advances an iterator by the specified number of code points within a UTF-8
|
||||
sequence.
|
||||
|
@ -431,6 +464,9 @@ assert (w == twochars + <span class="literal">5</span>);
|
|||
<h4>
|
||||
utf8::distance
|
||||
</h4>
|
||||
<p class="version">
|
||||
Available in version 1.0 and later.
|
||||
</p>
|
||||
<p>
|
||||
Given the iterators to two UTF-8 encoded code points in a sequence, returns the
|
||||
number of code points between them.
|
||||
|
@ -474,6 +510,9 @@ assert (dist == <span class="literal">2</span>);
|
|||
<h4>
|
||||
utf8::utf16to8
|
||||
</h4>
|
||||
<p class="version">
|
||||
Available in version 1.0 and later.
|
||||
</p>
|
||||
<p>
|
||||
Converts a UTF-16 encoded string to UTF-8.
|
||||
</p>
|
||||
|
@ -514,6 +553,9 @@ assert (utf8result.size() == <span class="literal">10</span>);
|
|||
<h4>
|
||||
utf8::utf8to16
|
||||
</h4>
|
||||
<p class="version">
|
||||
Available in version 1.0 and later.
|
||||
</p>
|
||||
<p>
|
||||
Converts a UTF-8 encoded string to UTF-16.
|
||||
</p>
|
||||
|
@ -555,6 +597,9 @@ assert (utf16result[<span class="literal">3</span>] == <span class=
|
|||
<h4>
|
||||
utf8::utf32to8
|
||||
</h4>
|
||||
<p class="version">
|
||||
Available in version 1.0 and later.
|
||||
</p>
|
||||
<p>
|
||||
Converts a UTF-32 encoded string to UTF-8.
|
||||
</p>
|
||||
|
@ -593,6 +638,9 @@ assert (utf8result.size() == <span class="literal">9</span>);
|
|||
<h4>
|
||||
utf8::utf8to32
|
||||
</h4>
|
||||
<p class="version">
|
||||
Available in version 1.0 and later.
|
||||
</p>
|
||||
<p>
|
||||
Converts a UTF-8 encoded string to UTF-32.
|
||||
</p>
|
||||
|
@ -632,6 +680,9 @@ assert (utf32result.size() == <span class="literal">2</span>);
|
|||
<h4>
|
||||
utf8::find_invalid
|
||||
</h4>
|
||||
<p class="version">
|
||||
Available in version 1.0 and later.
|
||||
</p>
|
||||
<p>
|
||||
Detects an invalid sequence within a UTF-8 string.
|
||||
</p>
|
||||
|
@ -668,6 +719,9 @@ assert (invalid == utf_invalid + <span class="literal">5</span>);
|
|||
<h4>
|
||||
utf8::is_valid
|
||||
</h4>
|
||||
<p class="version">
|
||||
Available in version 1.0 and later.
|
||||
</p>
|
||||
<p>
|
||||
Checks whether a sequence of octets is a valid UTF-8 string.
|
||||
</p>
|
||||
|
@ -701,6 +755,9 @@ assert (bvalid == false);
|
|||
<h4>
|
||||
utf8::replace_invalid
|
||||
</h4>
|
||||
<p class="version">
|
||||
Available in version 2.0 and later.
|
||||
</p>
|
||||
<p>
|
||||
Replaces all invalid UTF-8 sequences within a string with a replacement marker.
|
||||
</p>
|
||||
|
@ -755,6 +812,9 @@ assert (std::equal(replace_invalid_result.begin(), replace_invalid_result.end(),
|
|||
<h4>
|
||||
utf8::is_bom
|
||||
</h4>
|
||||
<p class="version">
|
||||
Available in version 1.0 and later.
|
||||
</p>
|
||||
<p>
|
||||
Checks whether a sequence of three octets is a UTF-8 byte order mark (BOM).
|
||||
</p>
|
||||
|
@ -783,12 +843,15 @@ assert (bbom == <span class="literal">true</span>);
|
|||
they form the UTF-8 BOM, we want to skip them before processing the actual UTF-8
|
||||
encoded text.
|
||||
</p>
|
||||
<h3>
|
||||
<h3 id="typesutf8">
|
||||
Types From utf8 Namespace
|
||||
</h3>
|
||||
<h4>
|
||||
utf8::iterator
|
||||
</h4>
|
||||
<p class="version">
|
||||
Available in version 2.0 and later.
|
||||
</p>
|
||||
<p>
|
||||
Adapts the underlying octet iterator to iterate over the sequence of code points,
|
||||
rather than raw octets.
|
||||
|
@ -862,12 +925,15 @@ assert (*it == <span class="literal">0x10346</span>);
|
|||
std::string s = <span class="literal">"example"</span>;
|
||||
utf8::iterator i (s.begin(), s.begin(), s.end());
|
||||
</pre>
|
||||
<h3>
|
||||
<h3 id="fununchecked">
|
||||
Functions From utf8::unchecked Namespace
|
||||
</h3>
|
||||
<h4>
|
||||
utf8::unchecked::append
|
||||
</h4>
|
||||
<p class="version">
|
||||
Available in version 1.0 and later.
|
||||
</p>
|
||||
<p>
|
||||
Encodes a 32 bit code point as a UTF-8 sequence of octets and appends the sequence
|
||||
to a UTF-8 string.
|
||||
|
@ -910,6 +976,9 @@ assert (u[<span class="literal">0</span>] == <span class=
|
|||
<h4>
|
||||
utf8::unchecked::next
|
||||
</h4>
|
||||
<p class="version">
|
||||
Available in version 1.0 and later.
|
||||
</p>
|
||||
<p>
|
||||
Given the iterator to the beginning of a UTF-8 sequence, it returns the code point
|
||||
and moves the iterator to the next position.
|
||||
|
@ -945,6 +1014,9 @@ assert (w == twochars + <span class="literal">3</span>);
|
|||
<h4>
|
||||
utf8::unchecked::prior
|
||||
</h4>
|
||||
<p class="version">
|
||||
Available in version 1.02 and later.
|
||||
</p>
|
||||
<p>
|
||||
Given a reference to an iterator pointing to an octet in a UTF-8 sequence, it
|
||||
decreases the iterator until it hits the beginning of the previous UTF-8 encoded
|
||||
|
@ -981,6 +1053,9 @@ assert (w == twochars);
|
|||
<h4>
|
||||
utf8::unchecked::previous (deprecated, see utf8::unchecked::prior)
|
||||
</h4>
|
||||
<p class="version">
|
||||
Deprecated in version 1.02 and later.
|
||||
</p>
|
||||
<p>
|
||||
Given a reference to an iterator pointing to an octet in a UTF-8 sequence, it
|
||||
decreases the iterator until it hits the beginning of the previous UTF-8 encoded
|
||||
|
@ -1023,6 +1098,9 @@ assert (w == twochars);
|
|||
<h4>
|
||||
utf8::unchecked::advance
|
||||
</h4>
|
||||
<p class="version">
|
||||
Available in version 1.0 and later.
|
||||
</p>
|
||||
<p>
|
||||
Advances an iterator by the specified number of code points within a UTF-8
|
||||
sequence.
|
||||
|
@ -1061,6 +1139,9 @@ assert (w == twochars + <span class="literal">5</span>);
|
|||
<h4>
|
||||
utf8::unchecked::distance
|
||||
</h4>
|
||||
<p class="version">
|
||||
Available in version 1.0 and later.
|
||||
</p>
|
||||
<p>
|
||||
Given the iterators to two UTF-8 encoded code points in a sequence, returns the
|
||||
number of code points between them.
|
||||
|
@ -1096,6 +1177,9 @@ assert (dist == <span class="literal">2</span>);
|
|||
<h4>
|
||||
utf8::unchecked::utf16to8
|
||||
</h4>
|
||||
<p class="version">
|
||||
Available in version 1.0 and later.
|
||||
</p>
|
||||
<p>
|
||||
Converts a UTF-16 encoded string to UTF-8.
|
||||
</p>
|
||||
|
@ -1136,6 +1220,9 @@ assert (utf8result.size() == <span class="literal">10</span>);
|
|||
<h4>
|
||||
utf8::unchecked::utf8to16
|
||||
</h4>
|
||||
<p class="version">
|
||||
Available in version 1.0 and later.
|
||||
</p>
|
||||
<p>
|
||||
Converts a UTF-8 encoded string to UTF-16.
|
||||
</p>
|
||||
|
@ -1176,6 +1263,9 @@ assert (utf16result[<span class="literal">3</span>] == <span class=
|
|||
<h4>
|
||||
utf8::unchecked::utf32to8
|
||||
</h4>
|
||||
<p class="version">
|
||||
Available in version 1.0 and later.
|
||||
</p>
|
||||
<p>
|
||||
Converts a UTF-32 encoded string to UTF-8.
|
||||
</p>
|
||||
|
@ -1215,6 +1305,9 @@ assert (utf8result.size() == <span class="literal">9</span>);
|
|||
<h4>
|
||||
utf8::unchecked::utf8to32
|
||||
</h4>
|
||||
<p class="version">
|
||||
Available in version 1.0 and later.
|
||||
</p>
|
||||
<p>
|
||||
Converts a UTF-8 encoded string to UTF-32.
|
||||
</p>
|
||||
|
@ -1249,12 +1342,15 @@ assert (utf32result.size() == <span class="literal">2</span>);
|
|||
This is a faster but less safe version of <code>utf8::utf8to32</code>. It does not
|
||||
check for validity of the supplied UTF-8 sequence.
|
||||
</p>
|
||||
<h3>
|
||||
<h3 id="typesunchecked">
|
||||
Types From utf8::unchecked Namespace
|
||||
</h3>
|
||||
<h4>
|
||||
utf8::unchecked::iterator
|
||||
</h4>
|
||||
<p class="version">
|
||||
Available in version 2.0 and later.
|
||||
</p>
|
||||
<p>
|
||||
Adapts the underlying octet iterator to iterate over the sequence of code points,
|
||||
rather than raw octets.
|
||||
|
@ -1380,8 +1476,8 @@ assert (*un_it == <span class="literal">0x10346</span>);
|
|||
use other means to work with UTF-8 strings. Template functions I describe in this
|
||||
article may be a good step in this direction.
|
||||
</p>
|
||||
<h2 id="references">
|
||||
References
|
||||
<h2 id="links">
|
||||
Links
|
||||
</h2>
|
||||
<ol>
|
||||
<li>
|
||||
|
|
|
@ -262,7 +262,7 @@ namespace utf8
|
|||
it(octet_it), range_start(range_start), range_end(range_end)
|
||||
{
|
||||
if (it < range_start || it > range_end)
|
||||
throw std::out_of_range("Invalid utf-8 iterator position");
|
||||
throw std::out_of_range("Invalid utf-8 iterator position");
|
||||
}
|
||||
// the default "big three" are OK
|
||||
octet_iterator base () const { return it; }
|
||||
|
@ -273,7 +273,7 @@ namespace utf8
|
|||
}
|
||||
bool operator == (const iterator& rhs) const
|
||||
{
|
||||
if (range_start != rhs.range_start && range_end != rhs.range_end)
|
||||
if (range_start != rhs.range_start || range_end != rhs.range_end)
|
||||
throw std::logic_error("Comparing utf-8 iterators defined with different ranges");
|
||||
return (it == rhs.it);
|
||||
}
|
||||
|
|
|
@ -34,6 +34,12 @@ int main()
|
|||
const unsigned* u = find(INVALID_LINES, INVALID_LINES_END, line_count);
|
||||
if (u == INVALID_LINES_END)
|
||||
cout << "Unexpected invalid utf-8 at line " << line_count << '\n';
|
||||
|
||||
// try fixing it:
|
||||
string fixed_line;
|
||||
replace_invalid(line.begin(), line.end(), back_inserter(fixed_line));
|
||||
if (!is_valid(fixed_line.begin(), fixed_line.end()))
|
||||
cout << "replace_invalid() resulted in an invalid utf-8 at line " << line_count << '\n';
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
Loading…
Reference in a new issue