Completed documentation for the exceptions. Fixed bug ID: 2960112: is_bom wording fix

git-svn-id: http://svn.code.sf.net/p/utfcpp/code@112 a809a056-fc17-0410-9590-b4f493f8b08e
This commit is contained in:
ntrifunovic 2010-04-17 17:09:40 +00:00 committed by King_DuckZ
parent 4bfad91501
commit e628e37223
3 changed files with 173 additions and 19 deletions

View file

@ -67,7 +67,7 @@
<li>
<a href=#fixinvalid>Ensure that a string contains valid UTF-8 text</a>
</li>
</li>
</ul>
<li>
<a href="#reference">Reference</a>
<ul class="toc">
@ -275,11 +275,12 @@ octet_iterator append(uint32_t cp, octet_iterator result);
</pre>
<p>
<code>cp</code>: A 32 bit integer representing a code point to append to the
<code>octet_iterator</code>: an output iterator.<br>
<code>cp</code>: a 32 bit integer representing a code point to append to the
sequence.<br>
<code>result</code>: An output iterator to the place in the sequence where to
<code>result</code>: an output iterator to the place in the sequence where to
append the code point.<br>
<span class="return_value">Return value</span>: An iterator pointing to the place
<span class="return_value">Return value</span>: an iterator pointing to the place
after the newly appended sequence.
</p>
<p>
@ -326,6 +327,7 @@ uint32_t next(octet_iterator&amp; it, octet_iterator end);
</pre>
<p>
<code>octet_iterator</code>: an input iterator.<br>
<code>it</code>: a reference to an iterator pointing to the beginning of an UTF-8
encoded code point. After the function returns, it is incremented to point to the
beginning of the next code point.<br>
@ -370,6 +372,7 @@ uint32_t peek_next(octet_iterator it, octet_iterator end);
</pre>
<p>
<code>octet_iterator</code>: an input iterator.<br>
<code>it</code>: an iterator pointing to the beginning of an UTF-8
encoded code point.<br>
<code>end</code>: end of the UTF-8 sequence to be processed. If <code>it</code>
@ -400,7 +403,7 @@ assert (w == twochars);
Available in version 1.02 and later.
</p>
<p>
Given a reference to an iterator pointing to an octet in a UTF-8 seqence, it
Given a reference to an iterator pointing to an octet in a UTF-8 sequence, it
decreases the iterator until it hits the beginning of the previous UTF-8 encoded
code point and returns the 32 bits representation of the code point.
</p>
@ -411,6 +414,7 @@ uint32_t prior(octet_iterator&amp; it, octet_iterator start);
</pre>
<p>
<code>octet_iterator</code>: a bidirectional iterator.<br>
<code>it</code>: a reference pointing to an octet within a UTF-8 encoded string.
After the function returns, it is decremented to point to the beginning of the
previous code point.<br>
@ -469,6 +473,7 @@ uint32_t previous(octet_iterator&amp; it, octet_iterator pass_start);
</pre>
<p>
<code>octet_iterator</code>: a random access iterator.<br>
<code>it</code>: a reference pointing to an octet within a UTF-8 encoded string.
After the function returns, it is decremented to point to the beginning of the
previous code point.<br>
@ -529,6 +534,8 @@ assert (w == twochars);
</pre>
<p>
<code>octet_iterator</code>: an input iterator.<br>
<code>distance_type</code>: an integral type convertible to <code>octet_iterator</code>'s difference type.<br>
<code>it</code>: a reference to an iterator pointing to the beginning of an UTF-8
encoded code point. After the function returns, it is incremented to point to the
nth following code point.<br>
@ -574,8 +581,9 @@ assert (w == twochars + <span class="literal">5</span>);
</pre>
<p>
<code>octet_iterator</code>: an input iterator.<br>
<code>first</code>: an iterator to a beginning of a UTF-8 encoded code point.<br>
<code>last</code>: an iterator to a "post-end" of the last UTF-8 encoded code
<code>last</code>: an iterator to a "post-end" of the last UTF-8 encoded code
point in the sequence we are trying to determine the length. It can be the
beginning of a new code point, or not.<br>
<span class="return_value">Return value</span>: the distance between the iterators,
@ -619,6 +627,8 @@ octet_iterator utf16to8 (u16bit_iterator start, u16bit_iterator end, octet_itera
</pre>
<p>
<code>u16bit_iterator</code>: an input iterator.<br>
<code>octet_iterator</code>: an output iterator.<br>
<code>start</code>: an iterator pointing to the beginning of the UTF-16 encoded
string to convert.<br>
<code>end</code>: an iterator pointing to pass-the-end of the UTF-16 encoded
@ -661,6 +671,8 @@ u16bit_iterator utf8to16 (octet_iterator start, octet_iterator end, u16bit_itera
</pre>
<p>
<code>octet_iterator</code>: an input iterator.<br>
<code>u16bit_iterator</code>: an output iterator.<br>
<code>start</code>: an iterator pointing to the beginning of the UTF-8 encoded
string to convert.<br> <code>end</code>: an iterator pointing to
pass-the-end of the UTF-8 encoded string to convert.<br>
@ -705,6 +717,8 @@ octet_iterator utf32to8 (u32bit_iterator start, u32bit_iterator end, octet_itera
</pre>
<p>
<code>octet_iterator</code>: an output iterator.<br>
<code>u32bit_iterator</code>: an input iterator.<br>
<code>start</code>: an iterator pointing to the beginning of the UTF-32 encoded
string to convert.<br>
<code>end</code>: an iterator pointing to pass-the-end of the UTF-32 encoded
@ -747,6 +761,8 @@ u32bit_iterator utf8to32 (octet_iterator start, octet_iterator end, u32bit_itera
</pre>
<p>
<code>octet_iterator</code>: an input iterator.<br>
<code>u32bit_iterator</code>: an output iterator.<br>
<code>start</code>: an iterator pointing to the beginning of the UTF-8 encoded
string to convert.<br>
<code>end</code>: an iterator pointing to pass-the-end of the UTF-8 encoded string
@ -787,6 +803,7 @@ assert (utf32result.size() == <span class="literal">2</span>);
octet_iterator find_invalid(octet_iterator start, octet_iterator end);
</pre>
<p>
<code>octet_iterator</code>: an input iterator.<br>
<code>start</code>: an iterator pointing to the beginning of the UTF-8 string to
test for validity.<br>
<code>end</code>: an iterator pointing to pass-the-end of the UTF-8 string to test
@ -827,6 +844,7 @@ assert (invalid == utf_invalid + <span class="literal">5</span>);
</pre>
<p>
<code>octet_iterator</code>: an input iterator.<br>
<code>start</code>: an iterator pointing to the beginning of the UTF-8 string to
test for validity.<br>
<code>end</code>: an iterator pointing to pass-the-end of the UTF-8 string to test
@ -868,6 +886,8 @@ output_iterator replace_invalid(octet_iterator start, octet_iterator end, output
</pre>
<p>
<code>octet_iterator</code>: an input iterator.<br>
<code>output_iterator</code>: an output iterator.<br>
<code>start</code>: an iterator pointing to the beginning of the UTF-8 string to
look for invalid UTF-8 sequences.<br>
<code>end</code>: an iterator pointing to pass-the-end of the UTF-8 string to look
@ -904,12 +924,48 @@ assert (std::equal(replace_invalid_result.begin(), replace_invalid_result.end(),
If <code>end</code> does not point to the past-of-end of a UTF-8 sequence, a
<code>utf8::not_enough_room</code> exception is thrown.
</p>
<h4>
utf8::starts_with_bom
</h4>
<p class="version">
Available in version 2.3 and later. Replaces the deprecated <code>is_bom()</code> function.
</p>
<p>
Checks whether an octet sequence starts with a UTF-8 byte order mark (BOM).
</p>
<pre>
<span class="keyword">template</span> &lt;<span class=
"keyword">typename</span> octet_iterator&gt;
<span class="keyword">bool</span> starts_with_bom (octet_iterator it, octet_iterator end);
</pre>
<p>
<code>octet_iterator</code>: an input iterator.<br>
<code>it</code>: beginning of the octet sequence to check<br>
<code>end</code>: pass-end of the sequence to check<br>
<span class="return_value">Return value</span>: <code>true</code> if the sequence
starts with a UTF-8 byte order mark; <code>false</code> if not.
</p>
<p>
Example of use:
</p>
<pre>
<span class="keyword">unsigned char</span> byte_order_mark[] = {<span class=
"literal">0xef</span>, <span class="literal">0xbb</span>, <span class=
"literal">0xbf</span>};
<span class="keyword">bool</span> bbom = starts_with_bom(byte_order_mark, byte_order_mark + <span class="keyword">sizeof</span>(byte_order_mark));
assert (bbom == <span class="literal">true</span>);
</pre>
<p>
The typical use of this function is to check the first three bytes of a file. If
they form the UTF-8 BOM, we want to skip them before processing the actual UTF-8
encoded text.
</p>
<h4>
utf8::is_bom
</h4>
<p class="version">
Available in version 1.0 and later (the deprecated one argument version) and in version 2.3
and later (the two argument version).
Available in version 1.0 and later. Deprecated in version 2.3. <code>starts_with_bom()</code> should be used
instead.
</p>
<p>
Checks whether a sequence of three octets is a UTF-8 byte order mark (BOM).
@ -917,14 +973,11 @@ assert (std::equal(replace_invalid_result.begin(), replace_invalid_result.end(),
<pre>
<span class="keyword">template</span> &lt;<span class=
"keyword">typename</span> octet_iterator&gt;
<span class="keyword">bool</span> is_bom (octet_iterator it, octet_iterator end);
<span class="keyword">template</span> &lt;<span class=
"keyword">typename</span> octet_iterator&gt;
<span class="keyword">bool</span> is_bom (octet_iterator it); <span class="comment"> // Deprecated</span>
</pre>
<p>
<code>octet_iterator</code>: an input iterator.<br>
<code>it</code>: beginning of the 3-octet sequence to check<br>
<code>end</code>: pass-end of the sequence to check<br>
<span class="return_value">Return value</span>: <code>true</code> if the sequence
is UTF-8 byte order mark; <code>false</code> if not.
</p>
@ -944,13 +997,112 @@ assert (bbom == <span class="literal">true</span>);
encoded text.
</p>
<p>
The older version of the function that takes only one argument is unsafe: if a sequence is
shorter than three bytes, an invalid iterator will be dereferenced. Therefore it is deprecated
in favor of the safer version that takes the end of sequence as an argument.
If a sequence is
shorter than three bytes, an invalid iterator will be dereferenced. Therefore, this function is deprecated
in favor of <code>starts_with_bom()</code>, which takes the end of the sequence as an argument.
</p>
<h3 id="typesutf8">
Types From utf8 Namespace
</h3>
<h4>utf8::exception
</h4>
<p class="version">
Available in version 2.3 and later.
</p>
<p>
Base class for the exceptions thrown by UTF CPP library functions.
</p>
<pre>
<span class="keyword">class</span> exception : <span class="keyword">public</span> std::exception {};
</pre>
<p>
Example of use:
</p>
<pre>
<span class="keyword">try</span> {
code_that_uses_utf_cpp_library();
}
<span class="keyword">catch</span>(<span class="keyword">const</span> utf8::exception&amp; utfcpp_ex) {
cerr &lt;&lt; utfcpp_ex.what();
}
</pre>
<h4>utf8::invalid_code_point
</h4>
<p class="version">
Available in version 1.0 and later.
</p>
<p>
Thrown by UTF8 CPP functions such as <code>advance</code> and <code>next</code> if a UTF-8 sequence represents an invalid code point.
</p>
<pre>
<span class="keyword">class</span> invalid_code_point : <span class="keyword">public</span> exception {
<span class="keyword">public</span>:
uint32_t code_point() <span class="keyword">const</span>;
};
</pre>
<p>
Member function <code>code_point()</code> can be used to determine the invalid code point that
caused the exception to be thrown.
</p>
<h4>utf8::invalid_utf8
</h4>
<p class="version">
Available in version 1.0 and later.
</p>
<p>
Thrown by UTF8 CPP functions such as <code>next</code> and <code>prior</code> if an invalid UTF-8 sequence
is detected during decoding.
</p>
<pre>
<span class="keyword">class</span> invalid_utf8 : <span class="keyword">public</span> exception {
<span class="keyword">public</span>:
uint8_t utf8_octet() <span class="keyword">const</span>;
};
</pre>
<p>
Member function <code>utf8_octet()</code> can be used to determine the beginning of the byte
sequence that caused the exception to be thrown.
</p>
<h4>utf8::invalid_utf16
</h4>
<p class="version">
Available in version 1.0 and later.
</p>
<p>
Thrown by UTF8 CPP function <code>utf16to8</code> if an invalid UTF-16 sequence
is detected during decoding.
</p>
<pre>
<span class="keyword">class</span> invalid_utf16 : <span class="keyword">public</span> exception {
<span class="keyword">public</span>:
uint16_t utf16_word() <span class="keyword">const</span>;
};
</pre>
<p>
Member function <code>utf16_word()</code> can be used to determine the UTF-16 code unit
that caused the exception to be thrown.
</p>
<h4>utf8::not_enough_room
</h4>
<p class="version">
Available in version 1.0 and later.
</p>
<p>
Thrown by UTF8 CPP functions such as <code>next</code> if the end of the decoded UTF-8 sequence
was reached before the code point was decoded.
</p>
<pre>
<span class="keyword">class</span> not_enough_room : <span class="keyword">public</span> exception {};
</pre>
<h4>
utf8::iterator
</h4>

View file

@ -332,7 +332,7 @@ namespace internal
}
template <typename octet_iterator>
inline bool is_bom (octet_iterator it, octet_iterator end)
inline bool starts_with_bom (octet_iterator it, octet_iterator end)
{
return (
((it != end) && (internal::mask8(*it++)) == bom[0]) &&

View file

@ -139,12 +139,14 @@ int main()
bvalid = is_valid(utf8_with_surrogates, utf8_with_surrogates + 9);
assert (bvalid == true);
//is_bom
//starts_with_bom
unsigned char byte_order_mark[] = {0xef, 0xbb, 0xbf};
bool bbom = is_bom(byte_order_mark, byte_order_mark + sizeof(byte_order_mark));
bool bbom = starts_with_bom(byte_order_mark, byte_order_mark + sizeof(byte_order_mark));
assert (bbom == true);
bool no_bbom = is_bom(threechars, threechars + sizeof(threechars));
bool no_bbom = starts_with_bom(threechars, threechars + sizeof(threechars));
assert (no_bbom == false);
//is_bom
bool unsafe_bbom = is_bom(byte_order_mark);
assert (unsafe_bbom == true);