Completed documentation for the exceptions. Fixed bug ID: 2960112: is_bom wording fix
git-svn-id: http://svn.code.sf.net/p/utfcpp/code@112 a809a056-fc17-0410-9590-b4f493f8b08e
This commit is contained in:
parent
4bfad91501
commit
e628e37223
3 changed files with 173 additions and 19 deletions
180
doc/utf8cpp.html
180
doc/utf8cpp.html
|
@ -67,7 +67,7 @@
|
|||
<li>
|
||||
<a href=#fixinvalid>Ensure that a string contains valid UTF-8 text</a>
|
||||
</li>
|
||||
</li>
|
||||
</ul>
|
||||
<li>
|
||||
<a href="#reference">Reference</a>
|
||||
<ul class="toc">
|
||||
|
@ -275,11 +275,12 @@ octet_iterator append(uint32_t cp, octet_iterator result);
|
|||
|
||||
</pre>
|
||||
<p>
|
||||
<code>cp</code>: A 32 bit integer representing a code point to append to the
|
||||
<code>octet_iterator</code>: an output iterator.<br>
|
||||
<code>cp</code>: a 32 bit integer representing a code point to append to the
|
||||
sequence.<br>
|
||||
<code>result</code>: An output iterator to the place in the sequence where to
|
||||
<code>result</code>: an output iterator to the place in the sequence where to
|
||||
append the code point.<br>
|
||||
<span class="return_value">Return value</span>: An iterator pointing to the place
|
||||
<span class="return_value">Return value</span>: an iterator pointing to the place
|
||||
after the newly appended sequence.
|
||||
</p>
|
||||
<p>
|
||||
|
@ -326,6 +327,7 @@ uint32_t next(octet_iterator& it, octet_iterator end);
|
|||
|
||||
</pre>
|
||||
<p>
|
||||
<code>octet_iterator</code>: an input iterator.<br>
|
||||
<code>it</code>: a reference to an iterator pointing to the beginning of a UTF-8
|
||||
encoded code point. After the function returns, it is incremented to point to the
|
||||
beginning of the next code point.<br>
|
||||
|
@ -370,6 +372,7 @@ uint32_t peek_next(octet_iterator it, octet_iterator end);
|
|||
|
||||
</pre>
|
||||
<p>
|
||||
<code>octet_iterator</code>: an input iterator.<br>
|
||||
<code>it</code>: an iterator pointing to the beginning of a UTF-8
|
||||
encoded code point.<br>
|
||||
<code>end</code>: end of the UTF-8 sequence to be processed. If <code>it</code>
|
||||
|
@ -400,7 +403,7 @@ assert (w == twochars);
|
|||
Available in version 1.02 and later.
|
||||
</p>
|
||||
<p>
|
||||
Given a reference to an iterator pointing to an octet in a UTF-8 seqence, it
|
||||
Given a reference to an iterator pointing to an octet in a UTF-8 sequence, it
|
||||
decreases the iterator until it hits the beginning of the previous UTF-8 encoded
|
||||
code point and returns the 32 bits representation of the code point.
|
||||
</p>
|
||||
|
@ -411,6 +414,7 @@ uint32_t prior(octet_iterator& it, octet_iterator start);
|
|||
|
||||
</pre>
|
||||
<p>
|
||||
<code>octet_iterator</code>: a bidirectional iterator.<br>
|
||||
<code>it</code>: a reference pointing to an octet within a UTF-8 encoded string.
|
||||
After the function returns, it is decremented to point to the beginning of the
|
||||
previous code point.<br>
|
||||
|
@ -469,6 +473,7 @@ uint32_t previous(octet_iterator& it, octet_iterator pass_start);
|
|||
|
||||
</pre>
|
||||
<p>
|
||||
<code>octet_iterator</code>: a random access iterator.<br>
|
||||
<code>it</code>: a reference pointing to an octet within a UTF-8 encoded string.
|
||||
After the function returns, it is decremented to point to the beginning of the
|
||||
previous code point.<br>
|
||||
|
@ -529,6 +534,8 @@ assert (w == twochars);
|
|||
|
||||
</pre>
|
||||
<p>
|
||||
<code>octet_iterator</code>: an input iterator.<br>
|
||||
<code>distance_type</code>: an integral type convertible to <code>octet_iterator</code>'s difference type.<br>
|
||||
<code>it</code>: a reference to an iterator pointing to the beginning of a UTF-8
|
||||
encoded code point. After the function returns, it is incremented to point to the
|
||||
nth following code point.<br>
|
||||
|
@ -574,6 +581,7 @@ assert (w == twochars + <span class="literal">5</span>);
|
|||
|
||||
</pre>
|
||||
<p>
|
||||
<code>octet_iterator</code>: an input iterator.<br>
|
||||
<code>first</code>: an iterator to a beginning of a UTF-8 encoded code point.<br>
|
||||
<code>last</code>: an iterator to a "post-end" of the last UTF-8 encoded code
|
||||
point in the sequence we are trying to determine the length. It can be the
|
||||
|
@ -619,6 +627,8 @@ octet_iterator utf16to8 (u16bit_iterator start, u16bit_iterator end, octet_itera
|
|||
|
||||
</pre>
|
||||
<p>
|
||||
<code>u16bit_iterator</code>: an input iterator.<br>
|
||||
<code>octet_iterator</code>: an output iterator.<br>
|
||||
<code>start</code>: an iterator pointing to the beginning of the UTF-16 encoded
|
||||
string to convert.<br>
|
||||
<code>end</code>: an iterator pointing to past-the-end of the UTF-16 encoded
|
||||
|
@ -661,6 +671,8 @@ u16bit_iterator utf8to16 (octet_iterator start, octet_iterator end, u16bit_itera
|
|||
|
||||
</pre>
|
||||
<p>
|
||||
<code>octet_iterator</code>: an input iterator.<br>
|
||||
<code>u16bit_iterator</code>: an output iterator.<br>
|
||||
<code>start</code>: an iterator pointing to the beginning of the UTF-8 encoded
|
||||
string to convert.<br> <code>end</code>: an iterator pointing to
|
||||
past-the-end of the UTF-8 encoded string to convert.<br>
|
||||
|
@ -705,6 +717,8 @@ octet_iterator utf32to8 (u32bit_iterator start, u32bit_iterator end, octet_itera
|
|||
|
||||
</pre>
|
||||
<p>
|
||||
<code>octet_iterator</code>: an output iterator.<br>
|
||||
<code>u32bit_iterator</code>: an input iterator.<br>
|
||||
<code>start</code>: an iterator pointing to the beginning of the UTF-32 encoded
|
||||
string to convert.<br>
|
||||
<code>end</code>: an iterator pointing to past-the-end of the UTF-32 encoded
|
||||
|
@ -747,6 +761,8 @@ u32bit_iterator utf8to32 (octet_iterator start, octet_iterator end, u32bit_itera
|
|||
|
||||
</pre>
|
||||
<p>
|
||||
<code>octet_iterator</code>: an input iterator.<br>
|
||||
<code>u32bit_iterator</code>: an output iterator.<br>
|
||||
<code>start</code>: an iterator pointing to the beginning of the UTF-8 encoded
|
||||
string to convert.<br>
|
||||
<code>end</code>: an iterator pointing to past-the-end of the UTF-8 encoded string
|
||||
|
@ -787,6 +803,7 @@ assert (utf32result.size() == <span class="literal">2</span>);
|
|||
octet_iterator find_invalid(octet_iterator start, octet_iterator end);
|
||||
</pre>
|
||||
<p>
|
||||
<code>octet_iterator</code>: an input iterator.<br>
|
||||
<code>start</code>: an iterator pointing to the beginning of the UTF-8 string to
|
||||
test for validity.<br>
|
||||
<code>end</code>: an iterator pointing to past-the-end of the UTF-8 string to test
|
||||
|
@ -827,6 +844,7 @@ assert (invalid == utf_invalid + <span class="literal">5</span>);
|
|||
|
||||
</pre>
|
||||
<p>
|
||||
<code>octet_iterator</code>: an input iterator.<br>
|
||||
<code>start</code>: an iterator pointing to the beginning of the UTF-8 string to
|
||||
test for validity.<br>
|
||||
<code>end</code>: an iterator pointing to past-the-end of the UTF-8 string to test
|
||||
|
@ -868,6 +886,8 @@ output_iterator replace_invalid(octet_iterator start, octet_iterator end, output
|
|||
|
||||
</pre>
|
||||
<p>
|
||||
<code>octet_iterator</code>: an input iterator.<br>
|
||||
<code>output_iterator</code>: an output iterator.<br>
|
||||
<code>start</code>: an iterator pointing to the beginning of the UTF-8 string to
|
||||
look for invalid UTF-8 sequences.<br>
|
||||
<code>end</code>: an iterator pointing to past-the-end of the UTF-8 string to look
|
||||
|
@ -904,12 +924,48 @@ assert (std::equal(replace_invalid_result.begin(), replace_invalid_result.end(),
|
|||
If <code>end</code> does not point to the past-of-end of a UTF-8 sequence, a
|
||||
<code>utf8::not_enough_room</code> exception is thrown.
|
||||
</p>
|
||||
<h4>
|
||||
utf8::starts_with_bom
|
||||
</h4>
|
||||
<p class="version">
|
||||
Available in version 2.3 and later. Replaces deprecated <code>is_bom()</code> function.
|
||||
</p>
|
||||
<p>
|
||||
Checks whether an octet sequence starts with a UTF-8 byte order mark (BOM).
|
||||
</p>
|
||||
<pre>
|
||||
<span class="keyword">template</span> <<span class=
|
||||
"keyword">typename</span> octet_iterator>
|
||||
<span class="keyword">bool</span> starts_with_bom (octet_iterator it, octet_iterator end);
|
||||
</pre>
|
||||
<p>
|
||||
<code>octet_iterator</code>: an input iterator.<br>
|
||||
<code>it</code>: beginning of the octet sequence to check<br>
|
||||
<code>end</code>: past-the-end of the sequence to check<br>
|
||||
<span class="return_value">Return value</span>: <code>true</code> if the sequence
|
||||
starts with a UTF-8 byte order mark; <code>false</code> if not.
|
||||
</p>
|
||||
<p>
|
||||
Example of use:
|
||||
</p>
|
||||
<pre>
|
||||
<span class="keyword">unsigned char</span> byte_order_mark[] = {<span class=
|
||||
"literal">0xef</span>, <span class="literal">0xbb</span>, <span class=
|
||||
"literal">0xbf</span>};
|
||||
<span class="keyword">bool</span> bbom = starts_with_bom(byte_order_mark, byte_order_mark + <span class="keyword">sizeof</span>(byte_order_mark));
|
||||
assert (bbom == <span class="literal">true</span>);
|
||||
</pre>
|
||||
<p>
|
||||
The typical use of this function is to check the first three bytes of a file. If
|
||||
they form the UTF-8 BOM, we want to skip them before processing the actual UTF-8
|
||||
encoded text.
|
||||
</p>
|
||||
<h4>
|
||||
utf8::is_bom
|
||||
</h4>
|
||||
<p class="version">
|
||||
Available in version 1.0 and later (the deprecated one argument version) and in version 2.3
|
||||
and later (the two argument version).
|
||||
Available in version 1.0 and later. Deprecated in version 2.3. <code>starts_with_bom()</code> should be used
|
||||
instead.
|
||||
</p>
|
||||
<p>
|
||||
Checks whether a sequence of three octets is a UTF-8 byte order mark (BOM)
|
||||
|
@ -917,14 +973,11 @@ assert (std::equal(replace_invalid_result.begin(), replace_invalid_result.end(),
|
|||
<pre>
|
||||
<span class="keyword">template</span> <<span class=
|
||||
"keyword">typename</span> octet_iterator>
|
||||
<span class="keyword">bool</span> is_bom (octet_iterator it, octet_iterator end);
|
||||
<span class="keyword">template</span> <<span class=
|
||||
"keyword">typename</span> octet_iterator>
|
||||
<span class="keyword">bool</span> is_bom (octet_iterator it); <span class="comment"> // Deprecated</span>
|
||||
</pre>
|
||||
<p>
|
||||
<code>octet_iterator</code>: an input iterator.<br>
|
||||
<code>it</code>: beginning of the 3-octet sequence to check<br>
|
||||
<code>end</code>: pass-end of the sequence to check<br>
|
||||
<span class="return_value">Return value</span>: <code>true</code> if the sequence
|
||||
is UTF-8 byte order mark; <code>false</code> if not.
|
||||
</p>
|
||||
|
@ -944,13 +997,112 @@ assert (bbom == <span class="literal">true</span>);
|
|||
encoded text.
|
||||
</p>
|
||||
<p>
|
||||
The older version of the function that takes only one argument is unsafe: if a sequence is
|
||||
shorter than three bytes, an invalid iterator will be dereferenced. Therefore it is deprecated
|
||||
in favor of the safer version that takes the end of sequence as an argument.
|
||||
If a sequence is
|
||||
shorter than three bytes, an invalid iterator will be dereferenced. Therefore, this function is deprecated
|
||||
in favor of <code>starts_with_bom()</code> that takes the end of the sequence as an argument.
|
||||
</p>
|
||||
<h3 id="typesutf8">
|
||||
Types From utf8 Namespace
|
||||
</h3>
|
||||
<h4>utf8::exception
|
||||
</h4>
|
||||
<p class="version">
|
||||
Available in version 2.3 and later.
|
||||
</p>
|
||||
<p>
|
||||
Base class for the exceptions thrown by UTF CPP library functions.
|
||||
</p>
|
||||
<pre>
|
||||
<span class="keyword">class</span> exception : <span class="keyword">public</span> std::exception {};
|
||||
</pre>
|
||||
<p>
|
||||
Example of use:
|
||||
</p>
|
||||
<pre>
|
||||
<span class="keyword">try</span> {
|
||||
code_that_uses_utf_cpp_library();
|
||||
}
|
||||
<span class="keyword">catch</span>(<span class="keyword">const</span> utf8::exception& utfcpp_ex) {
|
||||
cerr << utfcpp_ex.what();
|
||||
}
|
||||
</pre>
|
||||
|
||||
<h4>utf8::invalid_code_point
|
||||
</h4>
|
||||
<p class="version">
|
||||
Available in version 1.0 and later.
|
||||
</p>
|
||||
<p>
|
||||
Thrown by UTF8 CPP functions such as <code>advance</code> and <code>next</code> if a UTF-8 sequence represents an invalid code point.
|
||||
</p>
|
||||
|
||||
<pre>
|
||||
<span class="keyword">class</span> invalid_code_point : <span class="keyword">public</span> exception {
|
||||
<span class="keyword">public</span>:
|
||||
uint32_t code_point() <span class="keyword">const</span>;
|
||||
};
|
||||
|
||||
</pre>
|
||||
<p>
|
||||
Member function <code>code_point()</code> can be used to determine the invalid code point that
|
||||
caused the exception to be thrown.
|
||||
</p>
|
||||
<h4>utf8::invalid_utf8
|
||||
</h4>
|
||||
<p class="version">
|
||||
Available in version 1.0 and later.
|
||||
</p>
|
||||
<p>
|
||||
Thrown by UTF8 CPP functions such as <code>next</code> and <code>prior</code> if an invalid UTF-8 sequence
|
||||
is detected during decoding.
|
||||
</p>
|
||||
|
||||
<pre>
|
||||
<span class="keyword">class</span> invalid_utf8 : <span class="keyword">public</span> exception {
|
||||
<span class="keyword">public</span>:
|
||||
uint8_t utf8_octet() <span class="keyword">const</span>;
|
||||
};
|
||||
</pre>
|
||||
|
||||
<p>
|
||||
Member function <code>utf8_octet()</code> can be used to determine the beginning of the byte
|
||||
sequence that caused the exception to be thrown.
|
||||
</p>
|
||||
|
||||
<h4>utf8::invalid_utf16
|
||||
</h4>
|
||||
<p class="version">
|
||||
Available in version 1.0 and later.
|
||||
</p>
|
||||
<p>
|
||||
Thrown by UTF8 CPP function <code>utf16to8</code> if an invalid UTF-16 sequence
|
||||
is detected during decoding.
|
||||
</p>
|
||||
|
||||
<pre>
|
||||
<span class="keyword">class</span> invalid_utf16 : <span class="keyword">public</span> exception {
|
||||
<span class="keyword">public</span>:
|
||||
uint16_t utf16_word() <span class="keyword">const</span>;
|
||||
};
|
||||
</pre>
|
||||
|
||||
<p>
|
||||
Member function <code>utf16_word()</code> can be used to determine the UTF-16 code unit
|
||||
that caused the exception to be thrown.
|
||||
</p>
|
||||
<h4>utf8::not_enough_room
|
||||
</h4>
|
||||
<p class="version">
|
||||
Available in version 1.0 and later.
|
||||
</p>
|
||||
<p>
|
||||
Thrown by UTF8 CPP functions such as <code>next</code> if the end of the decoded UTF-8 sequence
|
||||
was reached before the code point was decoded.
|
||||
</p>
|
||||
|
||||
<pre>
|
||||
<span class="keyword">class</span> not_enough_room : <span class="keyword">public</span> exception {};
|
||||
</pre>
|
||||
<h4>
|
||||
utf8::iterator
|
||||
</h4>
|
||||
|
|
|
@ -332,7 +332,7 @@ namespace internal
|
|||
}
|
||||
|
||||
template <typename octet_iterator>
|
||||
inline bool is_bom (octet_iterator it, octet_iterator end)
|
||||
inline bool starts_with_bom (octet_iterator it, octet_iterator end)
|
||||
{
|
||||
return (
|
||||
((it != end) && (internal::mask8(*it++)) == bom[0]) &&
|
||||
|
|
|
@ -139,12 +139,14 @@ int main()
|
|||
bvalid = is_valid(utf8_with_surrogates, utf8_with_surrogates + 9);
|
||||
assert (bvalid == true);
|
||||
|
||||
//is_bom
|
||||
//starts_with_bom
|
||||
unsigned char byte_order_mark[] = {0xef, 0xbb, 0xbf};
|
||||
bool bbom = is_bom(byte_order_mark, byte_order_mark + sizeof(byte_order_mark));
|
||||
bool bbom = starts_with_bom(byte_order_mark, byte_order_mark + sizeof(byte_order_mark));
|
||||
assert (bbom == true);
|
||||
bool no_bbom = is_bom(threechars, threechars + sizeof(threechars));
|
||||
bool no_bbom = starts_with_bom(threechars, threechars + sizeof(threechars));
|
||||
assert (no_bbom == false);
|
||||
|
||||
//is_bom
|
||||
bool unsafe_bbom = is_bom(byte_order_mark);
|
||||
assert (unsafe_bbom == true);
|
||||
|
||||
|
|
Loading…
Reference in a new issue