Updated documentation to include additional samples. Fixed a typo in core.h
git-svn-id: http://svn.code.sf.net/p/utfcpp/code@92 a809a056-fc17-0410-9590-b4f493f8b08e
This commit is contained in:
parent
33dc220fae
commit
4a3b41e9f1
2 changed files with 41 additions and 6 deletions
|
@ -61,6 +61,12 @@
|
|||
<li>
|
||||
<a href="#introsample">Introductory Sample</a>
|
||||
</li>
|
||||
<li>
|
||||
<a href="#validfile">Checking if a file contains valid UTF-8 text</a>
|
||||
</li>
|
||||
<li>
|
||||
<a href="#fixinvalid">Ensure that a string contains valid UTF-8 text</a>
|
||||
</li>
|
||||
</li>
|
||||
<li>
|
||||
<a href="#reference">Reference</a>
|
||||
|
@ -140,6 +146,7 @@
|
|||
cout << <span class="literal">"\nUsage: docsample filename\n"</span>;
|
||||
<span class="keyword">return</span> <span class="literal">0</span>;
|
||||
}
|
||||
|
||||
<span class="keyword">const char</span>* test_file_path = argv[1];
|
||||
<span class="comment">// Open the test file (contains UTF-8 encoded text)</span>
|
||||
ifstream fs8(test_file_path);
|
||||
|
@ -148,6 +155,7 @@
|
|||
"literal">"Could not open "</span> << test_file_path << endl;
|
||||
<span class="keyword">return</span> <span class="literal">0</span>;
|
||||
}
|
||||
|
||||
<span class="keyword">unsigned</span> line_count = <span class="literal">1</span>;
|
||||
string line;
|
||||
<span class="comment">// Play with all the lines in the file</span>
|
||||
|
@ -162,37 +170,64 @@
|
|||
"literal">"This part is fine: "</span> << string(line.begin(), end_it) << <span
|
||||
class="literal">"\n"</span>;
|
||||
}
|
||||
|
||||
<span class="comment">// Get the line length (at least for the valid part)</span>
|
||||
<span class="keyword">int</span> length = utf8::distance(line.begin(), end_it);
|
||||
cout << <span class=
|
||||
"literal">"Length of line "</span> << line_count << <span class=
|
||||
"literal">" is "</span> << length << <span class="literal">"\n"</span>;
|
||||
|
||||
<span class="comment">// Convert it to utf-16</span>
|
||||
vector<unsigned short> utf16line;
|
||||
utf8::utf8to16(line.begin(), end_it, back_inserter(utf16line));
|
||||
|
||||
<span class="comment">// And back to utf-8</span>
|
||||
string utf8line;
|
||||
utf8::utf16to8(utf16line.begin(), utf16line.end(), back_inserter(utf8line));
|
||||
|
||||
<span class="comment">// Confirm that the conversion went OK:</span>
|
||||
<span class="keyword">if</span> (utf8line != string(line.begin(), end_it))
|
||||
cout << <span class=
|
||||
"literal">"Error in UTF-16 conversion at line: "</span> << line_count << <span
|
||||
class="literal">"\n"</span>;
|
||||
|
||||
line_count++;
|
||||
}
|
||||
<span class="keyword">return</span> <span class="literal">0</span>;
|
||||
|
||||
}
|
||||
</pre>
|
||||
<p>
|
||||
In the previous code sample, we have seen the use of the following functions from
|
||||
<code>utf8</code> namespace: first we used <code>is_bom</code> function to detect
|
||||
UTF-8 byte order mark at the beginning of the file; then for each line we performed
|
||||
In the previous code sample, for each line we performed
|
||||
a detection of invalid UTF-8 sequences with <code>find_invalid</code>; the number
|
||||
of characters (more precisely - the number of Unicode code points) in each line was
|
||||
of characters (more precisely - the number of Unicode code points, including the end
|
||||
of line and even BOM if there is one) in each line was
|
||||
determined with the use of <code>utf8::distance</code>; finally, we have converted
|
||||
each line to UTF-16 encoding with <code>utf8to16</code> and back to UTF-8 with
|
||||
<code>utf16to8</code>.
|
||||
</p>
|
||||
<h3 id="validfile">Checking if a file contains valid UTF-8 text</h3>
|
||||
<pre>
|
||||
<span class="keyword">bool</span> valid_utf8_file(<span class="keyword">const char</span>* file_name)
|
||||
{
|
||||
ifstream ifs(file_name);
|
||||
<span class="keyword">if</span> (!ifs)
|
||||
<span class="keyword">return false</span>; <span class="comment">// even better, throw here</span>
|
||||
|
||||
istreambuf_iterator<<span class="keyword">char</span>> it(ifs.rdbuf());
|
||||
istreambuf_iterator<<span class="keyword">char</span>> eos;
|
||||
|
||||
<span class="keyword">return</span> utf8::is_valid(it, eos);
|
||||
}
|
||||
</pre>
|
||||
<h3 id="fixinvalid">Ensure that a string contains valid UTF-8 text</h3>
|
||||
<pre>
|
||||
<span class="keyword">void</span> fix_utf8_string(std::string& str)
|
||||
{
|
||||
std::string temp;
|
||||
utf8::replace_invalid(str.begin(), str.end(), back_inserter(temp));
|
||||
str = temp;
|
||||
}
|
||||
</pre>
|
||||
<h2 id="reference">
|
||||
Reference
|
||||
</h2>
|
||||
|
|
|
@ -267,7 +267,7 @@ namespace internal
|
|||
}
|
||||
|
||||
if (err == UTF8_OK) {
|
||||
// Decoding suceeded. Now, security checks...
|
||||
// Decoding succeeded. Now, security checks...
|
||||
if (is_code_point_valid(cp)) {
|
||||
if (!is_overlong_sequence(cp, length)){
|
||||
// Passed! Return here.
|
||||
|
|
Loading…
Reference in a new issue