From 4a3b41e9f1a2b7b46fdd7a6c5f61cce706ee291d Mon Sep 17 00:00:00 2001 From: ntrifunovic Date: Sun, 5 Jul 2009 21:14:40 +0000 Subject: [PATCH] Updated documentation to include additional samples. Fixed a typo in core.h git-svn-id: http://svn.code.sf.net/p/utfcpp/code@92 a809a056-fc17-0410-9590-b4f493f8b08e --- doc/utf8cpp.html | 45 ++++++++++++++++++++++++++++++++++++++++----- source/utf8/core.h | 2 +- 2 files changed, 41 insertions(+), 6 deletions(-) diff --git a/doc/utf8cpp.html b/doc/utf8cpp.html index ed6de70..c528541 100644 --- a/doc/utf8cpp.html +++ b/doc/utf8cpp.html @@ -61,6 +61,12 @@
  • Introductionary Sample
  • +
  • + Checking if a file contains valid UTF-8 text +
  • +
  • + Ensure that a string contains valid UTF-8 text +
  • Reference @@ -140,6 +146,7 @@ cout << "\nUsage: docsample filename\n"; return 0; } + const char* test_file_path = argv[1]; // Open the test file (contains UTF-8 encoded text) ifstream fs8(test_file_path); @@ -148,6 +155,7 @@ "literal">"Could not open " << test_file_path << endl; return 0; } + unsigned line_count = 1; string line; // Play with all the lines in the file @@ -162,37 +170,64 @@ "literal">"This part is fine: " << string(line.begin(), end_it) << "\n"; } + // Get the line length (at least for the valid part) int length = utf8::distance(line.begin(), end_it); cout << "Length of line " << line_count << " is " << length << "\n"; + // Convert it to utf-16 vector<unsigned short> utf16line; utf8::utf8to16(line.begin(), end_it, back_inserter(utf16line)); + // And back to utf-8 string utf8line; utf8::utf16to8(utf16line.begin(), utf16line.end(), back_inserter(utf8line)); + // Confirm that the conversion went OK: if (utf8line != string(line.begin(), end_it)) cout << "Error in UTF-16 conversion at line: " << line_count << "\n"; + line_count++; } return 0; - +}

    - In the previous code sample, we have seen the use of the following functions from - utf8 namespace: first we used is_bom function to detect - UTF-8 byte order mark at the beginning of the file; then for each line we performed + In the previous code sample, for each line we performed a detection of invalid UTF-8 sequences with find_invalid; the number - of characters (more precisely - the number of Unicode code points) in each line was + of characters (more precisely - the number of Unicode code points, including the end + of line and even BOM if there is one) in each line was determined with a use of utf8::distance; finally, we have converted each line to UTF-16 encoding with utf8to16 and back to UTF-8 with utf16to8.

    +

    Checking if a file contains valid UTF-8 text

    +
        
    +bool valid_utf8_file(const char* file_name)
    +{
    +    ifstream ifs(file_name);
    +    if (!ifs)
    +        return false; // even better, throw here
    +
    +    istreambuf_iterator<char> it(ifs.rdbuf());
    +    istreambuf_iterator<char> eos;
    +
    +    return utf8::is_valid(it, eos);
    +}
    +
    +

    Ensure that a string contains valid UTF-8 text

    +
    +void fix_utf8_string(std::string& str)
    +{
    +    std::string temp;
    +    utf8::replace_invalid(str.begin(), str.end(), back_inserter(temp));
    +    str = temp;
    +}
    +

    Reference

    diff --git a/source/utf8/core.h b/source/utf8/core.h index 3428fc1..d72f743 100644 --- a/source/utf8/core.h +++ b/source/utf8/core.h @@ -267,7 +267,7 @@ namespace internal } if (err == UTF8_OK) { - // Decoding suceeded. Now, security checks... + // Decoding succeeded. Now, security checks... if (is_code_point_valid(cp)) { if (!is_overlong_sequence(cp, length)){ // Passed! Return here.