Added a sample app, updated documentation, added clean target to the Makefile, added options Wall and pedantic to test drivers

git-svn-id: http://svn.code.sf.net/p/utfcpp/code@57 a809a056-fc17-0410-9590-b4f493f8b08e
2006-09-09 20:39:45 +00:00 · 2006-09-09 20:39:45 +00:00 · 501d9a21c4
commit 501d9a21c4
parent dab1a9da5a
6 changed files with 126 additions and 49 deletions
--- a/doc/utf8cpp.html
+++ b/doc/utf8cpp.html
@ -3,7 +3,9 @@
 <head>
 <meta name="generator" content=
 "HTML Tidy for Linux/x86 (vers 12 April 2005), see www.w3.org">
-<title></title>
+<meta name="description" content="A simple, portable and lightweight C++ library for easy handling of UTF-8 encoded strings">
 <meta name="keywords" content="UTF-8 C++ portable utf8 unicode generic templates">
 <title>UTF8-CPP: UTF-8 with C++ in a Portable Way</title>
 </head>
 <body>
 <p><a href="https://sourceforge.net/projects/utfcpp">The Sourceforge project page</a></p>
@ -42,9 +44,9 @@ want to handle UTF-8 encoded strings from C++, I am sure you have
 good reasons for it.</p>
 <h2 id="examples">Examples of use</h2>
 <p>To illustrate the use of this utf8 library, we shall open a file
-containing a line of UTF-8 encoded text, read the line into
+containing UTF-8 encoded text, check whether it starts with a byte order mark,
-<code>std::string</code>, convert the text to UTF-16, and write it
+read each line into a <code>std::string</code>, check it for validity, convert the text to UTF-16,
-to another file:</p>
+and back to UTF-8:</p>
 <pre>
 #include &lt;fstream&gt;
 #include &lt;iostream&gt;
@ -54,60 +56,69 @@ using namespace std;
 int main(int argc, char** argv)
 {
-    // Open the file with a utf-8 encoded line of text in it
+    if (argc != 2) {
-    ifstream fs8("utf8.txt");
+        cout &lt;&lt; "\nUsage: docsample filename\n";
        return 0;
    }
    const char* test_file_path = argv[1];
    // Open the test file (must be UTF-8 encoded)
    ifstream fs8(test_file_path);
    if (!fs8.is_open()) {
-        cout &lt;&lt; "Could not open utf8.txt" &lt;&lt; endl;
+    cout &lt;&lt; "Could not open " &lt;&lt; test_file_path &lt;&lt; endl;
    return 0;
    }
    // Read the first line of the file
    unsigned line_count = 1;
    string line;
    if (!getline(fs8, line)) 
        return 0;
    // Look for utf-8 byte-order mark at the beginning
    if (line.size() &gt; 2) {
        if (utf8::is_bom(line.c_str()))
          cout &lt;&lt; "There is a byte order mark at the beginning of the file\n";
    }
-    // is there a utf8 marker? if yes, skip it.
+    // Play with all the lines in the file
-    fs8.seekg(0, ios::end);
+    do {
-    ifstream::pos_type file_length = fs8.tellg();
+        // check for invalid utf-8 (for a simple yes/no check, there is also utf8::is_valid function)
-    fs8.seekg(0, ios::beg);
+        string::iterator end_it = utf8::find_invalid(line.begin(), line.end());
-    if (file_length &gt; 3) {
+        if (end_it != line.end()) {
-        char bom[3];
+            cout &lt;&lt; "Invalid UTF-8 encoding detected at line " &lt;&lt; line_count &lt;&lt; "\n";
-        fs8.read(bom, 3);
+            cout &lt;&lt; "This part is fine: " &lt;&lt; string(line.begin(), end_it) &lt;&lt; "\n";
-        if (!utf8::is_bom(bom))
+        }
-            fs8.seekg(0, ios::beg);        
+        // Get the line length (at least for the valid part)
-    }
+        int length = utf8::distance(line.begin(), end_it);
        cout &lt;&lt; "Length of line " &lt;&lt; line_count &lt;&lt; " is " &lt;&lt; length &lt;&lt;  "\n";
-    // Read the line from the file
+        // Convert it to utf-16
-    string text8;
+        vector&lt;unsigned short&gt; utf16line;
-    getline(fs8, text8);
+        utf8::utf8to16(line.begin(), end_it, back_inserter(utf16line));
        // And back to utf-8;
        string utf8line; 
        utf8::utf16to8(utf16line.begin(), utf16line.end(), back_inserter(utf8line));
        // Confirm that the conversion went OK:
        if (utf8line != string(line.begin(), end_it))
            cout &lt;&lt; "Error in UTF-16 conversion at line: " &lt;&lt; line_count &lt;&lt; "\n";        
-    // Make sure it is valid utf-8
+        getline(fs8, line);
-    if (!utf8::is_valid(text8.begin(), text8.end())) {
+        line_count++;
-        cout &lt;&lt; "Invalid utf-8 text";
+    } while (!fs8.eof());
        return 0;
    }
-    // Convert the text to utf-16
+    return 0;
    vector&lt;unsigned short&gt; text16;
    text16.push_back(0xfeff); // bom
    utf8::utf8to16(text8.begin(), text8.end(), back_inserter(text16));
    // Create  the file for writing the utf-16 string
    ofstream fs16("utf16.txt", ios_base::out | ios_base::binary);
    if (!fs16.is_open()) {
        cout &lt;&lt; "Could not open utf16.txt" &lt;&lt; endl;
        return 0;
    }
    // Write the utf16 text to the file
    fs16.write(reinterpret_cast&lt;const char*&gt;(&amp;text16[0]), text16.size() * sizeof (unsigned short));
 }
 </pre>
-<p>In the previous code sample, we have seen the use of 3 functions
+<p>In the previous code sample, we have seen the use of the following functions
 from <code>utf8</code> namespace: first we used <code>is_bom</code>
 function to detect UTF-8 byte order mark at the beginning of the
-file, then <code>is_valid</code> to make sure that the text we
+file; then for each line we performed a detection of invalid UTF-8 sequences with <code>find_invalid</code>;
-loaded is valid UTF-8, and finally <code>utf8to16</code> to convert
+the number of characters (more precisely - the number of Unicode code points) in each line was determined
-the text to UTF-16 encoding. Note that the use of
+with the use of <code>utf8::distance</code>; finally, we have converted each line to UTF-16 encoding with
-<code>is_valid</code> was optional in this case;
+<code>utf8to16</code> and back to UTF-8 with <code>utf16to8</code>.
-<code>utf8to16</code> throws an exception in case of invalid UTF-8
+</p>
 text.</p>
 <h2 id ="reference">Reference</h2>
 <h3>Functions From utf8 Namespace</h3>
 <h4>utf8::append</h4>
 <p>Encodes a 32 bit code point as a UTF-8 sequence of octets and
@ -707,6 +718,7 @@ Consortium</a>.</li>
 <li><a href="http://icu.sourceforge.net/">ICU Library</a>.</li>
 <li><a href="http://en.wikipedia.org/wiki/UTF-8">UTF-8 at
 Wikipedia</a></li>
 <li><a href="http://www.cl.cam.ac.uk/~mgk25/unicode.html">UTF-8 and Unicode FAQ for Unix/Linux</a></li>
 </ol>
 </body>
 </html>
--- a/samples/docsample.cpp
+++ b/samples/docsample.cpp
@ -0,0 +1,63 @@
 #include "../source/utf8.h"
 #include <iostream>
 #include <fstream>
 #include <string>
 #include <vector>
 using namespace std;
 // Sample driver for the utf8 library.
 //
 // Usage: docsample filename
 //
 // Reads the given file line by line and, for every line:
 //   - locates the first invalid UTF-8 sequence (if any) with utf8::find_invalid,
 //   - prints the number of code points in the valid prefix (utf8::distance),
 //   - converts the valid prefix to UTF-16 and back to UTF-8 and reports a
 //     mismatch between the round-tripped text and the original prefix.
 // Before the loop, the first line is checked for a UTF-8 byte order mark.
 //
 // Returns 0 in all cases, including usage errors and an unreadable file.
 int main(int argc, char** argv)
 {
    if (argc != 2) {
        cout << "\nUsage: docsample filename\n";
        return 0;
    }
    const char* test_file_path = argv[1];
    // Open the test file (must be UTF-8 encoded)
    ifstream fs8(test_file_path);
    if (!fs8.is_open()) {
        cout << "Could not open " << test_file_path << endl;
        return 0;
    }
    // Read the first line of the file; an empty or unreadable file ends here
    unsigned line_count = 1;
    string line;
    if (!getline(fs8, line))
        return 0;
    // Look for utf-8 byte-order mark at the beginning
    // (the size guard keeps is_bom from looking at fewer than three octets)
    if (line.size() > 2) {
        if (utf8::is_bom(line.c_str()))
            cout << "There is a byte order mark at the beginning of the file\n";
    }
    // Play with all the lines in the file
    do {
        // check for invalid utf-8 (for a simple yes/no check, there is also utf8::is_valid function)
        string::iterator end_it = utf8::find_invalid(line.begin(), line.end());
        if (end_it != line.end()) {
            cout << "Invalid UTF-8 encoding detected at line " << line_count << "\n";
            cout << "This part is fine: " << string(line.begin(), end_it) << "\n";
        }
        // Get the line length (at least for the valid part)
        int length = utf8::distance(line.begin(), end_it);
        cout << "Length of line " << line_count << " is " << length <<  "\n";
        // Convert it to utf-16 (only the valid prefix is passed on)
        vector<unsigned short> utf16line;
        utf8::utf8to16(line.begin(), end_it, back_inserter(utf16line));
        // And back to utf-8;
        string utf8line;
        utf8::utf16to8(utf16line.begin(), utf16line.end(), back_inserter(utf8line));
        // Confirm that the conversion went OK:
        if (utf8line != string(line.begin(), end_it))
            cout << "Error in UTF-16 conversion at line: " << line_count << "\n";
        ++line_count;
        // Use getline's own result as the loop condition: testing eof() after
        // the read would skip a final line that has no trailing newline
        // (getline extracts it and sets eofbit in the same call), and would
        // never terminate if the stream failed without reaching end of file.
    } while (getline(fs8, line));
    return 0;
 }
--- a/test_drivers/Makefile
+++ b/test_drivers/Makefile
@ -15,3 +15,5 @@ negativetest:
 utf8readertest:
 	cd utf8reader &&  $(MAKE) $@
 # Remove the test driver binaries. -f keeps the target from failing when
 # some of them have not been built (or were already removed).
 .PHONY: clean
 clean:
 	rm -f smoke_test/smoketest regression_tests/regressiontest negative/negative utf8reader/utf8reader
--- a/test_drivers/negative/Makefile
+++ b/test_drivers/negative/Makefile
@ -1,5 +1,5 @@
 CC = g++
-CFLAGS = -g
+CFLAGS = -g -Wall -pedantic
 negativetest: negative.cpp ../../source/utf8.h
 	$(CC) $(CFLAGS) negative.cpp -onegative
--- a/test_drivers/regression_tests/Makefile
+++ b/test_drivers/regression_tests/Makefile
@ -1,5 +1,5 @@
 CC = g++
-CFLAGS = -g
+CFLAGS = -g -Wall -pedantic
 REG_FILES = r1_0Beta1/*h r1_0Beta2/*.h
 regressiontest: reg_tests_driver.cpp ../../source/utf8.h $(REG_FILES)
--- a/test_drivers/utf8reader/Makefile
+++ b/test_drivers/utf8reader/Makefile
@ -1,5 +1,5 @@
 CC = g++
-CFLAGS = -g
+CFLAGS = -g -Wall -pedantic
 utf8readertest: utf8reader.cpp ../../source/utf8.h
 	$(CC) $(CFLAGS) utf8reader.cpp -o utf8reader