Added a sample app, updated documentation, added clean target to the Makefile, added options Wall and pedantic to test drivers

git-svn-id: http://svn.code.sf.net/p/utfcpp/code@57 a809a056-fc17-0410-9590-b4f493f8b08e
2006-09-09 20:39:45 +00:00 · 2006-09-09 20:39:45 +00:00 · 501d9a21c4
commit 501d9a21c4
parent dab1a9da5a
6 changed files with 126 additions and 49 deletions
--- a/doc/utf8cpp.html
+++ b/doc/utf8cpp.html
@ -3,7 +3,9 @@
 <head>
 <meta name="generator" content=
 "HTML Tidy for Linux/x86 (vers 12 April 2005), see www.w3.org">
-<title></title>
+<meta name="description" content="A simple, portable and lightweight C++ library for easy handling of UTF-8 encoded strings">
+<meta name="keywords" content="UTF-8 C++ portable utf8 unicode generic templates">
+<title>UTF8-CPP: UTF-8 with C++ in a Portable Way</title>
 </head>
 <body>
 <p><a href="https://sourceforge.net/projects/utfcpp">The Sourceforge project page</a></p>
@ -42,9 +44,9 @@ want to handle UTF-8 encoded strings from C++, I am sure you have
 good reasons for it.</p>
 <h2 id="examples">Examples of use</h2>
 <p>To illustrate the use of this utf8 library, we shall open a file
-containing a line of UTF-8 encoded text, read the line into
-<code>std::string</code>, convert the text to UTF-16, and write it
-to another file:</p>
+containing UTF-8 encoded text, check whether it starts with a byte order mark,
+read each line into a <code>std::string</code>, check it for validity, convert the text to UTF-16,
+and back to UTF-8:</p>
 <pre>
 #include &lt;fstream&gt;
 #include &lt;iostream&gt;
@ -54,60 +56,69 @@ using namespace std;

 int main()
 {
-    // Open the file with a utf-8 encoded line of text in it
-    ifstream fs8("utf8.txt");
+    if (argc != 2) {
+        cout &lt;&lt; "\nUsage: docsample filename\n";
+        return 0;
+    }
+    const char* test_file_path = argv[1];
+    // Open the test file (must be UTF-8 encoded)
+    ifstream fs8(test_file_path);
    if (!fs8.is_open()) {
-        cout &lt;&lt; "Could not open utf8.txt" &lt;&lt; endl;
+    cout &lt;&lt; "Could not open " &lt;&lt; test_file_path &lt;&lt; endl;
+    return 0;
+    }
+
+    // Read the first line of the file
+    unsigned line_count = 1;
+    string line;
+    if (!getline(fs8, line)) 
        return 0;
+
+    // Look for utf-8 byte-order mark at the beginning
+    if (line.size() &gt; 2) {
+        if (utf8::is_bom(line.c_str()))
+          cout &lt;&lt; "There is a byte order mark at the beginning of the file\n";
    }

-    // is there a utf8 marker? if yes, skip it.
-    fs8.seekg(0, ios::end);
-    ifstream::pos_type file_length = fs8.tellg();
-    fs8.seekg(0, ios::beg);
-    if (file_length &gt; 3) {
-        char bom[3];
-        fs8.read(bom, 3);
-        if (!utf8::is_bom(bom))
-            fs8.seekg(0, ios::beg);        
-    }
+    // Play with all the lines in the file
+    do {
+        // check for invalid utf-8 (for a simple yes/no check, there is also utf8::is_valid function)
+        string::iterator end_it = utf8::find_invalid(line.begin(), line.end());
+        if (end_it != line.end()) {
+            cout &lt;&lt; "Invalid UTF-8 encoding detected at line " &lt;&lt; line_count &lt;&lt; "\n";
+            cout &lt;&lt; "This part is fine: " &lt;&lt; string(line.begin(), end_it) &lt;&lt; "\n";
+        }
+        // Get the line length (at least for the valid part)
+        int length = utf8::distance(line.begin(), end_it);
+        cout &lt;&lt; "Length of line " &lt;&lt; line_count &lt;&lt; " is " &lt;&lt; length &lt;&lt;  "\n";

-    // Read the line from the file
-    string text8;
-    getline(fs8, text8);
+        // Convert it to utf-16
+        vector&lt;unsigned short&gt; utf16line;
+        utf8::utf8to16(line.begin(), end_it, back_inserter(utf16line));
+        // And back to utf-8;
+        string utf8line; 
+        utf8::utf16to8(utf16line.begin(), utf16line.end(), back_inserter(utf8line));
+        // Confirm that the conversion went OK:
+        if (utf8line != string(line.begin(), end_it))
+            cout &lt;&lt; "Error in UTF-16 conversion at line: " &lt;&lt; line_count &lt;&lt; "\n";        

-    // Make sure it is valid utf-8
-    if (!utf8::is_valid(text8.begin(), text8.end())) {
-        cout &lt;&lt; "Invalid utf-8 text";
-        return 0;
-    }
+        // Read the next line; testing getline itself also handles a last line with no trailing newline
+        line_count++;
+    } while (getline(fs8, line));

-    // Convert the text to utf-16
-    vector&lt;unsigned short&gt; text16;
-    text16.push_back(0xfeff); // bom
-    utf8::utf8to16(text8.begin(), text8.end(), back_inserter(text16));
-
-    // Create  the file for writing the utf-16 string
-    ofstream fs16("utf16.txt", ios_base::out | ios_base::binary);
-    if (!fs16.is_open()) {
-        cout &lt;&lt; "Could not open utf16.txt" &lt;&lt; endl;
-        return 0;
-    }
-    
-    // Write the utf16 text to the file
-    fs16.write(reinterpret_cast&lt;const char*&gt;(&amp;text16[0]), text16.size() * sizeof (unsigned short));
+    return 0;
 }
 </pre>
-<p>In the previous code sample, we have seen the use of 3 functions
+<p>In the previous code sample, we have seen the use of the following functions
 from <code>utf8</code> namespace: first we used <code>is_bom</code>
 function to detect UTF-8 byte order mark at the beginning of the
-file, then <code>is_valid</code> to make sure that the text we
-loaded is valid UTF-8, and finally <code>utf8to16</code> to convert
-the text to UTF-16 encoding. Note that the use of
-<code>is_valid</code> was optional in this case;
-<code>utf8to16</code> throws an exception in case of invalid UTF-8
-text.</p>
+file; then, for each line, we detected invalid UTF-8 sequences with <code>find_invalid</code>;
+the number of characters (more precisely, the number of Unicode code points) in each line was determined
+using <code>utf8::distance</code>; finally, we converted each line to UTF-16 encoding with
+<code>utf8to16</code> and back to UTF-8 with <code>utf16to8</code>.
+</p>
 <h2 id ="reference">Reference</h2>
+
 <h3>Functions From utf8 Namespace</h3>
 <h4>utf8::append</h4>
 <p>Encodes a 32 bit code point as a UTF-8 sequence of octets and
@ -707,6 +718,7 @@ Consortium</a>.</li>
 <li><a href="http://icu.sourceforge.net/">ICU Library</a>.</li>
 <li><a href="http://en.wikipedia.org/wiki/UTF-8">UTF-8 at
 Wikipedia</a></li>
+<li><a href="http://www.cl.cam.ac.uk/~mgk25/unicode.html">UTF-8 and Unicode FAQ for Unix/Linux</a></li>
 </ol>
 </body>
 </html>
--- a/samples/docsample.cpp
+++ b/samples/docsample.cpp
@ -0,0 +1,63 @@
+#include "../source/utf8.h"
+#include <iostream>
+#include <fstream>
+#include <string>
+#include <vector>
+
+using namespace std;
+
+// Documentation sample: reports a leading BOM, then validates, measures and UTF-16 round-trips each line of a file.
+int main(int argc, char** argv)
+{
+    if (argc != 2) {
+        cout << "\nUsage: docsample filename\n";
+        return 0;
+    }
+    const char* test_file_path = argv[1];
+    // Open the test file (must be UTF-8 encoded)
+    ifstream fs8(test_file_path);
+    if (!fs8.is_open()) {
+        cout << "Could not open " << test_file_path << endl;
+        return 0;
+    }
+
+    // Read the first line of the file; nothing to do if the file is empty
+    unsigned line_count = 1;
+    string line;
+    if (!getline(fs8, line)) 
+        return 0;
+
+    // Look for utf-8 byte-order mark at the beginning (it is three bytes long, hence the size check)
+    if (line.size() > 2) {
+        if (utf8::is_bom(line.c_str()))
+            cout << "There is a byte order mark at the beginning of the file\n";
+    }
+
+    // Play with all the lines in the file
+    do {
+        // check for invalid utf-8 (for a simple yes/no check, there is also utf8::is_valid function)
+        string::iterator end_it = utf8::find_invalid(line.begin(), line.end());
+        if (end_it != line.end()) {
+            cout << "Invalid UTF-8 encoding detected at line " << line_count << "\n";
+            cout << "This part is fine: " << string(line.begin(), end_it) << "\n";
+        }
+        // Get the line length (at least for the valid part)
+        int length = utf8::distance(line.begin(), end_it);
+        cout << "Length of line " << line_count << " is " << length <<  "\n";
+
+        // Convert it to utf-16
+        vector<unsigned short> utf16line;
+        utf8::utf8to16(line.begin(), end_it, back_inserter(utf16line));
+        // And back to utf-8;
+        string utf8line; 
+        utf8::utf16to8(utf16line.begin(), utf16line.end(), back_inserter(utf8line));
+        // Confirm that the conversion went OK:
+        if (utf8line != string(line.begin(), end_it))
+            cout << "Error in UTF-16 conversion at line: " << line_count << "\n";        
+
+        // Loop on getline's result, not eof(): eof() is already set after reading a last line with no newline
+        line_count++;
+    } while (getline(fs8, line));
+
+    return 0;
+}
--- a/test_drivers/Makefile
+++ b/test_drivers/Makefile
@ -15,3 +15,5 @@ negativetest:
 utf8readertest:
 	cd utf8reader &&  $(MAKE) $@

+clean: # remove the built test drivers; -f so that already-missing binaries are not an error
+	rm -f smoke_test/smoketest regression_tests/regressiontest negative/negative utf8reader/utf8reader
--- a/test_drivers/negative/Makefile
+++ b/test_drivers/negative/Makefile
@ -1,5 +1,5 @@
 CC = g++
-CFLAGS = -g
+CFLAGS = -g -Wall -pedantic

 negativetest: negative.cpp ../../source/utf8.h
 	$(CC) $(CFLAGS) negative.cpp -onegative
--- a/test_drivers/regression_tests/Makefile
+++ b/test_drivers/regression_tests/Makefile
@ -1,5 +1,5 @@
 CC = g++
-CFLAGS = -g
+CFLAGS = -g -Wall -pedantic
 REG_FILES = r1_0Beta1/*h r1_0Beta2/*.h

 regressiontest: reg_tests_driver.cpp ../../source/utf8.h $(REG_FILES)
--- a/test_drivers/utf8reader/Makefile
+++ b/test_drivers/utf8reader/Makefile
@ -1,5 +1,5 @@
 CC = g++
-CFLAGS = -g
+CFLAGS = -g -Wall -pedantic

 utf8readertest: utf8reader.cpp ../../source/utf8.h
 	$(CC) $(CFLAGS) utf8reader.cpp -o utf8reader