From 501d9a21c42b1991411c690ae63a0d84f7b9621f Mon Sep 17 00:00:00 2001 From: ntrifunovic Date: Sat, 9 Sep 2006 20:39:45 +0000 Subject: [PATCH] Added a sample app, updated documentation, added clean target to the Makefile, added options Wall and pedantic to test drivers git-svn-id: http://svn.code.sf.net/p/utfcpp/code@57 a809a056-fc17-0410-9590-b4f493f8b08e --- doc/utf8cpp.html | 104 ++++++++++++++----------- samples/docsample.cpp | 63 +++++++++++++++ test_drivers/Makefile | 2 + test_drivers/negative/Makefile | 2 +- test_drivers/regression_tests/Makefile | 2 +- test_drivers/utf8reader/Makefile | 2 +- 6 files changed, 126 insertions(+), 49 deletions(-) create mode 100644 samples/docsample.cpp diff --git a/doc/utf8cpp.html b/doc/utf8cpp.html index e07dc73..d97edf4 100644 --- a/doc/utf8cpp.html +++ b/doc/utf8cpp.html @@ -3,7 +3,9 @@ - + + +UTF8-CPP: UTF-8 with C++ in a Portable Way

The Sourceforge project page

@@ -42,9 +44,9 @@ want to handle UTF-8 encoded strings from C++, I am sure you have good reasons for it.

Examples of use

To illustrate the use of this utf8 library, we shall open a file -containing a line of UTF-8 encoded text, read the line into -std::string, convert the text to UTF-16, and write it -to another file:

+containing UTF-8 encoded text, check whether it starts with a byte order mark, +read each line into a std::string, check it for validity, convert the text to UTF-16, +and back to UTF-8:

 #include <fstream>
 #include <iostream>
@@ -54,60 +56,69 @@ using namespace std;
 
 int main()
 {
-    // Open the file with a utf-8 encoded line of text in it
-    ifstream fs8("utf8.txt");
+    if (argc != 2) {
+        cout << "\nUsage: docsample filename\n";
+        return 0;
+    }
+    const char* test_file_path = argv[1];
+    // Open the test file (must be UTF-8 encoded)
+    ifstream fs8(test_file_path);
     if (!fs8.is_open()) {
-        cout << "Could not open utf8.txt" << endl;
+    cout << "Could not open " << test_file_path << endl;
+    return 0;
+    }
+
+    // Read the first line of the file
+    unsigned line_count = 1;
+    string line;
+    if (!getline(fs8, line)) 
         return 0;
+
+    // Look for utf-8 byte-order mark at the beginning
+    if (line.size() > 2) {
+        if (utf8::is_bom(line.c_str()))
+          cout << "There is a byte order mark at the beginning of the file\n";
     }
 
-    // is there a utf8 marker? if yes, skip it.
-    fs8.seekg(0, ios::end);
-    ifstream::pos_type file_length = fs8.tellg();
-    fs8.seekg(0, ios::beg);
-    if (file_length > 3) {
-        char bom[3];
-        fs8.read(bom, 3);
-        if (!utf8::is_bom(bom))
-            fs8.seekg(0, ios::beg);        
-    }
+    // Play with all the lines in the file
+    do {
+        // check for invalid utf-8 (for a simple yes/no check, there is also utf8::is_valid function)
+        string::iterator end_it = utf8::find_invalid(line.begin(), line.end());
+        if (end_it != line.end()) {
+            cout << "Invalid UTF-8 encoding detected at line " << line_count << "\n";
+            cout << "This part is fine: " << string(line.begin(), end_it) << "\n";
+        }
+        // Get the line length (at least for the valid part)
+        int length = utf8::distance(line.begin(), end_it);
+        cout << "Length of line " << line_count << " is " << length <<  "\n";
 
-    // Read the line from the file
-    string text8;
-    getline(fs8, text8);
+        // Convert it to utf-16
+        vector<unsigned short> utf16line;
+        utf8::utf8to16(line.begin(), end_it, back_inserter(utf16line));
+        // And back to utf-8;
+        string utf8line; 
+        utf8::utf16to8(utf16line.begin(), utf16line.end(), back_inserter(utf8line));
+        // Confirm that the conversion went OK:
+        if (utf8line != string(line.begin(), end_it))
+            cout << "Error in UTF-16 conversion at line: " << line_count << "\n";        
 
-    // Make sure it is valid utf-8
-    if (!utf8::is_valid(text8.begin(), text8.end())) {
-        cout << "Invalid utf-8 text";
-        return 0;
-    }
+        getline(fs8, line);
+        line_count++;
+    } while (!fs8.eof());
 
-    // Convert the text to utf-16
-    vector<unsigned short> text16;
-    text16.push_back(0xfeff); // bom
-    utf8::utf8to16(text8.begin(), text8.end(), back_inserter(text16));
-
-    // Create  the file for writing the utf-16 string
-    ofstream fs16("utf16.txt", ios_base::out | ios_base::binary);
-    if (!fs16.is_open()) {
-        cout << "Could not open utf16.txt" << endl;
-        return 0;
-    }
-    
-    // Write the utf16 text to the file
-    fs16.write(reinterpret_cast<const char*>(&text16[0]), text16.size() * sizeof (unsigned short));
+    return 0;
 }
 
-

In the previous code sample, we have seen the use of 3 functions +

In the previous code sample, we have seen the use of the following functions from utf8 namespace: first we used is_bom function to detect UTF-8 byte order mark at the beginning of the -file, then is_valid to make sure that the text we -loaded is valid UTF-8, and finally utf8to16 to convert -the text to UTF-16 encoding. Note that the use of -is_valid was optional in this case; -utf8to16 throws an exception in case of invalid UTF-8 -text.

+file; then for each line we detected invalid UTF-8 sequences with find_invalid; +the number of characters (more precisely, the number of Unicode code points) in each line was determined +with utf8::distance; finally, we converted each line to UTF-16 encoding with +utf8to16 and back to UTF-8 with utf16to8. +

Reference

+

Functions From utf8 Namespace

utf8::append

Encodes a 32 bit code point as a UTF-8 sequence of octets and @@ -707,6 +718,7 @@ Consortium.

  • ICU Library.
  • UTF-8 at Wikipedia
  • +
  • UTF-8 and Unicode FAQ for Unix/Linux
  • diff --git a/samples/docsample.cpp b/samples/docsample.cpp new file mode 100644 index 0000000..fc8f6f3 --- /dev/null +++ b/samples/docsample.cpp @@ -0,0 +1,63 @@ +#include "../source/utf8.h" +#include +#include +#include +#include + + +using namespace std; + +int main(int argc, char** argv) +{ + if (argc != 2) { + cout << "\nUsage: docsample filename\n"; + return 0; + } + const char* test_file_path = argv[1]; + // Open the test file (must be UTF-8 encoded) + ifstream fs8(test_file_path); + if (!fs8.is_open()) { + cout << "Could not open " << test_file_path << endl; + return 0; + } + + // Read the first line of the file + unsigned line_count = 1; + string line; + if (!getline(fs8, line)) + return 0; + + // Look for utf-8 byte-order mark at the beginning + if (line.size() > 2) { + if (utf8::is_bom(line.c_str())) + cout << "There is a byte order mark at the beginning of the file\n"; + } + + // Play with all the lines in the file + do { + // check for invalid utf-8 (for a simple yes/no check, there is also utf8::is_valid function) + string::iterator end_it = utf8::find_invalid(line.begin(), line.end()); + if (end_it != line.end()) { + cout << "Invalid UTF-8 encoding detected at line " << line_count << "\n"; + cout << "This part is fine: " << string(line.begin(), end_it) << "\n"; + } + // Get the line length (at least for the valid part) + int length = utf8::distance(line.begin(), end_it); + cout << "Length of line " << line_count << " is " << length << "\n"; + + // Convert it to utf-16 + vector utf16line; + utf8::utf8to16(line.begin(), end_it, back_inserter(utf16line)); + // And back to utf-8; + string utf8line; + utf8::utf16to8(utf16line.begin(), utf16line.end(), back_inserter(utf8line)); + // Confirm that the conversion went OK: + if (utf8line != string(line.begin(), end_it)) + cout << "Error in UTF-16 conversion at line: " << line_count << "\n"; + + getline(fs8, line); + line_count++; + } while (!fs8.eof()); + + return 0; +} diff --git a/test_drivers/Makefile 
b/test_drivers/Makefile index a1d1c2c..a2cbe4c 100644 --- a/test_drivers/Makefile +++ b/test_drivers/Makefile @@ -15,3 +15,5 @@ negativetest: utf8readertest: cd utf8reader && $(MAKE) $@ +clean: + rm smoke_test/smoketest regression_tests/regressiontest negative/negative utf8reader/utf8reader diff --git a/test_drivers/negative/Makefile b/test_drivers/negative/Makefile index d1f6e27..02159f7 100644 --- a/test_drivers/negative/Makefile +++ b/test_drivers/negative/Makefile @@ -1,5 +1,5 @@ CC = g++ -CFLAGS = -g +CFLAGS = -g -Wall -pedantic negativetest: negative.cpp ../../source/utf8.h $(CC) $(CFLAGS) negative.cpp -onegative diff --git a/test_drivers/regression_tests/Makefile b/test_drivers/regression_tests/Makefile index b9a0b85..03f8cfe 100644 --- a/test_drivers/regression_tests/Makefile +++ b/test_drivers/regression_tests/Makefile @@ -1,5 +1,5 @@ CC = g++ -CFLAGS = -g +CFLAGS = -g -Wall -pedantic REG_FILES = r1_0Beta1/*h r1_0Beta2/*.h regressiontest: reg_tests_driver.cpp ../../source/utf8.h $(REG_FILES) diff --git a/test_drivers/utf8reader/Makefile b/test_drivers/utf8reader/Makefile index 74b6c07..e46435c 100644 --- a/test_drivers/utf8reader/Makefile +++ b/test_drivers/utf8reader/Makefile @@ -1,5 +1,5 @@ CC = g++ -CFLAGS = -g +CFLAGS = -g -Wall -pedantic utf8readertest: utf8reader.cpp ../../source/utf8.h $(CC) $(CFLAGS) utf8reader.cpp -o utf8reader