From 501d9a21c42b1991411c690ae63a0d84f7b9621f Mon Sep 17 00:00:00 2001
From: ntrifunovic
To illustrate the use of this utf8 library, we shall open a file
-containing a line of UTF-8 encoded text, read the line into
-std::string
, convert the text to UTF-16, and write it
-to another file:
std::string
, check it for validity, convert the text to UTF-16,
+and back to UTF-8:
#include <fstream> #include <iostream> @@ -54,60 +56,69 @@ using namespace std; int main() { - // Open the file with a utf-8 encoded line of text in it - ifstream fs8("utf8.txt"); + if (argc != 2) { + cout << "\nUsage: docsample filename\n"; + return 0; + } + const char* test_file_path = argv[1]; + // Open the test file (must be UTF-8 encoded) + ifstream fs8(test_file_path); if (!fs8.is_open()) { - cout << "Could not open utf8.txt" << endl; + cout << "Could not open " << test_file_path << endl; + return 0; + } + + // Read the first line of the file + unsigned line_count = 1; + string line; + if (!getline(fs8, line)) return 0; + + // Look for utf-8 byte-order mark at the beginning + if (line.size() > 2) { + if (utf8::is_bom(line.c_str())) + cout << "There is a byte order mark at the beginning of the file\n"; } - // is there a utf8 marker? if yes, skip it. - fs8.seekg(0, ios::end); - ifstream::pos_type file_length = fs8.tellg(); - fs8.seekg(0, ios::beg); - if (file_length > 3) { - char bom[3]; - fs8.read(bom, 3); - if (!utf8::is_bom(bom)) - fs8.seekg(0, ios::beg); - } + // Play with all the lines in the file + do { + // check for invalid utf-8 (for a simple yes/no check, there is also utf8::is_valid function) + string::iterator end_it = utf8::find_invalid(line.begin(), line.end()); + if (end_it != line.end()) { + cout << "Invalid UTF-8 encoding detected at line " << line_count << "\n"; + cout << "This part is fine: " << string(line.begin(), end_it) << "\n"; + } + // Get the line length (at least for the valid part) + int length = utf8::distance(line.begin(), end_it); + cout << "Length of line " << line_count << " is " << length << "\n"; - // Read the line from the file - string text8; - getline(fs8, text8); + // Convert it to utf-16 + vector<unsigned short> utf16line; + utf8::utf8to16(line.begin(), end_it, back_inserter(utf16line)); + // And back to utf-8; + string utf8line; + utf8::utf16to8(utf16line.begin(), utf16line.end(), back_inserter(utf8line)); + // Confirm 
that the conversion went OK: + if (utf8line != string(line.begin(), end_it)) + cout << "Error in UTF-16 conversion at line: " << line_count << "\n"; - // Make sure it is valid utf-8 - if (!utf8::is_valid(text8.begin(), text8.end())) { - cout << "Invalid utf-8 text"; - return 0; - } + getline(fs8, line); + line_count++; + } while (!fs8.eof()); - // Convert the text to utf-16 - vector<unsigned short> text16; - text16.push_back(0xfeff); // bom - utf8::utf8to16(text8.begin(), text8.end(), back_inserter(text16)); - - // Create the file for writing the utf-16 string - ofstream fs16("utf16.txt", ios_base::out | ios_base::binary); - if (!fs16.is_open()) { - cout << "Could not open utf16.txt" << endl; - return 0; - } - - // Write the utf16 text to the file - fs16.write(reinterpret_cast<const char*>(&text16[0]), text16.size() * sizeof (unsigned short)); + return 0; }-
In the previous code sample, we have seen the use of 3 functions +
In the previous code sample, we have seen the use of the following functions
from utf8
namespace: first we used is_bom
function to detect UTF-8 byte order mark at the beginning of the
-file, then is_valid
to make sure that the text we
-loaded is valid UTF-8, and finally utf8to16
to convert
-the text to UTF-16 encoding. Note that the use of
-is_valid
was optional in this case;
-utf8to16
throws an exception in case of invalid UTF-8
-text.
find_invalid
;
+the number of characters (more precisely, the number of Unicode code points) in each line was determined
+with the use of utf8::distance
; finally, we have converted each line to UTF-16 encoding with
+utf8to16
and back to UTF-8 with utf16to8
.
+
Encodes a 32 bit code point as a UTF-8 sequence of octets and @@ -707,6 +718,7 @@ Consortium.