Added a sample app, updated documentation, added clean target to the Makefile, added options Wall and pedantic to test drivers
git-svn-id: http://svn.code.sf.net/p/utfcpp/code@57 a809a056-fc17-0410-9590-b4f493f8b08e
This commit is contained in:
parent
dab1a9da5a
commit
501d9a21c4
6 changed files with 126 additions and 49 deletions
104
doc/utf8cpp.html
104
doc/utf8cpp.html
|
@ -3,7 +3,9 @@
|
|||
<head>
|
||||
<meta name="generator" content=
|
||||
"HTML Tidy for Linux/x86 (vers 12 April 2005), see www.w3.org">
|
||||
<title></title>
|
||||
<meta name="description" content="A simple, portable and lightweight C++ library for easy handling of UTF-8 encoded strings">
|
||||
<meta name="keywords" content="UTF-8 C++ portable utf8 unicode generic templates">
|
||||
<title>UTF8-CPP: UTF-8 with C++ in a Portable Way</title>
|
||||
</head>
|
||||
<body>
|
||||
<p><a href="https://sourceforge.net/projects/utfcpp">The Sourceforge project page</a></p>
|
||||
|
@ -42,9 +44,9 @@ want to handle UTF-8 encoded strings from C++, I am sure you have
|
|||
good reasons for it.</p>
|
||||
<h2 id="examples">Examples of use</h2>
|
||||
<p>To illustrate the use of this utf8 library, we shall open a file
|
||||
containing a line of UTF-8 encoded text, read the line into
|
||||
<code>std::string</code>, convert the text to UTF-16, and write it
|
||||
to another file:</p>
|
||||
containing UTF-8 encoded text, check whether it starts with a byte order mark,
|
||||
read each line into a <code>std::string</code>, check it for validity, convert the text to UTF-16,
|
||||
and back to UTF-8:</p>
|
||||
<pre>
|
||||
#include <fstream>
|
||||
#include <iostream>
|
||||
|
@ -54,60 +56,69 @@ using namespace std;
|
|||
|
||||
int main()
|
||||
{
|
||||
// Open the file with a utf-8 encoded line of text in it
|
||||
ifstream fs8("utf8.txt");
|
||||
if (argc != 2) {
|
||||
cout << "\nUsage: docsample filename\n";
|
||||
return 0;
|
||||
}
|
||||
const char* test_file_path = argv[1];
|
||||
// Open the test file (must be UTF-8 encoded)
|
||||
ifstream fs8(test_file_path);
|
||||
if (!fs8.is_open()) {
|
||||
cout << "Could not open utf8.txt" << endl;
|
||||
cout << "Could not open " << test_file_path << endl;
|
||||
return 0;
|
||||
}
|
||||
|
||||
// Read the first line of the file
|
||||
unsigned line_count = 1;
|
||||
string line;
|
||||
if (!getline(fs8, line))
|
||||
return 0;
|
||||
|
||||
// Look for utf-8 byte-order mark at the beginning
|
||||
if (line.size() > 2) {
|
||||
if (utf8::is_bom(line.c_str()))
|
||||
cout << "There is a byte order mark at the beginning of the file\n";
|
||||
}
|
||||
|
||||
// is there a utf8 marker? if yes, skip it.
|
||||
fs8.seekg(0, ios::end);
|
||||
ifstream::pos_type file_length = fs8.tellg();
|
||||
fs8.seekg(0, ios::beg);
|
||||
if (file_length > 3) {
|
||||
char bom[3];
|
||||
fs8.read(bom, 3);
|
||||
if (!utf8::is_bom(bom))
|
||||
fs8.seekg(0, ios::beg);
|
||||
}
|
||||
// Play with all the lines in the file
|
||||
do {
|
||||
// check for invalid utf-8 (for a simple yes/no check, there is also utf8::is_valid function)
|
||||
string::iterator end_it = utf8::find_invalid(line.begin(), line.end());
|
||||
if (end_it != line.end()) {
|
||||
cout << "Invalid UTF-8 encoding detected at line " << line_count << "\n";
|
||||
cout << "This part is fine: " << string(line.begin(), end_it) << "\n";
|
||||
}
|
||||
// Get the line length (at least for the valid part)
|
||||
int length = utf8::distance(line.begin(), end_it);
|
||||
cout << "Length of line " << line_count << " is " << length << "\n";
|
||||
|
||||
// Read the line from the file
|
||||
string text8;
|
||||
getline(fs8, text8);
|
||||
// Convert it to utf-16
|
||||
vector<unsigned short> utf16line;
|
||||
utf8::utf8to16(line.begin(), end_it, back_inserter(utf16line));
|
||||
// And back to utf-8;
|
||||
string utf8line;
|
||||
utf8::utf16to8(utf16line.begin(), utf16line.end(), back_inserter(utf8line));
|
||||
// Confirm that the conversion went OK:
|
||||
if (utf8line != string(line.begin(), end_it))
|
||||
cout << "Error in UTF-16 conversion at line: " << line_count << "\n";
|
||||
|
||||
// Make sure it is valid utf-8
|
||||
if (!utf8::is_valid(text8.begin(), text8.end())) {
|
||||
cout << "Invalid utf-8 text";
|
||||
return 0;
|
||||
}
|
||||
getline(fs8, line);
|
||||
line_count++;
|
||||
} while (!fs8.eof());
|
||||
|
||||
// Convert the text to utf-16
|
||||
vector<unsigned short> text16;
|
||||
text16.push_back(0xfeff); // bom
|
||||
utf8::utf8to16(text8.begin(), text8.end(), back_inserter(text16));
|
||||
|
||||
// Create the file for writing the utf-16 string
|
||||
ofstream fs16("utf16.txt", ios_base::out | ios_base::binary);
|
||||
if (!fs16.is_open()) {
|
||||
cout << "Could not open utf16.txt" << endl;
|
||||
return 0;
|
||||
}
|
||||
|
||||
// Write the utf16 text to the file
|
||||
fs16.write(reinterpret_cast<const char*>(&text16[0]), text16.size() * sizeof (unsigned short));
|
||||
return 0;
|
||||
}
|
||||
</pre>
|
||||
<p>In the previous code sample, we have seen the use of 3 functions
|
||||
<p>In the previous code sample, we have seen the use of the following functions
|
||||
from <code>utf8</code> namespace: first we used <code>is_bom</code>
|
||||
function to detect UTF-8 byte order mark at the beginning of the
|
||||
file, then <code>is_valid</code> to make sure that the text we
|
||||
loaded is valid UTF-8, and finally <code>utf8to16</code> to convert
|
||||
the text to UTF-16 encoding. Note that the use of
|
||||
<code>is_valid</code> was optional in this case;
|
||||
<code>utf8to16</code> throws an exception in case of invalid UTF-8
|
||||
text.</p>
|
||||
file; then for each line we performed a detection of invalid UTF-8 sequences with <code>find_invalid</code>;
|
||||
the number of characters (more precisely - the number of Unicode code points) in each line was determined
|
||||
with the use of <code>utf8::distance</code>; finally, we have converted each line to UTF-16 encoding with
|
||||
<code>utf8to16</code> and back to UTF-8 with <code>utf16to8</code>.
|
||||
</p>
|
||||
<h2 id ="reference">Reference</h2>
|
||||
|
||||
<h3>Functions From utf8 Namespace</h3>
|
||||
<h4>utf8::append</h4>
|
||||
<p>Encodes a 32 bit code point as a UTF-8 sequence of octets and
|
||||
|
@ -707,6 +718,7 @@ Consortium</a>.</li>
|
|||
<li><a href="http://icu.sourceforge.net/">ICU Library</a>.</li>
|
||||
<li><a href="http://en.wikipedia.org/wiki/UTF-8">UTF-8 at
|
||||
Wikipedia</a></li>
|
||||
<li><a href="http://www.cl.cam.ac.uk/~mgk25/unicode.html">UTF-8 and Unicode FAQ for Unix/Linux</a></li>
|
||||
</ol>
|
||||
</body>
|
||||
</html>
|
||||
|
|
63
samples/docsample.cpp
Normal file
63
samples/docsample.cpp
Normal file
|
@ -0,0 +1,63 @@
|
|||
#include "../source/utf8.h"
|
||||
#include <iostream>
|
||||
#include <fstream>
|
||||
#include <string>
|
||||
#include <vector>
|
||||
|
||||
|
||||
using namespace std;
|
||||
|
||||
int main(int argc, char** argv)
|
||||
{
|
||||
if (argc != 2) {
|
||||
cout << "\nUsage: docsample filename\n";
|
||||
return 0;
|
||||
}
|
||||
const char* test_file_path = argv[1];
|
||||
// Open the test file (must be UTF-8 encoded)
|
||||
ifstream fs8(test_file_path);
|
||||
if (!fs8.is_open()) {
|
||||
cout << "Could not open " << test_file_path << endl;
|
||||
return 0;
|
||||
}
|
||||
|
||||
// Read the first line of the file
|
||||
unsigned line_count = 1;
|
||||
string line;
|
||||
if (!getline(fs8, line))
|
||||
return 0;
|
||||
|
||||
// Look for utf-8 byte-order mark at the beginning
|
||||
if (line.size() > 2) {
|
||||
if (utf8::is_bom(line.c_str()))
|
||||
cout << "There is a byte order mark at the beginning of the file\n";
|
||||
}
|
||||
|
||||
// Play with all the lines in the file
|
||||
do {
|
||||
// check for invalid utf-8 (for a simple yes/no check, there is also utf8::is_valid function)
|
||||
string::iterator end_it = utf8::find_invalid(line.begin(), line.end());
|
||||
if (end_it != line.end()) {
|
||||
cout << "Invalid UTF-8 encoding detected at line " << line_count << "\n";
|
||||
cout << "This part is fine: " << string(line.begin(), end_it) << "\n";
|
||||
}
|
||||
// Get the line length (at least for the valid part)
|
||||
int length = utf8::distance(line.begin(), end_it);
|
||||
cout << "Length of line " << line_count << " is " << length << "\n";
|
||||
|
||||
// Convert it to utf-16
|
||||
vector<unsigned short> utf16line;
|
||||
utf8::utf8to16(line.begin(), end_it, back_inserter(utf16line));
|
||||
// And back to utf-8;
|
||||
string utf8line;
|
||||
utf8::utf16to8(utf16line.begin(), utf16line.end(), back_inserter(utf8line));
|
||||
// Confirm that the conversion went OK:
|
||||
if (utf8line != string(line.begin(), end_it))
|
||||
cout << "Error in UTF-16 conversion at line: " << line_count << "\n";
|
||||
|
||||
getline(fs8, line);
|
||||
line_count++;
|
||||
} while (!fs8.eof());
|
||||
|
||||
return 0;
|
||||
}
|
|
@ -15,3 +15,5 @@ negativetest:
|
|||
utf8readertest:
|
||||
cd utf8reader && $(MAKE) $@
|
||||
|
||||
# Remove the test driver binaries. -f keeps the target from failing when
# some of them have not been built (or were already removed).
.PHONY: clean
clean:
	rm -f smoke_test/smoketest regression_tests/regressiontest negative/negative utf8reader/utf8reader
|
||||
|
|
|
@ -1,5 +1,5 @@
|
|||
CC = g++
|
||||
CFLAGS = -g
|
||||
CFLAGS = -g -Wall -pedantic
|
||||
|
||||
negativetest: negative.cpp ../../source/utf8.h
|
||||
$(CC) $(CFLAGS) negative.cpp -onegative
|
||||
|
|
|
@ -1,5 +1,5 @@
|
|||
CC = g++
|
||||
CFLAGS = -g
|
||||
CFLAGS = -g -Wall -pedantic
|
||||
# Headers of the per-release regression tests; the driver is rebuilt when any of them changes.
REG_FILES = r1_0Beta1/*.h r1_0Beta2/*.h
|
||||
|
||||
regressiontest: reg_tests_driver.cpp ../../source/utf8.h $(REG_FILES)
|
||||
|
|
|
@ -1,5 +1,5 @@
|
|||
CC = g++
|
||||
CFLAGS = -g
|
||||
CFLAGS = -g -Wall -pedantic
|
||||
|
||||
utf8readertest: utf8reader.cpp ../../source/utf8.h
|
||||
$(CC) $(CFLAGS) utf8reader.cpp -o utf8reader
|
||||
|
|
Loading…
Reference in a new issue