Added a sample app, updated documentation, added clean target to the Makefile, added options Wall and pedantic to test drivers

git-svn-id: http://svn.code.sf.net/p/utfcpp/code@57 a809a056-fc17-0410-9590-b4f493f8b08e
This commit is contained in:
ntrifunovic 2006-09-09 20:39:45 +00:00
parent dab1a9da5a
commit 501d9a21c4
6 changed files with 126 additions and 49 deletions

View file

@ -3,7 +3,9 @@
<head>
<meta name="generator" content=
"HTML Tidy for Linux/x86 (vers 12 April 2005), see www.w3.org">
<title></title>
<meta name="description" content="A simple, portable and lightweight C++ library for easy handling of UTF-8 encoded strings">
<meta name="keywords" content="UTF-8 C++ portable utf8 unicode generic templates">
<title>UTF8-CPP: UTF-8 with C++ in a Portable Way</title>
</head>
<body>
<p><a href="https://sourceforge.net/projects/utfcpp">The Sourceforge project page</a></p>
@ -42,9 +44,9 @@ want to handle UTF-8 encoded strings from C++, I am sure you have
good reasons for it.</p>
<h2 id="examples">Examples of use</h2>
<p>To illustrate the use of this utf8 library, we shall open a file
containing a line of UTF-8 encoded text, read the line into
<code>std::string</code>, convert the text to UTF-16, and write it
to another file:</p>
containing UTF-8 encoded text, check whether it starts with a byte order mark,
read each line into a <code>std::string</code>, check it for validity, convert the text to UTF-16,
and back to UTF-8:</p>
<pre>
#include &lt;fstream&gt;
#include &lt;iostream&gt;
@ -54,60 +56,69 @@ using namespace std;
int main()
{
// Open the file with a utf-8 encoded line of text in it
ifstream fs8("utf8.txt");
if (argc != 2) {
cout &lt;&lt; "\nUsage: docsample filename\n";
return 0;
}
const char* test_file_path = argv[1];
// Open the test file (must be UTF-8 encoded)
ifstream fs8(test_file_path);
if (!fs8.is_open()) {
cout &lt;&lt; "Could not open utf8.txt" &lt;&lt; endl;
cout &lt;&lt; "Could not open " &lt;&lt; test_file_path &lt;&lt; endl;
return 0;
}
// Read the first line of the file
unsigned line_count = 1;
string line;
if (!getline(fs8, line))
return 0;
// Look for utf-8 byte-order mark at the beginning
if (line.size() &gt; 2) {
if (utf8::is_bom(line.c_str()))
cout &lt;&lt; "There is a byte order mark at the beginning of the file\n";
}
// is there a utf8 marker? if yes, skip it.
fs8.seekg(0, ios::end);
ifstream::pos_type file_length = fs8.tellg();
fs8.seekg(0, ios::beg);
if (file_length &gt; 3) {
char bom[3];
fs8.read(bom, 3);
if (!utf8::is_bom(bom))
fs8.seekg(0, ios::beg);
}
// Play with all the lines in the file
do {
// check for invalid utf-8 (for a simple yes/no check, there is also utf8::is_valid function)
string::iterator end_it = utf8::find_invalid(line.begin(), line.end());
if (end_it != line.end()) {
cout &lt;&lt; "Invalid UTF-8 encoding detected at line " &lt;&lt; line_count &lt;&lt; "\n";
cout &lt;&lt; "This part is fine: " &lt;&lt; string(line.begin(), end_it) &lt;&lt; "\n";
}
// Get the line length (at least for the valid part)
int length = utf8::distance(line.begin(), end_it);
cout &lt;&lt; "Length of line " &lt;&lt; line_count &lt;&lt; " is " &lt;&lt; length &lt;&lt; "\n";
// Read the line from the file
string text8;
getline(fs8, text8);
// Convert it to utf-16
vector&lt;unsigned short&gt; utf16line;
utf8::utf8to16(line.begin(), end_it, back_inserter(utf16line));
// And back to utf-8;
string utf8line;
utf8::utf16to8(utf16line.begin(), utf16line.end(), back_inserter(utf8line));
// Confirm that the conversion went OK:
if (utf8line != string(line.begin(), end_it))
cout &lt;&lt; "Error in UTF-16 conversion at line: " &lt;&lt; line_count &lt;&lt; "\n";
// Make sure it is valid utf-8
if (!utf8::is_valid(text8.begin(), text8.end())) {
cout &lt;&lt; "Invalid utf-8 text";
return 0;
}
getline(fs8, line);
line_count++;
} while (!fs8.eof());
// Convert the text to utf-16
vector&lt;unsigned short&gt; text16;
text16.push_back(0xfeff); // bom
utf8::utf8to16(text8.begin(), text8.end(), back_inserter(text16));
// Create the file for writing the utf-16 string
ofstream fs16("utf16.txt", ios_base::out | ios_base::binary);
if (!fs16.is_open()) {
cout &lt;&lt; "Could not open utf16.txt" &lt;&lt; endl;
return 0;
}
// Write the utf16 text to the file
fs16.write(reinterpret_cast&lt;const char*&gt;(&amp;text16[0]), text16.size() * sizeof (unsigned short));
return 0;
}
</pre>
<p>In the previous code sample, we have seen the use of 3 functions
<p>In the previous code sample, we have seen the use of the following functions
from <code>utf8</code> namespace: first we used <code>is_bom</code>
function to detect UTF-8 byte order mark at the beginning of the
file, then <code>is_valid</code> to make sure that the text we
loaded is valid UTF-8, and finally <code>utf8to16</code> to convert
the text to UTF-16 encoding. Note that the use of
<code>is_valid</code> was optional in this case;
<code>utf8to16</code> throws an exception in case of invalid UTF-8
text.</p>
file; then for each line we performed a detection of invalid UTF-8 sequences with <code>find_invalid</code>;
the number of characters (more precisely - the number of Unicode code points) in each line was determined
using <code>utf8::distance</code>; finally, we have converted each line to UTF-16 encoding with
<code>utf8to16</code> and back to UTF-8 with <code>utf16to8</code>.
</p>
<h2 id ="reference">Reference</h2>
<h3>Functions From utf8 Namespace</h3>
<h4>utf8::append</h4>
<p>Encodes a 32 bit code point as a UTF-8 sequence of octets and
@ -707,6 +718,7 @@ Consortium</a>.</li>
<li><a href="http://icu.sourceforge.net/">ICU Library</a>.</li>
<li><a href="http://en.wikipedia.org/wiki/UTF-8">UTF-8 at
Wikipedia</a></li>
<li><a href="http://www.cl.cam.ac.uk/~mgk25/unicode.html">UTF-8 and Unicode FAQ for Unix/Linux</a></li>
</ol>
</body>
</html>

63
samples/docsample.cpp Normal file
View file

@ -0,0 +1,63 @@
#include "../source/utf8.h"
#include <iostream>
#include <fstream>
#include <string>
#include <vector>
using namespace std;
// Sample driver for the utf8 library.
//
// Usage: docsample filename
//
// Reads the named file (expected to be UTF-8 encoded) line by line. It reports
// a byte order mark at the start of the first line, and for every line reports
// invalid UTF-8, prints the length of the valid part, and converts that part
// to UTF-16 and back to confirm the round trip.
//
// Returns 0 in every case, including wrong usage and a file that cannot be
// opened or is empty.
int main(int argc, char** argv)
{
    if (argc != 2) {
        cout << "\nUsage: docsample filename\n";
        return 0;
    }
    const char* test_file_path = argv[1];

    // Open the test file (must be UTF-8 encoded)
    ifstream fs8(test_file_path);
    if (!fs8.is_open()) {
        cout << "Could not open " << test_file_path << endl;
        return 0;
    }

    // Read the first line of the file; nothing to do if there is none
    unsigned line_count = 1;
    string line;
    if (!getline(fs8, line))
        return 0;

    // Look for utf-8 byte-order mark at the beginning.
    // The size guard keeps is_bom from being handed fewer than three octets.
    if (line.size() > 2) {
        if (utf8::is_bom(line.c_str()))
            cout << "There is a byte order mark at the beginning of the file\n";
    }

    // Play with all the lines in the file
    do {
        // check for invalid utf-8 (for a simple yes/no check, there is also utf8::is_valid function)
        string::iterator end_it = utf8::find_invalid(line.begin(), line.end());
        if (end_it != line.end()) {
            cout << "Invalid UTF-8 encoding detected at line " << line_count << "\n";
            cout << "This part is fine: " << string(line.begin(), end_it) << "\n";
        }

        // Get the line length (at least for the valid part)
        int length = utf8::distance(line.begin(), end_it);
        cout << "Length of line " << line_count << " is " << length << "\n";

        // Convert it to utf-16
        vector<unsigned short> utf16line;
        utf8::utf8to16(line.begin(), end_it, back_inserter(utf16line));

        // And back to utf-8
        string utf8line;
        utf8::utf16to8(utf16line.begin(), utf16line.end(), back_inserter(utf8line));

        // Confirm that the conversion went OK:
        if (utf8line != string(line.begin(), end_it))
            cout << "Error in UTF-16 conversion at line: " << line_count << "\n";

        ++line_count;

        // The read itself is the loop condition: a final line without a
        // trailing newline is still processed (getline succeeds even though it
        // reaches end of file), and any read failure ends the loop.
    } while (getline(fs8, line));

    return 0;
}

View file

@ -15,3 +15,5 @@ negativetest:
utf8readertest:
cd utf8reader && $(MAKE) $@
# Remove the binaries built by the test drivers.
# -f keeps "make clean" from failing when a binary has not been built yet.
clean:
	rm -f smoke_test/smoketest regression_tests/regressiontest negative/negative utf8reader/utf8reader

View file

@ -1,5 +1,5 @@
CC = g++
CFLAGS = -g
CFLAGS = -g -Wall -pedantic
negativetest: negative.cpp ../../source/utf8.h
$(CC) $(CFLAGS) negative.cpp -onegative

View file

@ -1,5 +1,5 @@
CC = g++
CFLAGS = -g
CFLAGS = -g -Wall -pedantic
# Headers of the regression tests; listed as prerequisites of the driver.
REG_FILES = r1_0Beta1/*.h r1_0Beta2/*.h
regressiontest: reg_tests_driver.cpp ../../source/utf8.h $(REG_FILES)

View file

@ -1,5 +1,5 @@
CC = g++
CFLAGS = -g
CFLAGS = -g -Wall -pedantic
utf8readertest: utf8reader.cpp ../../source/utf8.h
$(CC) $(CFLAGS) utf8reader.cpp -o utf8reader