Compare commits

...

84 commits

Author SHA1 Message Date
ntrifunovic
38a187a7dd Adding the LICENSE file
git-svn-id: http://svn.code.sf.net/p/utfcpp/code@142 a809a056-fc17-0410-9590-b4f493f8b08e
2014-05-15 01:23:53 +00:00
ntrifunovic
cc3c158bf8 Introducing Boost Test for unit-testing v3.
git-svn-id: http://svn.code.sf.net/p/utfcpp/code@141 a809a056-fc17-0410-9590-b4f493f8b08e
2013-03-09 20:51:50 +00:00
ntrifunovic
7075404ff0 First check in for branch 3.x - playing with utf8::append
git-svn-id: http://svn.code.sf.net/p/utfcpp/code@140 a809a056-fc17-0410-9590-b4f493f8b08e
2013-02-24 03:06:50 +00:00
ntrifunovic
fa73898a3d Removing version 1_0 directory
git-svn-id: http://svn.code.sf.net/p/utfcpp/code@139 a809a056-fc17-0410-9590-b4f493f8b08e
2013-02-18 00:05:43 +00:00
ntrifunovic
62b7d7ae0c Release 2.3.4
git-svn-id: http://svn.code.sf.net/p/utfcpp/code@138 a809a056-fc17-0410-9590-b4f493f8b08e
2013-02-17 22:40:46 +00:00
ntrifunovic
596feae4b9 Release 2.3.3
git-svn-id: http://svn.code.sf.net/p/utfcpp/code@137 a809a056-fc17-0410-9590-b4f493f8b08e
2013-02-16 16:30:43 +00:00
ntrifunovic
129a2f4508 Fix for bug ID: 3576827 - replace_invalid() only works with back_inserter
git-svn-id: http://svn.code.sf.net/p/utfcpp/code@136 a809a056-fc17-0410-9590-b4f493f8b08e
2013-02-09 23:33:27 +00:00
ntrifunovic
7767eb67e8 Fixing a potential problem with utf8 to utf16/32 conversions
git-svn-id: http://svn.code.sf.net/p/utfcpp/code@135 a809a056-fc17-0410-9590-b4f493f8b08e
2013-02-09 22:12:53 +00:00
ntrifunovic
d569ff9c55 Fixing a smoke-test warning
git-svn-id: http://svn.code.sf.net/p/utfcpp/code@134 a809a056-fc17-0410-9590-b4f493f8b08e
2013-02-09 21:55:13 +00:00
ntrifunovic
7d589c4210 Fix for bug ID: 3602629 - extra ';' after member function definition
git-svn-id: http://svn.code.sf.net/p/utfcpp/code@133 a809a056-fc17-0410-9590-b4f493f8b08e
2013-02-09 21:50:09 +00:00
ntrifunovic
100dd38c70 Release 2.3.2
git-svn-id: http://svn.code.sf.net/p/utfcpp/code@132 a809a056-fc17-0410-9590-b4f493f8b08e
2012-05-26 23:56:44 +00:00
ntrifunovic
4720a99866 Removing a regression test for a reported bug that I am not fixing.
git-svn-id: http://svn.code.sf.net/p/utfcpp/code@131 a809a056-fc17-0410-9590-b4f493f8b08e
2012-05-26 17:17:43 +00:00
ntrifunovic
adb7687b2f Fix for the bug 3506114: potential crash in replace_invalid
git-svn-id: http://svn.code.sf.net/p/utfcpp/code@130 a809a056-fc17-0410-9590-b4f493f8b08e
2012-05-22 22:55:47 +00:00
ntrifunovic
cd80d5fa9e Changed validate_next to take a reference instead of pointer. Resulted in 5% performance improvement.
git-svn-id: http://svn.code.sf.net/p/utfcpp/code@129 a809a056-fc17-0410-9590-b4f493f8b08e
2011-11-12 17:12:34 +00:00
ntrifunovic
a1eaf5688a Adding a regression test
git-svn-id: http://svn.code.sf.net/p/utfcpp/code@128 a809a056-fc17-0410-9590-b4f493f8b08e
2011-11-06 16:13:52 +00:00
ntrifunovic
e464ef8e86 Fix for the bug ID: 3426789[guidline -> guideline]
git-svn-id: http://svn.code.sf.net/p/utfcpp/code@127 a809a056-fc17-0410-9590-b4f493f8b08e
2011-10-29 22:26:12 +00:00
ntrifunovic
93286b9390 Removed some superfluous code
git-svn-id: http://svn.code.sf.net/p/utfcpp/code@126 a809a056-fc17-0410-9590-b4f493f8b08e
2011-10-16 03:06:05 +00:00
ntrifunovic
7414d0fabf Changed the optimization option from O2 to O3 for the perf test
git-svn-id: http://svn.code.sf.net/p/utfcpp/code@125 a809a056-fc17-0410-9590-b4f493f8b08e
2011-10-16 01:16:51 +00:00
ntrifunovic
26d8c8e424 Refactored internal functions in core.h
git-svn-id: http://svn.code.sf.net/p/utfcpp/code@124 a809a056-fc17-0410-9590-b4f493f8b08e
2011-10-15 22:54:58 +00:00
ntrifunovic
36839ac4e7 Fixed iconv perf test to print out results
git-svn-id: http://svn.code.sf.net/p/utfcpp/code@123 a809a056-fc17-0410-9590-b4f493f8b08e
2011-06-24 23:31:24 +00:00
ntrifunovic
9d7a97089c Fix for the bug [name clash with std::next - ID: 3215839]
git-svn-id: http://svn.code.sf.net/p/utfcpp/code@122 a809a056-fc17-0410-9590-b4f493f8b08e
2011-06-24 23:21:41 +00:00
ntrifunovic
1c3b1a352e Fixed the negative test to reflect the latest changes in detecting invalid utf-8 text
git-svn-id: http://svn.code.sf.net/p/utfcpp/code@121 a809a056-fc17-0410-9590-b4f493f8b08e
2011-02-20 21:02:33 +00:00
ntrifunovic
26b3524f45 Removed redundant regression tests
git-svn-id: http://svn.code.sf.net/p/utfcpp/code@120 a809a056-fc17-0410-9590-b4f493f8b08e
2011-02-20 20:49:41 +00:00
ntrifunovic
5d8b75cd6b Release 2.3.1
git-svn-id: http://svn.code.sf.net/p/utfcpp/code@119 a809a056-fc17-0410-9590-b4f493f8b08e
2011-02-20 18:52:44 +00:00
ntrifunovic
5347b21b56 Fix for ID: 3185087 - utf8::prior and utf8::previous documentation issue
git-svn-id: http://svn.code.sf.net/p/utfcpp/code@118 a809a056-fc17-0410-9590-b4f493f8b08e
2011-02-20 18:33:36 +00:00
ntrifunovic
a4fce3befd Fix for the bug ID: 3083640 - is_code_point_valid incorrectly returns false
git-svn-id: http://svn.code.sf.net/p/utfcpp/code@117 a809a056-fc17-0410-9590-b4f493f8b08e
2011-02-20 18:07:59 +00:00
ntrifunovic
2976b72daa Fix for [3167987]: prior moves it before start
git-svn-id: http://svn.code.sf.net/p/utfcpp/code@116 a809a056-fc17-0410-9590-b4f493f8b08e
2011-02-15 01:18:49 +00:00
ntrifunovic
cc4fe49fdc Minor improvements to performance testing code
git-svn-id: http://svn.code.sf.net/p/utfcpp/code@115 a809a056-fc17-0410-9590-b4f493f8b08e
2010-09-04 16:10:35 +00:00
ntrifunovic
05e6c4ad8d Fix for the bug ID: 3025042: is_bom documentation issue
git-svn-id: http://svn.code.sf.net/p/utfcpp/code@114 a809a056-fc17-0410-9590-b4f493f8b08e
2010-09-04 15:47:12 +00:00
ntrifunovic
14acee1ec5 Release 2.3
git-svn-id: http://svn.code.sf.net/p/utfcpp/code@113 a809a056-fc17-0410-9590-b4f493f8b08e
2010-04-18 00:29:14 +00:00
ntrifunovic
8039bd481b Completed documentation for the exceptions. Fixed bug ID: 2960112: is_bom wording fix
git-svn-id: http://svn.code.sf.net/p/utfcpp/code@112 a809a056-fc17-0410-9590-b4f493f8b08e
2010-04-17 17:09:40 +00:00
ntrifunovic
656f3847e8 Feature request 2857462: Proposed minor extension: safe version of is_bom
git-svn-id: http://svn.code.sf.net/p/utfcpp/code@111 a809a056-fc17-0410-9590-b4f493f8b08e
2009-12-20 22:46:01 +00:00
ntrifunovic
ac756dc9d6 Fix for the bug ID: 2915657 - 64bit portability issue
git-svn-id: http://svn.code.sf.net/p/utfcpp/code@110 a809a056-fc17-0410-9590-b4f493f8b08e
2009-12-20 22:03:47 +00:00
ntrifunovic
0f2c72abf1 Removng the boost directory. Its purpose was to prepare a version of UTF8 CPP for submition to Boost. This plan does not seem feasible.
git-svn-id: http://svn.code.sf.net/p/utfcpp/code@109 a809a056-fc17-0410-9590-b4f493f8b08e
2009-12-20 21:51:15 +00:00
ntrifunovic
59e75aa511 Feature ID 2885695: "Group" utf8 exceptions.
git-svn-id: http://svn.code.sf.net/p/utfcpp/code@108 a809a056-fc17-0410-9590-b4f493f8b08e
2009-12-13 19:39:22 +00:00
ntrifunovic
baf711282e Fix for the bug [ID: 2906315]: < instead != in utf8to32
git-svn-id: http://svn.code.sf.net/p/utfcpp/code@107 a809a056-fc17-0410-9590-b4f493f8b08e
2009-12-07 01:34:23 +00:00
ntrifunovic
301bd94165 Release 2.2.4
git-svn-id: http://svn.code.sf.net/p/utfcpp/code@106 a809a056-fc17-0410-9590-b4f493f8b08e
2009-10-31 16:14:51 +00:00
ntrifunovic
a415a2f081 Fix for the bug ID: 2857454 [dereference invalid iterator]
git-svn-id: http://svn.code.sf.net/p/utfcpp/code@105 a809a056-fc17-0410-9590-b4f493f8b08e
2009-10-29 01:18:27 +00:00
ntrifunovic
d97ccb32f7 Release 2.2.3
git-svn-id: http://svn.code.sf.net/p/utfcpp/code@104 a809a056-fc17-0410-9590-b4f493f8b08e
2009-10-12 23:03:44 +00:00
ntrifunovic
ba4b4c1e83 Fixing regression test id_2857454
git-svn-id: http://svn.code.sf.net/p/utfcpp/code@103 a809a056-fc17-0410-9590-b4f493f8b08e
2009-10-12 22:54:15 +00:00
ntrifunovic
da0c8b96d9 Fix for bug #ID: 2857456[redundant checks in append in checked.h]
git-svn-id: http://svn.code.sf.net/p/utfcpp/code@102 a809a056-fc17-0410-9590-b4f493f8b08e
2009-10-04 18:17:22 +00:00
ntrifunovic
080865eb02 Added regression test for [ 2857454 ] dereference invalid iterator when lead surrogate was last element of the string
git-svn-id: http://svn.code.sf.net/p/utfcpp/code@101 a809a056-fc17-0410-9590-b4f493f8b08e
2009-09-27 18:47:45 +00:00
ntrifunovic
f37a772149 Fix for bug ID: 2852872 [invalid utf16 strings were parsed without any error]
git-svn-id: http://svn.code.sf.net/p/utfcpp/code@100 a809a056-fc17-0410-9590-b4f493f8b08e
2009-09-26 01:41:24 +00:00
ntrifunovic
6c3aa1f33e Added a regression test to detect a sequence of multiple trail surrogate code units
git-svn-id: http://svn.code.sf.net/p/utfcpp/code@99 a809a056-fc17-0410-9590-b4f493f8b08e
2009-09-26 01:13:26 +00:00
ntrifunovic
06cc5cf480 Fix for the bug ID: 2830326: " multiple definition of `utf8::internal::is_overlong_sequence"
git-svn-id: http://svn.code.sf.net/p/utfcpp/code@98 a809a056-fc17-0410-9590-b4f493f8b08e
2009-08-01 01:50:13 +00:00
ntrifunovic
3c9c379857 Release 2.2.1
git-svn-id: http://svn.code.sf.net/p/utfcpp/code@97 a809a056-fc17-0410-9590-b4f493f8b08e
2009-07-28 00:51:51 +00:00
ntrifunovic
6c7224f4f2 Fixing the test drivers to work with GCC 4.3
git-svn-id: http://svn.code.sf.net/p/utfcpp/code@96 a809a056-fc17-0410-9590-b4f493f8b08e
2009-07-28 00:40:12 +00:00
ntrifunovic
169bfe469c Fix for the bug ID: 2823847: warnings from GCC 4.3
git-svn-id: http://svn.code.sf.net/p/utfcpp/code@95 a809a056-fc17-0410-9590-b4f493f8b08e
2009-07-28 00:31:03 +00:00
ntrifunovic
f344a3fb4d Release 2.2
git-svn-id: http://svn.code.sf.net/p/utfcpp/code@94 a809a056-fc17-0410-9590-b4f493f8b08e
2009-07-07 00:47:54 +00:00
ntrifunovic
054defb568 Another update of documentation
git-svn-id: http://svn.code.sf.net/p/utfcpp/code@93 a809a056-fc17-0410-9590-b4f493f8b08e
2009-07-07 00:46:34 +00:00
ntrifunovic
9d935b3c69 Updated documentation to include additional samples. Fixed a typo in core.h
git-svn-id: http://svn.code.sf.net/p/utfcpp/code@92 a809a056-fc17-0410-9590-b4f493f8b08e
2009-07-05 21:14:40 +00:00
ntrifunovic
e2799bdab6 Removed std::distance from validate_next and (hopefully) made it work with input iterators. Also, did a major
refactoring of that function.


git-svn-id: http://svn.code.sf.net/p/utfcpp/code@91 a809a056-fc17-0410-9590-b4f493f8b08e
2009-07-05 00:09:18 +00:00
ntrifunovic
74be521392 Updated the documentation to have a better intro sample
git-svn-id: http://svn.code.sf.net/p/utfcpp/code@90 a809a056-fc17-0410-9590-b4f493f8b08e
2009-07-03 19:40:14 +00:00
ntrifunovic
40a955eef6 Updated the docsample sample
git-svn-id: http://svn.code.sf.net/p/utfcpp/code@89 a809a056-fc17-0410-9590-b4f493f8b08e
2009-07-03 15:28:31 +00:00
ntrifunovic
4df5e1c1ea Fixed the negative test and made the input file name the comand line argument
git-svn-id: http://svn.code.sf.net/p/utfcpp/code@88 a809a056-fc17-0410-9590-b4f493f8b08e
2009-07-02 23:56:52 +00:00
ntrifunovic
5748eeff08 Removing bidirectional restrictions for the octet_iterator
git-svn-id: http://svn.code.sf.net/p/utfcpp/code@87 a809a056-fc17-0410-9590-b4f493f8b08e
2009-07-01 11:55:37 +00:00
ntrifunovic
dacd49dde9 Fixed the signature for main in the sample
git-svn-id: http://svn.code.sf.net/p/utfcpp/code@86 a809a056-fc17-0410-9590-b4f493f8b08e
2009-05-17 19:21:31 +00:00
ntrifunovic
76c6662ef9 Implemented feature request ID: 2515238; renamed an internal enum to avoid conflicts with macros from other libraries
git-svn-id: http://svn.code.sf.net/p/utfcpp/code@85 a809a056-fc17-0410-9590-b4f493f8b08e
2009-03-14 17:19:41 +00:00
ntrifunovic
c92c41770d Release 2.1
git-svn-id: http://svn.code.sf.net/p/utfcpp/code@84 a809a056-fc17-0410-9590-b4f493f8b08e
2007-12-16 18:52:45 +00:00
ntrifunovic
7568388d19 Updated the documentation and a test to include peek_next()
git-svn-id: http://svn.code.sf.net/p/utfcpp/code@83 a809a056-fc17-0410-9590-b4f493f8b08e
2007-10-27 23:34:59 +00:00
ntrifunovic
d2081b8381 Added peek_next
git-svn-id: http://svn.code.sf.net/p/utfcpp/code@82 a809a056-fc17-0410-9590-b4f493f8b08e
2007-10-25 22:12:22 +00:00
ntrifunovic
193c1032c2 Deleted the 2_1 branch. Decided against added the utf-8 string type, at least for now.
git-svn-id: http://svn.code.sf.net/p/utfcpp/code@81 a809a056-fc17-0410-9590-b4f493f8b08e
2007-10-24 23:06:53 +00:00
ntrifunovic
f6668b3189 Version 2.1 branch
git-svn-id: http://svn.code.sf.net/p/utfcpp/code@80 a809a056-fc17-0410-9590-b4f493f8b08e
2007-04-06 13:32:27 +00:00
ntrifunovic
f58bf21527 Release 2.0
git-svn-id: http://svn.code.sf.net/p/utfcpp/code@79 a809a056-fc17-0410-9590-b4f493f8b08e
2007-02-25 00:26:48 +00:00
ntrifunovic
baf63b327a Updated documentation. Fixed a small bug in checked.h. Added new checks to the negative tests
git-svn-id: http://svn.code.sf.net/p/utfcpp/code@78 a809a056-fc17-0410-9590-b4f493f8b08e
2007-02-25 00:16:10 +00:00
ntrifunovic
cd3092c0ca A minor documentation fix
git-svn-id: http://svn.code.sf.net/p/utfcpp/code@77 a809a056-fc17-0410-9590-b4f493f8b08e
2006-12-18 02:20:56 +00:00
ntrifunovic
b4f5578f4d Updated buildrelease.pl for the 2.0 source code structure
git-svn-id: http://svn.code.sf.net/p/utfcpp/code@76 a809a056-fc17-0410-9590-b4f493f8b08e
2006-12-18 02:18:30 +00:00
ntrifunovic
fe0be22e75 Release 2.0 Beta 1
git-svn-id: http://svn.code.sf.net/p/utfcpp/code@75 a809a056-fc17-0410-9590-b4f493f8b08e
2006-12-18 01:52:36 +00:00
ntrifunovic
3df044a663 Added documentation for the iterator adapter
git-svn-id: http://svn.code.sf.net/p/utfcpp/code@74 a809a056-fc17-0410-9590-b4f493f8b08e
2006-12-18 01:52:13 +00:00
ntrifunovic
83b6f918a9 Updated makefiles to reflect the new source structure
git-svn-id: http://svn.code.sf.net/p/utfcpp/code@73 a809a056-fc17-0410-9590-b4f493f8b08e
2006-12-18 01:50:44 +00:00
ntrifunovic
e022e54c64 The requirement for octet_iterator is bidirectional rather than random access now. Other minor changes
git-svn-id: http://svn.code.sf.net/p/utfcpp/code@72 a809a056-fc17-0410-9590-b4f493f8b08e
2006-12-18 01:49:58 +00:00
ntrifunovic
77c267b49e Added boost directory, updated documentation, fixed a typo in a local variable in the code
git-svn-id: http://svn.code.sf.net/p/utfcpp/code@71 a809a056-fc17-0410-9590-b4f493f8b08e
2006-12-07 02:34:42 +00:00
ntrifunovic
6f08efdc90 Added unchecked::previous to the 1.x branch
git-svn-id: http://svn.code.sf.net/p/utfcpp/code@70 a809a056-fc17-0410-9590-b4f493f8b08e
2006-11-23 18:11:24 +00:00
ntrifunovic
fb13348356 Added the unchecked iterator, added base() to the checked one, updated tests
git-svn-id: http://svn.code.sf.net/p/utfcpp/code@69 a809a056-fc17-0410-9590-b4f493f8b08e
2006-11-23 18:10:26 +00:00
ntrifunovic
8da1b779ac Release notes for v1.02
git-svn-id: http://svn.code.sf.net/p/utfcpp/code@68 a809a056-fc17-0410-9590-b4f493f8b08e
2006-11-20 20:05:44 +00:00
ntrifunovic
c7fd119bec deprecated previous and introduced prior instead
git-svn-id: http://svn.code.sf.net/p/utfcpp/code@67 a809a056-fc17-0410-9590-b4f493f8b08e
2006-11-19 01:15:37 +00:00
ntrifunovic
e4dc80dae3 Added the checked iterator, function prior that replaces previous, and updated the html documentation
git-svn-id: http://svn.code.sf.net/p/utfcpp/code@66 a809a056-fc17-0410-9590-b4f493f8b08e
2006-11-04 01:28:38 +00:00
ntrifunovic
d2ee7164b6 Added the first version of the iterator to the code. Started upgrading the html documentation
git-svn-id: http://svn.code.sf.net/p/utfcpp/code@65 a809a056-fc17-0410-9590-b4f493f8b08e
2006-10-28 16:25:52 +00:00
ntrifunovic
24f4090afa Release 1.01
git-svn-id: http://svn.code.sf.net/p/utfcpp/code@64 a809a056-fc17-0410-9590-b4f493f8b08e
2006-10-24 12:51:05 +00:00
ntrifunovic
f90dc28c5b Fix for the bug 1583547: exception::what is a const member function
git-svn-id: http://svn.code.sf.net/p/utfcpp/code@63 a809a056-fc17-0410-9590-b4f493f8b08e
2006-10-24 12:41:15 +00:00
ntrifunovic
70bf3379df split the library into multiple .h files
git-svn-id: http://svn.code.sf.net/p/utfcpp/code@62 a809a056-fc17-0410-9590-b4f493f8b08e
2006-10-20 22:37:59 +00:00
ntrifunovic
f0fce39119 Implemented replace_invalid functionality
git-svn-id: http://svn.code.sf.net/p/utfcpp/code@61 a809a056-fc17-0410-9590-b4f493f8b08e
2006-10-07 21:25:47 +00:00
ntrifunovic
8af502d493 Version 2.x of the library
git-svn-id: http://svn.code.sf.net/p/utfcpp/code@60 a809a056-fc17-0410-9590-b4f493f8b08e
2006-09-29 19:28:38 +00:00
ntrifunovic
9d706078c8 Moved the current tree under the new v1_0 directory. For the new development, I'll create new branches.
git-svn-id: http://svn.code.sf.net/p/utfcpp/code@59 a809a056-fc17-0410-9590-b4f493f8b08e
2006-09-15 20:07:05 +00:00
46 changed files with 3209 additions and 1411 deletions

View file

@ -1,14 +0,0 @@
utf8 cpp library
Release 1.0 Final
This is the first production release of the library.
Bug Fixes:
No bugs reported since Beta 3.
Other changes from Beta 3:
- Performance tuning: The library is now biased towards texts with lots of ASCII characters.
- Various refactorings to improve readability and reduce code duplication
Files: utf8.h utf8cpp.html ReleaseNotes

View file

@ -1,724 +0,0 @@
<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 3.2//EN">
<html>
<head>
<meta name="generator" content=
"HTML Tidy for Linux/x86 (vers 12 April 2005), see www.w3.org">
<meta name="description" content="A simple, portable and lightweight C++ library for easy handling of UTF-8 encoded strings">
<meta name="keywords" content="UTF-8 C++ portable utf8 unicode generic templates">
<title>UTF8-CPP: UTF-8 with C++ in a Portable Way</title>
</head>
<body>
<p><a href="https://sourceforge.net/projects/utfcpp">The Sourceforge project page</a></p>
<h2>Table of Contents</h2>
<ul>
<li><a href="#introduction">Introduction</a></li>
<li><a href="#examples">Examples of Use</a></li>
<li><a href="#reference">Reference</a></li>
<li><a href="#points">Points of Interest</a></li>
<li><a href="#conclusion">Conclusion</a></li>
<li><a href="#references">References</a></li>
</ul>
<h2 id="introduction">Introduction</h2>
<p>Many C++ developers miss an easy and portable way of handling
Unicode encoded strings. C++ Standard is currently Unicode
agnostic, and while some work is being done to introduce Unicode to
the next incarnation called C++0x, for the moment nothing of the
sort is available. In the meantime, developers use 3rd party
libraries like ICU, OS specific capabilities, or simply roll out
their own solutions.</p>
<p>In order to easily handle UTF-8 encoded Unicode strings, I have
come up with a set of template functions. For anybody used to working
with STL algorithms, they should be easy and natural to use. The
code is freely available for any purpose - check out the license at
the beginning of the utf8.h file. Be aware, though, that while I
did some testing, this library has not been used in production yet.
If you run into bugs or performance issues, please let me know and
I'll do my best to address them.</p>
<p>The purpose of this article is not to offer an introduction to
Unicode in general, and UTF-8 in particular. If you are not
familiar with Unicode, be sure to check out <a href=
"http://www.unicode.org/">Unicode Home Page</a> or some other
source of information for Unicode. Also, it is not my aim to
advocate the use of UTF-8 encoded strings in C++ programs; if you
want to handle UTF-8 encoded strings from C++, I am sure you have
good reasons for it.</p>
<h2 id="examples">Examples of use</h2>
<p>To illustrate the use of this utf8 library, we shall open a file
containing UTF-8 encoded text, check whether it starts with a byte order mark,
read each line into a <code>std::string</code>, check it for validity, convert the text to UTF-16,
and back to UTF-8:</p>
<pre>
#include &lt;fstream&gt;
#include &lt;iostream&gt;
#include &lt;string&gt;
#include &lt;vector&gt;
using namespace std;
int main(int argc, char** argv)
{
if (argc != 2) {
cout &lt;&lt; "\nUsage: docsample filename\n";
return 0;
}
const char* test_file_path = argv[1];
// Open the test file (must be UTF-8 encoded)
ifstream fs8(test_file_path);
if (!fs8.is_open()) {
cout &lt;&lt; "Could not open " &lt;&lt; test_file_path &lt;&lt; endl;
return 0;
}
// Read the first line of the file
unsigned line_count = 1;
string line;
if (!getline(fs8, line))
return 0;
// Look for utf-8 byte-order mark at the beginning
if (line.size() &gt; 2) {
if (utf8::is_bom(line.c_str()))
cout &lt;&lt; "There is a byte order mark at the beginning of the file\n";
}
// Play with all the lines in the file
do {
// check for invalid utf-8 (for a simple yes/no check, there is also utf8::is_valid function)
string::iterator end_it = utf8::find_invalid(line.begin(), line.end());
if (end_it != line.end()) {
cout &lt;&lt; "Invalid UTF-8 encoding detected at line " &lt;&lt; line_count &lt;&lt; "\n";
cout &lt;&lt; "This part is fine: " &lt;&lt; string(line.begin(), end_it) &lt;&lt; "\n";
}
// Get the line length (at least for the valid part)
int length = utf8::distance(line.begin(), end_it);
cout &lt;&lt; "Length of line " &lt;&lt; line_count &lt;&lt; " is " &lt;&lt; length &lt;&lt; "\n";
// Convert it to utf-16
vector&lt;unsigned short&gt; utf16line;
utf8::utf8to16(line.begin(), end_it, back_inserter(utf16line));
// And back to utf-8;
string utf8line;
utf8::utf16to8(utf16line.begin(), utf16line.end(), back_inserter(utf8line));
// Confirm that the conversion went OK:
if (utf8line != string(line.begin(), end_it))
cout &lt;&lt; "Error in UTF-16 conversion at line: " &lt;&lt; line_count &lt;&lt; "\n";
getline(fs8, line);
line_count++;
} while (!fs8.eof());
return 0;
}
</pre>
<p>In the previous code sample, we have seen the use of the following functions
from <code>utf8</code> namespace: first we used <code>is_bom</code>
function to detect UTF-8 byte order mark at the beginning of the
file; then for each line we performed a detection of invalid UTF-8 sequences with <code>find_invalid</code>;
the number of characters (more precisely - the number of Unicode code points) in each line was determined
with a use of <code>utf8::distance</code>; finally, we have converted each line to UTF-16 encoding with
<code>utf8to16</code> and back to UTF-8 with <code>utf16to8</code>.
</p>
<h2 id ="reference">Reference</h2>
<h3>Functions From utf8 Namespace</h3>
<h4>utf8::append</h4>
<p>Encodes a 32 bit code point as a UTF-8 sequence of octets and
appends the sequence to a UTF-8 string.</p>
<code>template &lt;typename octet_iterator&gt; octet_iterator
append(uint32_t cp, octet_iterator result);</code>
<p><code>cp</code>: A 32 bit integer representing a code point to
append to the sequence.<br>
<code>result</code>: An output iterator to the place in the
sequence where to append the code point.<br>
<u>Return value</u>: An iterator pointing to the place after the
newly appended sequence.</p>
<p>Example of use:</p>
<pre>
unsigned char u[5] = {0,0,0,0,0};
unsigned char* end = append(0x0448, u);
assert (u[0] == 0xd1 &amp;&amp; u[1] == 0x88 &amp;&amp; u[2] == 0 &amp;&amp; u[3] == 0 &amp;&amp; u[4] == 0);
</pre>
<p>Note that <code>append</code> does not allocate any memory - it
is the burden of the caller to make sure there is enough memory
allocated for the operation. To make things more interesting,
<code>append</code> can add anywhere between 1 and 4 octets to the
sequence. In practice, you would most often want to use
<code>std::back_inserter</code> to ensure that the necessary memory
is allocated.</p>
<p>In case of an invalid code point, a
<code>utf8::invalid_code_point</code> exception is thrown.</p>
<h4>utf8::next</h4>
<p>Given the iterator to the beginning of the UTF-8 sequence, it
returns the code point and moves the iterator to the next
position.</p>
<code>template &lt;typename octet_iterator&gt; uint32_t
next(octet_iterator&amp; it, octet_iterator end);</code>
<p><code>it</code>: a reference to an iterator pointing to the
beginning of an UTF-8 encoded code point. After the function
returns, it is incremented to point to the beginning of the next
code point.<br>
<code>end</code>: end of the UTF-8 sequence to be processed. If
<code>it</code> gets equal to <code>end</code> during the
extraction of a code point, an <code>utf8::not_enough_room</code>
exception is thrown.<br>
<u>Return value</u>: the 32 bit representation of the processed
UTF-8 code point.</p>
<p>Example of use:</p>
<pre>
char* twochars = "\xe6\x97\xa5\xd1\x88";
char* w = twochars;
int cp = next(w, twochars + 6);
assert (cp == 0x65e5);
assert (w == twochars + 3);
</pre>
<p>This function is typically used to iterate through a UTF-8
encoded string.</p>
<p>In case of an invalid UTF-8 sequence, a
<code>utf8::invalid_utf8</code> exception is thrown.</p>
<h4>utf8::previous</h4>
<p>Given a reference to an iterator pointing to an octet in a UTF-8
sequence, it decreases the iterator until it hits the beginning of
the previous UTF-8 encoded code point and returns the 32 bit
representation of the code point.</p>
<code>template &lt;typename octet_iterator&gt; uint32_t
previous(octet_iterator&amp; it, octet_iterator pass_start);</code>
<p><code>it</code>: a reference pointing to an octet within a UTF-8
encoded string. After the function returns, it is decremented to
point to the beginning of the previous code point.<br>
<code>pass_start</code>: an iterator to the point in the sequence
where the search for the beginning of a code point is aborted if no
result was reached. It is a safety measure to prevent passing the
beginning of the string in the search for a UTF-8 lead octet.<br>
<u>Return value</u>: the 32 bit representation of the previous code
point.</p>
<p>Example of use:</p>
<pre>
char* twochars = "\xe6\x97\xa5\xd1\x88";
char* w = twochars + 3;
int cp = previous (w, twochars - 1);
assert (cp == 0x65e5);
assert (w == twochars);
</pre>
<p>The primary purpose of this function is to iterate backwards
through a UTF-8 encoded string. Therefore, <code>it</code> will
typically point to the beginning of a code point, and
<code>pass_start</code> will point to the octet just before the
beginning of the string to ensure we don't go backwards too far.
<code>it</code> is decreased until it points to a lead UTF-8 octet,
and then the UTF-8 sequence beginning with that octet is decoded to
a 32 bit representation and returned.</p>
<p>In case <code>pass_start</code> is reached before a UTF-8 lead
octet is hit, or if an invalid UTF-8 sequence is started by the
lead octet, an <code>invalid_utf8</code> exception is thrown.</p>
<h4>utf8::advance</h4>
<p>Advances an iterator by the specified number of code points
within an UTF-8 sequence.</p>
<code>template &lt;typename octet_iterator, typename
distance_type&gt; void advance (octet_iterator&amp; it,
distance_type n, octet_iterator end);</code>
<p><code>it</code>: a reference to an iterator pointing to the
beginning of an UTF-8 encoded code point. After the function
returns, it is incremented to point to the nth following code
point.<br>
<code>n</code>: a positive integer that shows how many code points
we want to advance.<br>
<code>end</code>: end of the UTF-8 sequence to be processed. If
<code>it</code> gets equal to <code>end</code> during the
extraction of a code point, an <code>utf8::not_enough_room</code>
exception is thrown.<br></p>
<p>Example of use:</p>
<pre>
char* twochars = "\xe6\x97\xa5\xd1\x88";
char* w = twochars;
advance (w, 2, twochars + 6);
assert (w == twochars + 5);
</pre>
<p>This function works only "forward". In case of a negative
<code>n</code>, there is no effect.</p>
<p>In case of an invalid code point, a
<code>utf8::invalid_code_point</code> exception is thrown.</p>
<h4>utf8::distance</h4>
<p>Given the iterators to two UTF-8 encoded code points in a
sequence, returns the number of code points between them.</p>
<code>template &lt;typename octet_iterator&gt; typename
std::iterator_traits&lt;octet_iterator&gt;::difference_type
distance (octet_iterator first, octet_iterator last);</code>
<p><code>first</code>: an iterator to a beginning of a UTF-8
encoded code point.<br>
<code>last</code>: an iterator to a "post-end" of the last UTF-8
encoded code point in the sequence we are trying to determine the
length. It can be the beginning of a new code point, or not.<br>
<u>Return value</u>: the distance between the iterators, in code
points.</p>
<p>Example of use:</p>
<pre>
char* twochars = "\xe6\x97\xa5\xd1\x88";
size_t dist = utf8::distance(twochars, twochars + 5);
assert (dist == 2);
</pre>
<p>This function is used to find the length (in code points) of a
UTF-8 encoded string. The reason it is called <em>distance</em>,
rather than, say, <em>length</em> is mainly because developers are
used to <em>length</em> being an O(1) function. Computing the length
of an UTF-8 string is a linear operation, and it looked better to
model it after <code>std::distance</code> algorithm.</p>
<p>In case of an invalid UTF-8 sequence, a
<code>utf8::invalid_utf8</code> exception is thrown. If
<code>last</code> does not point to the past-the-end of a UTF-8
sequence, a <code>utf8::not_enough_room</code> exception is
thrown.</p>
<h4>utf8::utf16to8</h4>
<p>Converts a UTF-16 encoded string to UTF-8.</p>
<code>template &lt;typename u16bit_iterator, typename
octet_iterator&gt; octet_iterator utf16to8 (u16bit_iterator start,
u16bit_iterator end, octet_iterator result);</code>
<p><code>start</code>: an iterator pointing to the beginning of the
UTF-16 encoded string to convert.<br>
<code>end</code>: an iterator pointing to past-the-end of the
UTF-16 encoded string to convert.<br>
<code>result</code>: an output iterator to the place in the UTF-8
string where to append the result of conversion.<br>
<u>Return value</u>: An iterator pointing to the place after the appended UTF-8 string.</p>
<p>Example of use:</p>
<pre>
unsigned short utf16string[] = {0x41, 0x0448, 0x65e5, 0xd834, 0xdd1e};
vector&lt;unsigned char&gt; utf8result;
utf16to8(utf16string, utf16string + 5, back_inserter(utf8result));
assert (utf8result.size() == 10);
</pre>
<p>In case of an invalid UTF-16 sequence, a
<code>utf8::invalid_utf16</code> exception is thrown.</p>
<h4>utf8::utf8to16</h4>
<p>Converts a UTF-8 encoded string to UTF-16.</p>
<code>template &lt;typename u16bit_iterator, typename
octet_iterator&gt; u16bit_iterator utf8to16 (octet_iterator start,
octet_iterator end, u16bit_iterator result);</code>
<p><code>start</code>: an iterator pointing to the beginning of the
UTF-8 encoded string to convert.<br> <code>end</code>: an
iterator pointing to past-the-end of the UTF-8 encoded string to
convert.<br>
<code>result</code>: an output iterator to the place in the UTF-16
string where to append the result of conversion.<br>
<u>Return value</u>: An iterator pointing to the place after the appended UTF-16 string.</p>
<p>Example of use:</p>
<pre>
char utf8_with_surrogates[] = "\xe6\x97\xa5\xd1\x88\xf0\x9d\x84\x9e";
vector &lt;unsigned short&gt; utf16result;
utf8to16(utf8_with_surrogates, utf8_with_surrogates + 9, back_inserter(utf16result));
assert (utf16result.size() == 4);
assert (utf16result[2] == 0xd834);
assert (utf16result[3] == 0xdd1e);
</pre>
<p>In case of an invalid UTF-8 sequence, a
<code>utf8::invalid_utf8</code> exception is thrown. If
<code>end</code> does not point to the past-the-end of a UTF-8
sequence, a <code>utf8::not_enough_room</code> exception is
thrown.</p>
<h4>utf8::utf32to8</h4>
<p>Converts a UTF-32 encoded string to UTF-8.</p>
<code>template &lt;typename octet_iterator, typename
u32bit_iterator&gt; octet_iterator utf32to8 (u32bit_iterator start,
u32bit_iterator end, octet_iterator result);</code>
<p><code>start</code>: an iterator pointing to the beginning of the
UTF-32 encoded string to convert.<br>
<code>end</code>: an iterator pointing to past-the-end of the
UTF-32 encoded string to convert.<br>
<code>result</code>: an output iterator to the place in the UTF-8
string where to append the result of conversion.<br>
<u>Return value</u>: An iterator pointing to the place after the appended UTF-8 string.</p>
<p>Example of use:</p>
<pre>
int utf32string[] = {0x448, 0x65E5, 0x10346, 0};
vector&lt;unsigned char&gt; utf8result;
utf32to8(utf32string, utf32string + 3, back_inserter(utf8result));
assert (utf8result.size() == 9);
</pre>
<p>In case of invalid UTF-32 string, a
<code>utf8::invalid_code_point</code> exception is thrown.</p>
<h4>utf8::utf8to32</h4>
<p>Converts a UTF-8 encoded string to UTF-32.</p>
<code>template &lt;typename octet_iterator, typename
u32bit_iterator&gt; u32bit_iterator utf8to32 (octet_iterator start,
octet_iterator end, u32bit_iterator result);</code>
<p><code>start</code>: an iterator pointing to the beginning of the
UTF-8 encoded string to convert.<br>
<code>end</code>: an iterator pointing to past-the-end of the UTF-8
encoded string to convert.<br>
<code>result</code>: an output iterator to the place in the UTF-32
string where to append the result of conversion.<br>
<u>Return value</u>: An iterator pointing to the place after the appended UTF-32 string.</p>
<p>Example of use:</p>
<pre>
char* twochars = "\xe6\x97\xa5\xd1\x88";
vector&lt;int&gt; utf32result;
utf8to32(twochars, twochars + 5, back_inserter(utf32result));
assert (utf32result.size() == 2);
</pre>
<p>In case of an invalid UTF-8 sequence, a
<code>utf8::invalid_utf8</code> exception is thrown. If
<code>end</code> does not point to the past-the-end of a UTF-8
sequence, a <code>utf8::not_enough_room</code> exception is
thrown.</p>
<h4>utf8::find_invalid</h4>
<p>Detects an invalid sequence within a UTF-8 string.</p>
<code>template &lt;typename octet_iterator&gt; octet_iterator
find_invalid(octet_iterator start, octet_iterator end);</code>
<p><code>start</code>: an iterator pointing to the beginning of the
UTF-8 string to test for validity.<br>
<code>end</code>: an iterator pointing to past-the-end of the UTF-8
string to test for validity.<br>
<u>Return value</u>: an iterator pointing to the first invalid
octet in the UTF-8 string. In case none were found, equals
<code>end</code>.</p>
<p>Example of use:</p>
<pre>
char utf_invalid[] = "\xe6\x97\xa5\xd1\x88\xfa";
char* invalid = find_invalid(utf_invalid, utf_invalid + 6);
assert (invalid == utf_invalid + 5);
</pre>
<p>This function is typically used to make sure a UTF-8 string is
valid before processing it with other functions. It is especially
important to call it before doing any of the <em>unchecked</em>
operations on it.</p>
<h4>utf8::is_valid</h4>
<p>Checks whether a sequence of octets is a valid UTF-8 string.</p>
<code>template &lt;typename octet_iterator&gt; bool
is_valid(octet_iterator start, octet_iterator end);</code>
<p><code>start</code>: an iterator pointing to the beginning of the
UTF-8 string to test for validity.<br>
<code>end</code>: an iterator pointing to past-the-end of the UTF-8
string to test for validity.<br>
<u>Return value</u>: <code>true</code> if the sequence is a valid
UTF-8 string; <code>false</code> if not.</p>
<p>Example of use:</p>
<pre>
char utf_invalid[] = "\xe6\x97\xa5\xd1\x88\xfa";
bool bvalid = is_valid(utf_invalid, utf_invalid + 6);
assert (bvalid == false);
</pre>
<p><code>is_valid</code> is a shorthand for
<code>find_invalid(start, end) == end;</code>. You may want to use
it to make sure that a byte sequence is a valid UTF-8 string without
the need to know where it fails if it is not valid.</p>
<h4>utf8::is_bom</h4>
<p>Checks whether a sequence of three octets is a UTF-8 byte order
mark (BOM)</p>
<code>template &lt;typename octet_iterator&gt; bool is_bom
(octet_iterator it);</code>
<p><code>it</code>: beginning of the 3-octet sequence to check.<br>
<u>Return value</u>: <code>true</code> if the sequence is UTF-8
byte order mark; <code>false</code> if not.</p>
<p>Example of use:</p>
<pre>
unsigned char byte_order_mark[] = {0xef, 0xbb, 0xbf};
bool bbom = is_bom(byte_order_mark);
assert (bbom == true);
</pre>
<p>The typical use of this function is to check the first three
bytes of a file. If they form the UTF-8 BOM, we want to skip them
before processing the actual UTF-8 encoded text.</p>
<h3>Functions From utf8::unchecked Namespace</h3>
<h4>utf8::unchecked::append</h4>
<p>Encodes a 32 bit code point as a UTF-8 sequence of octets and
appends the sequence to a UTF-8 string.</p>
<code>template &lt;typename octet_iterator&gt; octet_iterator
append(uint32_t cp, octet_iterator result);</code>
<p><code>cp</code>: A 32 bit integer representing a code point to
append to the sequence.<br>
<code>result</code>: An output iterator to the place in the
sequence where to append the code point.<br>
<u>Return value</u>: An iterator pointing to the place after the
newly appended sequence.</p>
<p>Example of use:</p>
<pre>
unsigned char u[5] = {0,0,0,0,0};
unsigned char* end = unchecked::append(0x0448, u);
assert (u[0] == 0xd1 &amp;&amp; u[1] == 0x88 &amp;&amp; u[2] == 0 &amp;&amp; u[3] == 0 &amp;&amp; u[4] == 0);
</pre>
<p>This is a quicker but less safe version of
<code>utf8::append</code>. It does not check for validity of the
supplied code point, and may produce an invalid UTF-8 sequence.</p>
<h4>utf8::unchecked::next</h4>
<p>Given the iterator to the beginning of a UTF-8 sequence, it
returns the code point and moves the iterator to the next
position.</p>
<code>template &lt;typename octet_iterator&gt; uint32_t
next(octet_iterator&amp; it);</code>
<p><code>it</code>: a reference to an iterator pointing to the
beginning of an UTF-8 encoded code point. After the function
returns, it is incremented to point to the beginning of the next
code point.<br>
<u>Return value</u>: the 32 bit representation of the processed
UTF-8 code point.</p>
<p>Example of use:</p>
<pre>
char* twochars = "\xe6\x97\xa5\xd1\x88";
char* w = twochars;
int cp = unchecked::next(w);
assert (cp == 0x65e5);
assert (w == twochars + 3);
</pre>
<p>This is a quicker but less safe version of
<code>utf8::next</code>. It does not check for validity of the
supplied UTF-8 sequence.</p>
<h4>utf8::unchecked::previous</h4>
<p>Given a reference to an iterator pointing to an octet in a UTF-8
sequence, it decreases the iterator until it hits the beginning of
the previous UTF-8 encoded code point and returns the 32 bit
representation of the code point.</p>
<code>template &lt;typename octet_iterator&gt; uint32_t
previous(octet_iterator&amp; it);</code>
<p><code>it</code>: a reference pointing to an octet within a UTF-8
encoded string. After the function returns, it is decremented to
point to the beginning of the previous code point.<br>
<u>Return value</u>: the 32 bit representation of the previous code
point.</p>
<p>Example of use:</p>
<pre>
char* twochars = "\xe6\x97\xa5\xd1\x88";
char* w = twochars + 3;
int cp = unchecked::previous (w);
assert (cp == 0x65e5);
assert (w == twochars);
</pre>
<p>This is a quicker but less safe version of
<code>utf8::previous</code>. It does not check for validity of the
supplied UTF-8 sequence and offers no boundary checking.</p>
<h4>utf8::unchecked::advance</h4>
<p>Advances an iterator by the specified number of code points
within an UTF-8 sequence.</p>
<code>template &lt;typename octet_iterator, typename
distance_type&gt; void advance (octet_iterator&amp; it,
distance_type n);</code>
<p><code>it</code>: a reference to an iterator pointing to the
beginning of an UTF-8 encoded code point. After the function
returns, it is incremented to point to the nth following code
point.<br>
<code>n</code>: a positive integer that shows how many code points
we want to advance.<br></p>
<p>Example of use:</p>
<pre>
char* twochars = "\xe6\x97\xa5\xd1\x88";
char* w = twochars;
unchecked::advance (w, 2);
assert (w == twochars + 5);
</pre>
<p>This function works only "forward". In case of a negative
<code>n</code>, there is no effect.</p>
<p>This is a quicker but less safe version of
<code>utf8::advance</code>. It does not check for validity of the
supplied UTF-8 sequence and offers no boundary checking.</p>
<h4>utf8::unchecked::distance</h4>
<p>Given the iterators to two UTF-8 encoded code points in a
sequence, returns the number of code points between them.</p>
<code>template &lt;typename octet_iterator&gt; typename
std::iterator_traits&lt;octet_iterator&gt;::difference_type
distance (octet_iterator first, octet_iterator last);</code>
<p><code>first</code>: an iterator to a beginning of a UTF-8
encoded code point.<br>
<code>last</code>: an iterator to a "post-end" of the last UTF-8
encoded code point in the sequence we are trying to determine the
length. It can be the beginning of a new code point, or not.<br>
<u>Return value</u>: the distance between the iterators, in code
points.</p>
<p>Example of use:</p>
<pre>
char* twochars = "\xe6\x97\xa5\xd1\x88";
size_t dist = utf8::unchecked::distance(twochars, twochars + 5);
assert (dist == 2);
</pre>
<p>This is a quicker but less safe version of
<code>utf8::distance</code>. It does not check for validity of the
supplied UTF-8 sequence.</p>
<h4>utf8::unchecked::utf16to8</h4>
<p>Converts a UTF-16 encoded string to UTF-8.</p>
<code>template &lt;typename u16bit_iterator, typename
octet_iterator&gt; octet_iterator utf16to8 (u16bit_iterator start,
u16bit_iterator end, octet_iterator result);</code>
<p><code>start</code>: an iterator pointing to the beginning of the
UTF-16 encoded string to convert.<br>
<code>end</code>: an iterator pointing to past-the-end of the
UTF-16 encoded string to convert.<br>
<code>result</code>: an output iterator to the place in the UTF-8
string where to append the result of conversion.<br>
<u>Return value</u>: An iterator pointing to the place after the appended UTF-8 string.</p>
<p>Example of use:</p>
<pre>
unsigned short utf16string[] = {0x41, 0x0448, 0x65e5, 0xd834, 0xdd1e};
vector&lt;unsigned char&gt; utf8result;
unchecked::utf16to8(utf16string, utf16string + 5, back_inserter(utf8result));
assert (utf8result.size() == 10);
</pre>
<p>This is a quicker but less safe version of
<code>utf8::utf16to8</code>. It does not check for validity of the
supplied UTF-16 sequence.</p>
<h4>utf8::unchecked::utf8to16</h4>
<p>Converts an UTF-8 encoded string to UTF-16</p>
<code>template &lt;typename u16bit_iterator, typename
octet_iterator&gt; u16bit_iterator utf8to16 (octet_iterator start,
octet_iterator end, u16bit_iterator result);</code>
<p><code>start</code>: an iterator pointing to the beginning of the
UTF-8 encoded string to convert.<br>
<code>end</code>: an iterator pointing to past-the-end of the UTF-8
encoded string to convert.<br>
<code>result</code>: an output iterator to the place in the UTF-16
string where to append the result of conversion.<br>
<u>Return value</u>: An iterator pointing to the place after the appended UTF-16 string.
</p>
<p>Example of use:</p>
<pre>
char utf8_with_surrogates[] = "\xe6\x97\xa5\xd1\x88\xf0\x9d\x84\x9e";
vector &lt;unsigned short&gt; utf16result;
unchecked::utf8to16(utf8_with_surrogates, utf8_with_surrogates + 9, back_inserter(utf16result));
assert (utf16result.size() == 4);
assert (utf16result[2] == 0xd834);
assert (utf16result[3] == 0xdd1e);
</pre>
<p>This is a quicker but less safe version of
<code>utf8::utf8to16</code>. It does not check for validity of the
supplied UTF-8 sequence.</p>
<h4>utf8::unchecked::utf32to8</h4>
<p>Converts a UTF-32 encoded string to UTF-8.</p>
<code>template &lt;typename octet_iterator, typename
u32bit_iterator&gt; octet_iterator utf32to8 (u32bit_iterator start,
u32bit_iterator end, octet_iterator result);</code>
<p><code>start</code>: an iterator pointing to the beginning of the
UTF-32 encoded string to convert.<br>
<code>end</code>: an iterator pointing to past-the-end of the
UTF-32 encoded string to convert.<br>
<code>result</code>: an output iterator to the place in the UTF-8
string where to append the result of conversion.<br>
<u>Return value</u>: An iterator pointing to the place after the appended UTF-8 string.
</p>
<p>Example of use:</p>
<pre>
int utf32string[] = {0x448, 0x65E5, 0x10346, 0};
vector&lt;unsigned char&gt; utf8result;
unchecked::utf32to8(utf32string, utf32string + 3, back_inserter(utf8result));
assert (utf8result.size() == 9);
</pre>
<p>This is a quicker but less safe version of
<code>utf8::utf32to8</code>. It does not check for validity of the
supplied UTF-32 sequence.</p>
<h4>utf8::unchecked::utf8to32</h4>
<p>Converts a UTF-8 encoded string to UTF-32.</p>
<code>template &lt;typename octet_iterator, typename
u32bit_iterator&gt; u32bit_iterator utf8to32 (octet_iterator start,
octet_iterator end, u32bit_iterator result);</code>
<p><code>start</code>: an iterator pointing to the beginning of the
UTF-8 encoded string to convert.<br>
<code>end</code>: an iterator pointing to past-the-end of the UTF-8
encoded string to convert.<br>
<code>result</code>: an output iterator to the place in the UTF-32
string where to append the result of conversion.<br>
<u>Return value</u>: An iterator pointing to the place after the appended UTF-32 string.
</p>
<p>Example of use:</p>
<pre>
char* twochars = "\xe6\x97\xa5\xd1\x88";
vector&lt;int&gt; utf32result;
unchecked::utf8to32(twochars, twochars + 5, back_inserter(utf32result));
assert (utf32result.size() == 2);
</pre>
<p>This is a quicker but less safe version of
<code>utf8::utf8to32</code>. It does not check for validity of the
supplied UTF-8 sequence.</p>
<h2 id="points">Points of interest</h2>
<h4>Design goals and decisions</h4>
<p>The library was designed to be:</p>
<ol>
<li>Generic: for better or worse, there are many C++ string classes
out there, and the library should work with as many of them as
possible.</li>
<li>Portable: the library should be portable both across different
platforms and compilers. The only non-portable code is a small
section that declares unsigned integers of different sizes: three
typedefs. They can be changed by the users of the library if they
don't match their platform. The default setting should work for
Windows (both 32 and 64 bit), and most 32 bit and 64 bit Unix
derivatives.</li>
<li>Lightweight: follow the "pay only for what you use"
guideline.</li>
<li>Unintrusive: avoid forcing any particular design or even
programming style on the user. This is a library, not a
framework.</li>
</ol>
<h4>Alternatives</h4>
<p>In case you want to look into other means of working with UTF-8
strings from C++, here is the list of solutions I am aware of:</p>
<ol>
<li><a href="http://icu.sourceforge.net/">ICU Library</a>. It is
very powerful, complete, feature-rich, mature, and widely used.
Also big, intrusive, non-generic, and doesn't play well with the
Standard Library. I definitely recommend looking at ICU even if
you don't plan to use it.</li>
<li><a href=
"http://www.gtkmm.org/gtkmm2/docs/tutorial/html/ch03s04.html">Glib::ustring</a>.
A class specifically made to work with UTF-8 strings, and also feel
like <code>std::string</code>. If you prefer to have yet another
string class in your code, it may be worth a look. Be aware of the
licensing issues, though.</li>
<li>Platform dependent solutions: Windows and POSIX have functions
to convert strings from one encoding to another. That is only a
subset of what my library offers, but if that is all you need it
may be good enough, especially given the fact that these functions
are mature and tested in production.</li>
</ol>
<h2 id="conclusion">Conclusion</h2>
<p>Until Unicode becomes officially recognized by the C++ Standard
Library, we need to use other means to work with UTF-8 strings.
Template functions I describe in this article may be a good step in
this direction.</p>
<h2 id="references">References</h2>
<ol>
<li><a href="http://www.unicode.org/">The Unicode
Consortium</a>.</li>
<li><a href="http://icu.sourceforge.net/">ICU Library</a>.</li>
<li><a href="http://en.wikipedia.org/wiki/UTF-8">UTF-8 at
Wikipedia</a></li>
<li><a href="http://www.cl.cam.ac.uk/~mgk25/unicode.html">UTF-8 and Unicode FAQ for Unix/Linux</a></li>
</ol>
</body>
</html>

View file

@ -1,551 +0,0 @@
// Copyright (c) 2006 Nemanja Trifunovic
/*
Permission is hereby granted, free of charge, to any person or organization
obtaining a copy of the software and accompanying documentation covered by
this license (the "Software") to use, reproduce, display, distribute,
execute, and transmit the Software, and to prepare derivative works of the
Software, and to permit third-parties to whom the Software is furnished to
do so, all subject to the following:
The copyright notices in the Software and this entire statement, including
the above license grant, this restriction and the following disclaimer,
must be included in all copies of the Software, in whole or in part, and
all derivative works of the Software, unless such copies or derivative
works are solely in the form of machine-executable object code generated by
a source language processor.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT
SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE
FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE,
ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
DEALINGS IN THE SOFTWARE.
*/
#ifndef UTF8_FOR_CPP_2675DCD0_9480_4c0c_B92A_CC14C027B731
#define UTF8_FOR_CPP_2675DCD0_9480_4c0c_B92A_CC14C027B731
#include <iterator>
#include <exception>
namespace utf8
{
// The typedefs for 8-bit, 16-bit and 32-bit unsigned integers
// You may need to change them to match your system.
// These typedefs have the same names as ones from cstdint, or boost/cstdint
typedef unsigned char uint8_t;
typedef unsigned short uint16_t;
typedef unsigned int uint32_t;
// Exceptions that may be thrown from the library functions.
// Thrown when a value is not an acceptable Unicode code point.
class invalid_code_point : public std::exception {
    uint32_t cp;    // the rejected value
public:
    invalid_code_point(uint32_t cp) : cp(cp) {}
    // const + throw() are required to override std::exception::what();
    // without them a handler catching std::exception& gets the base-class text.
    virtual const char* what() const throw() { return "Invalid code point"; }
    // Returns the value that was rejected.
    uint32_t code_point() const {return cp;}
};
// Thrown when an octet sequence is not well-formed UTF-8.
class invalid_utf8 : public std::exception {
    uint8_t u8;     // the octet reported by the thrower
public:
    invalid_utf8 (uint8_t u) : u8(u) {}
    // const + throw() are required to override std::exception::what();
    // without them a handler catching std::exception& gets the base-class text.
    virtual const char* what() const throw() { return "Invalid UTF-8"; }
    // Returns the octet supplied when the exception was created.
    uint8_t utf8_octet() const {return u8;}
};
// Thrown when a sequence of 16-bit units is not well-formed UTF-16.
class invalid_utf16 : public std::exception {
    uint16_t u16;   // the 16-bit unit reported by the thrower
public:
    invalid_utf16 (uint16_t u) : u16(u) {}
    // const + throw() are required to override std::exception::what();
    // without them a handler catching std::exception& gets the base-class text.
    virtual const char* what() const throw() { return "Invalid UTF-16"; }
    // Returns the 16-bit unit supplied when the exception was created.
    uint16_t utf16_word() const {return u16;}
};
// Thrown when a sequence would extend past the end of the supplied range.
class not_enough_room : public std::exception {
public:
    // const + throw() are required to override std::exception::what();
    // without them a handler catching std::exception& gets the base-class text.
    virtual const char* what() const throw() { return "Not enough space"; }
};
// Helper code - not intended to be directly called by the library users. May be changed at any time
namespace internal
{
// Unicode constants
// Leading (high) surrogates: 0xd800 - 0xdbff
// Trailing (low) surrogates: 0xdc00 - 0xdfff
const uint16_t LEAD_SURROGATE_MIN = 0xd800u;
const uint16_t LEAD_SURROGATE_MAX = 0xdbffu;
const uint16_t TRAIL_SURROGATE_MIN = 0xdc00u;
const uint16_t TRAIL_SURROGATE_MAX = 0xdfffu;
// Added to (code point >> 10) by the UTF-8 -> UTF-16 converters to obtain
// the lead surrogate of a pair.
const uint16_t LEAD_OFFSET = LEAD_SURROGATE_MIN - (0x10000 >> 10);
// Added to (lead << 10) + trail by the UTF-16 -> UTF-8 converters to recover
// the code point; the subtraction deliberately wraps in unsigned arithmetic.
const uint32_t SURROGATE_OFFSET = 0x10000u - (LEAD_SURROGATE_MIN << 10) - TRAIL_SURROGATE_MIN;
// Maximum valid value for a Unicode code point
const uint32_t CODE_POINT_MAX = 0x0010ffffu;
// Keeps only the low eight bits of `oc`, so that signed char input
// yields the octet value 0..255 rather than a sign-extended number.
template<typename octet_type>
inline uint8_t mask8(octet_type oc)
{
    return static_cast<uint8_t>(oc & 0xff);
}
// Keeps only the low sixteen bits of `oc`, so that signed 16-bit input
// yields the unit value 0..65535 rather than a sign-extended number.
template<typename u16_type>
inline uint16_t mask16(u16_type oc)
{
    return static_cast<uint16_t>(oc & 0xffff);
}
// True when `oc` has the bit pattern 10xxxxxx, i.e. it is a UTF-8
// continuation (trail) octet.
template<typename octet_type>
inline bool is_trail(octet_type oc)
{
    return (mask8(oc) & 0xc0) == 0x80;
}
// True when `cp` lies anywhere in the surrogate range, lead or trail
// (LEAD_SURROGATE_MIN through TRAIL_SURROGATE_MAX inclusive).
template <typename u16>
inline bool is_surrogate(u16 cp)
{
    return !(cp < LEAD_SURROGATE_MIN || cp > TRAIL_SURROGATE_MAX);
}
// True when `cp` is accepted by the library: not above CODE_POINT_MAX,
// not a surrogate, and neither 0xfffe nor 0xffff.
template <typename u32>
inline bool is_code_point_valid(u32 cp)
{
    if (cp > CODE_POINT_MAX)
        return false;
    if (is_surrogate(cp))
        return false;
    return cp != 0xfffe && cp != 0xffff;
}
// Returns the number of octets announced by the lead octet *lead_it
// (1 to 4), or 0 when that octet cannot start a sequence. Only the
// lead octet is inspected.
template <typename octet_iterator>
inline typename std::iterator_traits<octet_iterator>::difference_type
sequence_length(octet_iterator lead_it)
{
    const uint8_t first = mask8(*lead_it);
    if (first < 0x80)               // 0xxxxxxx
        return 1;
    if ((first & 0xe0) == 0xc0)     // 110xxxxx
        return 2;
    if ((first & 0xf0) == 0xe0)     // 1110xxxx
        return 3;
    if ((first & 0xf8) == 0xf0)     // 11110xxx
        return 4;
    return 0;
}
// Result codes returned by validate_next; OK means a sequence was decoded.
enum utf_error {OK, NOT_ENOUGH_ROOM, INVALID_LEAD, INCOMPLETE_SEQUENCE, OVERLONG_SEQUENCE, INVALID_CODE_POINT};
// Decodes the UTF-8 sequence that starts at `it` and checks that it is valid.
//
// it:         positioned on the lead octet. Advanced past the sequence when
//             OK is returned; left on the lead octet for every error.
// end:        end of the readable range.
// code_point: optional. Receives the decoded value on OK, and also on
//             OVERLONG_SEQUENCE and INVALID_CODE_POINT so callers can report it.
// Returns OK or the utf_error describing why the sequence was rejected.
// The iterator type must support `end - it` and `--it`.
template <typename octet_iterator>
utf_error validate_next(octet_iterator& it, octet_iterator end, uint32_t* code_point = 0)
{
    // An empty range has no lead octet; dereferencing `end` would be undefined.
    if (it == end)
        return NOT_ENOUGH_ROOM;

    uint32_t cp = mask8(*it);
    // Check the lead octet
    typedef typename std::iterator_traits<octet_iterator>::difference_type octet_difference_type;
    octet_difference_type length = sequence_length(it);

    // "Shortcut" for ASCII characters
    if (length == 1) {
        if (end - it > 0) {
            if (code_point)
                *code_point = cp;
            ++it;
            return OK;
        }
        else
            return NOT_ENOUGH_ROOM;
    }

    // Do we have enough memory?
    if (end - it < length)
        return NOT_ENOUGH_ROOM;

    // Check trail octets and calculate the code point.
    // Every failure path steps `it` back to the lead octet.
    switch (length) {
        case 0:
            return INVALID_LEAD;
            break;
        case 2:
            if (is_trail(*(++it))) {
                cp = ((cp << 6) & 0x7ff) + ((*it) & 0x3f);
            }
            else {
                --it;
                return INCOMPLETE_SEQUENCE;
            }
            break;
        case 3:
            if (is_trail(*(++it))) {
                cp = ((cp << 12) & 0xffff) + ((mask8(*it) << 6) & 0xfff);
                if (is_trail(*(++it))) {
                    cp += (*it) & 0x3f;
                }
                else {
                    --it; --it;
                    return INCOMPLETE_SEQUENCE;
                }
            }
            else {
                --it;
                return INCOMPLETE_SEQUENCE;
            }
            break;
        case 4:
            if (is_trail(*(++it))) {
                cp = ((cp << 18) & 0x1fffff) + ((mask8(*it) << 12) & 0x3ffff);
                if (is_trail(*(++it))) {
                    cp += (mask8(*it) << 6) & 0xfff;
                    if (is_trail(*(++it))) {
                        cp += (*it) & 0x3f;
                    }
                    else {
                        --it; --it; --it;
                        return INCOMPLETE_SEQUENCE;
                    }
                }
                else {
                    --it; --it;
                    return INCOMPLETE_SEQUENCE;
                }
            }
            else {
                --it;
                return INCOMPLETE_SEQUENCE;
            }
            break;
    }

    // Is the code point valid?
    if (!is_code_point_valid(cp)) {
        // Hand the offending value back so the caller can report it.
        if (code_point)
            *code_point = cp;
        for (octet_difference_type i = 0; i < length - 1; ++i)
            --it;
        return INVALID_CODE_POINT;
    }

    if (code_point)
        *code_point = cp;

    // Reject sequences that use more octets than the code point needs.
    if (cp < 0x80) {
        if (length != 1) {
            for (octet_difference_type i = 0; i < length - 1; ++i)
                --it;
            return OVERLONG_SEQUENCE;
        }
    }
    else if (cp < 0x800) {
        if (length != 2) {
            for (octet_difference_type i = 0; i < length - 1; ++i)
                --it;
            return OVERLONG_SEQUENCE;
        }
    }
    else if (cp < 0x10000) {
        if (length != 3) {
            for (octet_difference_type i = 0; i < length - 1; ++i)
                --it;
            return OVERLONG_SEQUENCE;
        }
    }

    // `it` is on the last octet of the sequence; step past it.
    ++it;
    return OK;
}
} // namespace internal
/// The library API - functions intended to be called by the users
// Byte order mark: the three octets that is_bom compares its input against.
const uint8_t bom[] = {0xef, 0xbb, 0xbf};
// Scans [start, end) one code point at a time and returns an iterator to
// the first sequence that fails validation, or `end` if none fails.
template <typename octet_iterator>
octet_iterator find_invalid(octet_iterator start, octet_iterator end)
{
    octet_iterator cursor = start;
    while (cursor != end) {
        // validate_next advances `cursor` only when the sequence is good.
        if (internal::validate_next(cursor, end) != internal::OK)
            break;
    }
    return cursor;
}
// True when find_invalid reports no invalid sequence in [start, end).
template <typename octet_iterator>
bool is_valid(octet_iterator start, octet_iterator end)
{
    octet_iterator first_bad = find_invalid(start, end);
    return first_bad == end;
}
// True when the three octets starting at `it` equal the byte order mark.
// Stops reading at the first octet that does not match.
template <typename octet_iterator>
bool is_bom (octet_iterator it)
{
    if (internal::mask8(*it++) != bom[0])
        return false;
    if (internal::mask8(*it++) != bom[1])
        return false;
    return internal::mask8(*it) == bom[2];
}
// Encodes `cp` as UTF-8 and writes the octets through `result`.
// Returns the iterator positioned after the last octet written.
// Throws invalid_code_point when `cp` fails is_code_point_valid.
template <typename octet_iterator>
octet_iterator append(uint32_t cp, octet_iterator result)
{
    if (!internal::is_code_point_valid(cp))
        throw invalid_code_point(cp);

    if (cp < 0x80) {                    // one octet
        *(result++) = static_cast<uint8_t>(cp);
        return result;
    }
    if (cp < 0x800) {                   // two octets
        *(result++) = static_cast<uint8_t>((cp >> 6) | 0xc0);
        *(result++) = static_cast<uint8_t>((cp & 0x3f) | 0x80);
        return result;
    }
    if (cp < 0x10000) {                 // three octets
        *(result++) = static_cast<uint8_t>((cp >> 12) | 0xe0);
        *(result++) = static_cast<uint8_t>(((cp >> 6) & 0x3f) | 0x80);
        *(result++) = static_cast<uint8_t>((cp & 0x3f) | 0x80);
        return result;
    }
    if (cp > internal::CODE_POINT_MAX)
        throw invalid_code_point(cp);

    // four octets
    *(result++) = static_cast<uint8_t>((cp >> 18) | 0xf0);
    *(result++) = static_cast<uint8_t>(((cp >> 12) & 0x3f) | 0x80);
    *(result++) = static_cast<uint8_t>(((cp >> 6) & 0x3f) | 0x80);
    *(result++) = static_cast<uint8_t>((cp & 0x3f) | 0x80);
    return result;
}
template <typename octet_iterator>
uint32_t next(octet_iterator& it, octet_iterator end)
{
uint32_t cp = 0;
internal::utf_error err_code = internal::validate_next(it, end, &cp);
switch (err_code) {
case internal::OK :
break;
case internal::NOT_ENOUGH_ROOM :
throw not_enough_room();
case internal::INVALID_LEAD :
case internal::INCOMPLETE_SEQUENCE :
case internal::OVERLONG_SEQUENCE :
throw invalid_utf8(*it);
case internal::INVALID_CODE_POINT :
throw invalid_code_point(cp);
}
return cp;
}
// Steps `it` backwards to the lead octet of the preceding code point and
// returns that code point, decoded with the checked next().
// Throws invalid_utf8 if the backward scan reaches `pass_start` while
// still on a trail octet.
template <typename octet_iterator>
uint32_t previous(octet_iterator& it, octet_iterator pass_start)
{
    // The original position bounds the sequence that is decoded below.
    octet_iterator sequence_end = it;

    --it;
    while (internal::is_trail(*it)) {
        if (it == pass_start)
            throw invalid_utf8(*it); // error - no lead byte in the sequence
        --it;
    }

    // Decode from a copy so `it` stays on the lead octet.
    octet_iterator lead = it;
    return next(lead, sequence_end);
}
// Moves `it` forward by `n` code points using the checked next(), so any
// exception next() throws propagates. Does nothing when n is not positive.
template <typename octet_iterator, typename distance_type>
void advance (octet_iterator& it, distance_type n, octet_iterator end)
{
    distance_type stepped = 0;
    while (stepped < n) {
        next(it, end);
        ++stepped;
    }
}
// Counts the code points decoded by the checked next() while walking
// from `first` up to `last`.
template <typename octet_iterator>
typename std::iterator_traits<octet_iterator>::difference_type
distance (octet_iterator first, octet_iterator last)
{
    typename std::iterator_traits<octet_iterator>::difference_type count = 0;
    while (first < last) {
        next(first, last);
        ++count;
    }
    return count;
}
// Converts the UTF-16 units in [start, end) to UTF-8 written through `result`.
// Returns the output iterator positioned after the last octet written.
// Throws invalid_utf16 for a trail surrogate with no preceding lead, a lead
// surrogate at the end of input, or a lead followed by a non-trail unit;
// append() may throw invalid_code_point for other rejected values.
template <typename u16bit_iterator, typename octet_iterator>
octet_iterator utf16to8 (u16bit_iterator start, u16bit_iterator end, octet_iterator result)
{
    while (start != end) {
        uint32_t cp = internal::mask16(*start++);
        // Take care of surrogate pairs first
        if (internal::is_surrogate(cp)) {
            // Only a lead surrogate may open a pair.
            if (cp > internal::LEAD_SURROGATE_MAX)
                throw invalid_utf16(static_cast<uint16_t>(cp));
            // Lead surrogate is the last unit: report the lead itself.
            // `start` equals `end` here and must not be dereferenced.
            if (start == end)
                throw invalid_utf16(static_cast<uint16_t>(cp));

            uint32_t trail_surrogate = internal::mask16(*start++);
            if (trail_surrogate < internal::TRAIL_SURROGATE_MIN || trail_surrogate > internal::TRAIL_SURROGATE_MAX)
                throw invalid_utf16(static_cast<uint16_t>(trail_surrogate));
            cp = (cp << 10) + trail_surrogate + internal::SURROGATE_OFFSET;
        }
        result = append(cp, result);
    }
    return result;
}
// Converts the UTF-8 octets in [start, end) to UTF-16 written through
// `result`, decoding with the checked next(). Returns the output iterator
// positioned after the last unit written.
template <typename u16bit_iterator, typename octet_iterator>
u16bit_iterator utf8to16 (octet_iterator start, octet_iterator end, u16bit_iterator result)
{
    while (start != end) {
        const uint32_t cp = next(start, end);
        if (cp <= 0xffff) {
            // Fits in a single 16-bit unit.
            *result++ = static_cast<uint16_t>(cp);
            continue;
        }
        // Above 0xffff: emit a lead/trail surrogate pair.
        *result++ = static_cast<uint16_t>((cp >> 10) + internal::LEAD_OFFSET);
        *result++ = static_cast<uint16_t>((cp & 0x3ff) + internal::TRAIL_SURROGATE_MIN);
    }
    return result;
}
// Converts the code points in [start, end) to UTF-8 by passing each one
// to the checked append(). Returns the output iterator after the last octet.
template <typename octet_iterator, typename u32bit_iterator>
octet_iterator utf32to8 (u32bit_iterator start, u32bit_iterator end, octet_iterator result)
{
    for (; start != end; ++start)
        result = append(*start, result);
    return result;
}
// Converts the UTF-8 octets in [start, end) to code points written through
// `result`, decoding with the checked next(). Returns the output iterator
// positioned after the last code point written.
template <typename octet_iterator, typename u32bit_iterator>
u32bit_iterator utf8to32 (octet_iterator start, octet_iterator end, u32bit_iterator result)
{
    while (start < end) {
        const uint32_t cp = next(start, end);
        *result = cp;
        ++result;
    }
    return result;
}
namespace unchecked
{
// Encodes `cp` as UTF-8 and writes the octets through `result` without
// validating `cp`. Returns the iterator after the last octet written.
// Values of 0x10000 and above always take the four-octet form.
template <typename octet_iterator>
octet_iterator append(uint32_t cp, octet_iterator result)
{
    if (cp < 0x80) {
        // Single octet: the value itself.
        *(result++) = static_cast<uint8_t>(cp);
        return result;
    }

    // Number of trail octets and the marker bits of the lead octet.
    int trail_count;
    uint32_t lead_marker;
    if (cp < 0x800) {
        trail_count = 1;
        lead_marker = 0xc0;
    }
    else if (cp < 0x10000) {
        trail_count = 2;
        lead_marker = 0xe0;
    }
    else {
        trail_count = 3;
        lead_marker = 0xf0;
    }

    // Lead octet carries the bits above all the trail payloads.
    *(result++) = static_cast<uint8_t>((cp >> (6 * trail_count)) | lead_marker);
    // Each trail octet carries six payload bits, most significant group first.
    for (int shift = 6 * (trail_count - 1); shift >= 0; shift -= 6)
        *(result++) = static_cast<uint8_t>(((cp >> shift) & 0x3f) | 0x80);
    return result;
}
// Decodes the code point at `it` without validation, advances `it` past
// the sequence and returns the value. The sequence length comes from the
// lead octet alone; trail octets are not checked. When sequence_length
// reports 0, or 1, the lead octet value is returned and `it` moves by one.
template <typename octet_iterator>
uint32_t next(octet_iterator& it)
{
    uint32_t cp = internal::mask8(*it);
    const typename std::iterator_traits<octet_iterator>::difference_type length = utf8::internal::sequence_length(it);

    if (length == 2) {
        ++it;
        cp = ((cp << 6) & 0x7ff) + ((*it) & 0x3f);
    }
    else if (length == 3) {
        ++it;
        cp = ((cp << 12) & 0xffff) + ((internal::mask8(*it) << 6) & 0xfff);
        ++it;
        cp += (*it) & 0x3f;
    }
    else if (length == 4) {
        ++it;
        cp = ((cp << 18) & 0x1fffff) + ((internal::mask8(*it) << 12) & 0x3ffff);
        ++it;
        cp += (internal::mask8(*it) << 6) & 0xfff;
        ++it;
        cp += (*it) & 0x3f;
    }

    // Step off the last octet consumed.
    ++it;
    return cp;
}
// Steps `it` backwards until it is on a non-trail octet, then returns the
// code point decoded from there with the unchecked next(). There is no
// lower bound on the backward scan.
template <typename octet_iterator>
uint32_t previous(octet_iterator& it)
{
    do {
        --it;
    } while (internal::is_trail(*it));

    // Decode from a copy so `it` stays on the lead octet.
    octet_iterator lead = it;
    return next(lead);
}
// Moves `it` forward by `n` code points using the unchecked next().
// Does nothing when n is not positive.
template <typename octet_iterator, typename distance_type>
void advance (octet_iterator& it, distance_type n)
{
    distance_type stepped = 0;
    while (stepped < n) {
        next(it);
        ++stepped;
    }
}
// Counts the code points decoded by the unchecked next() while walking
// from `first` up to `last`.
template <typename octet_iterator>
typename std::iterator_traits<octet_iterator>::difference_type
distance (octet_iterator first, octet_iterator last)
{
    typename std::iterator_traits<octet_iterator>::difference_type count = 0;
    while (first < last) {
        next(first);
        ++count;
    }
    return count;
}
// Converts the UTF-16 units in [start, end) to UTF-8 written through
// `result`, without validation: any unit in the surrogate range is treated
// as a lead and the following unit is consumed as its trail.
// Returns the output iterator positioned after the last octet written.
template <typename u16bit_iterator, typename octet_iterator>
octet_iterator utf16to8 (u16bit_iterator start, u16bit_iterator end, octet_iterator result)
{
    while (start != end) {
        uint32_t code_point = internal::mask16(*start++);
        if (internal::is_surrogate(code_point)) {
            // Combine with the next unit; no check that one is available.
            const uint32_t trail_unit = internal::mask16(*start++);
            code_point = (code_point << 10) + trail_unit + internal::SURROGATE_OFFSET;
        }
        result = append(code_point, result);
    }
    return result;
}
// Converts the UTF-8 octets in [start, end) to UTF-16 written through
// `result`, decoding with the unchecked next(). Returns the output
// iterator positioned after the last unit written.
template <typename u16bit_iterator, typename octet_iterator>
u16bit_iterator utf8to16 (octet_iterator start, octet_iterator end, u16bit_iterator result)
{
    while (start != end) {
        const uint32_t cp = next(start);
        if (cp <= 0xffff) {
            // Fits in a single 16-bit unit.
            *result++ = static_cast<uint16_t>(cp);
            continue;
        }
        // Above 0xffff: emit a lead/trail surrogate pair.
        *result++ = static_cast<uint16_t>((cp >> 10) + internal::LEAD_OFFSET);
        *result++ = static_cast<uint16_t>((cp & 0x3ff) + internal::TRAIL_SURROGATE_MIN);
    }
    return result;
}
// Converts the code points in [start, end) to UTF-8 by passing each one
// to the unchecked append(). Returns the output iterator after the last octet.
template <typename octet_iterator, typename u32bit_iterator>
octet_iterator utf32to8 (u32bit_iterator start, u32bit_iterator end, octet_iterator result)
{
    for (; start != end; ++start)
        result = append(*start, result);
    return result;
}
// Converts the UTF-8 octets in [start, end) to code points written through
// `result`, decoding with the unchecked next(). Returns the output iterator
// positioned after the last code point written.
template <typename octet_iterator, typename u32bit_iterator>
u32bit_iterator utf8to32 (octet_iterator start, octet_iterator end, u32bit_iterator result)
{
    while (start < end) {
        const uint32_t cp = next(start);
        *result = cp;
        ++result;
    }
    return result;
}
} // namespace utf8::unchecked
} // namespace utf8
#endif // header guard

View file

@ -1,5 +0,0 @@
# Builds the negative-test driver.
CC = g++
CFLAGS = -g -Wall -pedantic

# `negativetest` stays available as the entry point, but the real target is
# the file the compiler writes, so make can tell when it is up to date.
.PHONY: negativetest
negativetest: negative

negative: negative.cpp ../../source/utf8.h
	$(CC) $(CFLAGS) negative.cpp -onegative

View file

@ -1,39 +0,0 @@
#include "../../source/utf8.h"
using namespace utf8;
#include <string>
#include <iostream>
#include <fstream>
#include <algorithm>
using namespace std;
// Input file read line by line by main().
const char* TEST_FILE_PATH = "../../test_data/negative/utf8_invalid.txt";
// 1-based line numbers that may fail validation without being reported;
// main() prints any other failing line as unexpected.
const unsigned INVALID_LINES[] = { 75, 76, 82, 83, 84, 85, 93, 102, 103, 105, 106, 107, 108, 109, 110, 114, 115, 116, 117, 124, 125, 130, 135, 140, 145, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 169, 175, 176, 177, 207, 208, 209, 210, 211, 220, 221, 222, 223, 224, 232, 233, 234, 235, 236, 247, 248, 249, 250, 251, 252, 253, 257, 258, 259, 260, 261, 262, 263, 264, 268, 269};
// One past the last element of INVALID_LINES, for use as a search bound.
const unsigned* INVALID_LINES_END = INVALID_LINES + sizeof(INVALID_LINES)/sizeof(unsigned);
// Reads TEST_FILE_PATH line by line and reports every line that fails
// UTF-8 validation but is not listed in INVALID_LINES.
// Returns 1 when the test file cannot be opened, 0 otherwise.
int main()
{
    // Open the test file
    ifstream fs8(TEST_FILE_PATH);
    if (!fs8.is_open()) {
        cout << "Could not open " << TEST_FILE_PATH << endl;
        // A missing input file must not look like a passing run.
        return 1;
    }

    // Read it line by line
    unsigned int line_count = 0;
    string line;
    while (getline(fs8, line)) {
        ++line_count;
        // Print out lines that contain invalid UTF-8
        if (!is_valid(line.begin(), line.end())) {
            const unsigned* u = find(INVALID_LINES, INVALID_LINES_END, line_count);
            if (u == INVALID_LINES_END)
                cout << "Unexpected invalid utf-8 at line " << line_count << '\n';
        }
    }
    return 0;
}

View file

@ -1,5 +0,0 @@
# Builds iconvtest from iconvtest.cpp with optimization enabled (-O2).
CC = g++
CFLAGS = -O2
# Rebuilt whenever the driver, the library header or timer.h changes.
iconvtest: iconvtest.cpp ../../source/utf8.h timer.h
$(CC) $(CFLAGS) iconvtest.cpp -oiconvtest

View file

@ -1,6 +0,0 @@
# Builds the regression test driver.
CC = g++
CFLAGS = -g -Wall -pedantic
# Headers of the individual regression suites; a change to any of them
# triggers a rebuild. Both globs match *.h only.
REG_FILES = r1_0Beta1/*.h r1_0Beta2/*.h
regressiontest: reg_tests_driver.cpp ../../source/utf8.h $(REG_FILES)
	$(CC) $(CFLAGS) reg_tests_driver.cpp -o regressiontest

View file

@ -1,5 +0,0 @@
# Builds the smoke test (test.cpp) with debug info and warnings enabled.
CC = g++
# Unlike the sibling test Makefiles in this listing, -pedantic is not used here.
CFLAGS = -g -Wall
smoketest: test.cpp ../../source/utf8.h
$(CC) $(CFLAGS) test.cpp -osmoketest

View file

@ -1,5 +0,0 @@
# Builds the utf8reader sample/test with debug info and strict warnings.
CC = g++
CFLAGS = -g -Wall -pedantic
# NOTE(review): the rule is named "utf8readertest" but the recipe produces "utf8reader",
# so make presumably reruns the recipe every time - confirm intent.
utf8readertest: utf8reader.cpp ../../source/utf8.h
$(CC) $(CFLAGS) utf8reader.cpp -o utf8reader

23
v2_0/LICENSE Normal file
View file

@ -0,0 +1,23 @@
Boost Software License - Version 1.0 - August 17th, 2003
Permission is hereby granted, free of charge, to any person or organization
obtaining a copy of the software and accompanying documentation covered by
this license (the "Software") to use, reproduce, display, distribute,
execute, and transmit the Software, and to prepare derivative works of the
Software, and to permit third-parties to whom the Software is furnished to
do so, all subject to the following:
The copyright notices in the Software and this entire statement, including
the above license grant, this restriction and the following disclaimer,
must be included in all copies of the Software, in whole or in part, and
all derivative works of the Software, unless such copies or derivative
works are solely in the form of machine-executable object code generated by
a source language processor.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT
SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE
FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE,
ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
DEALINGS IN THE SOFTWARE.

View file

@ -1,6 +1,6 @@
#! /usr/bin/perl #! /usr/bin/perl
$release_files = 'source/utf8.h doc/utf8cpp.html doc/ReleaseNotes'; $release_files = 'source/utf8.h source/utf8/core.h source/utf8/checked.h source/utf8/unchecked.h doc/utf8cpp.html doc/ReleaseNotes';
# First get the latest version # First get the latest version
`svn update`; `svn update`;

12
v2_0/doc/ReleaseNotes Normal file
View file

@ -0,0 +1,12 @@
utf8 cpp library
Release 2.3.4
A minor bug fix release. Thanks to all who reported bugs.
Note: Version 2.3.3 contained a regression, and therefore was removed.
Changes from version 2.3.2
- Bug fix [39]: checked.h Line 273 and unchecked.h Line 182 have an extra ';'
- Bug fix [36]: replace_invalid() only works with back_inserter
Files included in the release: utf8.h, core.h, checked.h, unchecked.h, utf8cpp.html, ReleaseNotes

1789
v2_0/doc/utf8cpp.html Normal file

File diff suppressed because it is too large Load diff

5
v2_0/samples/Makefile Normal file
View file

@ -0,0 +1,5 @@
# Builds the documentation sample program with debug info and strict warnings.
CC = g++
CFLAGS = -g -Wall -pedantic
# Only utf8.h is listed as a header prerequisite of docsample.cpp.
docsample: docsample.cpp ../source/utf8.h
$(CC) $(CFLAGS) docsample.cpp -odocsample

View file

@ -21,20 +21,10 @@ int main(int argc, char** argv)
return 0; return 0;
} }
// Read the first line of the file
unsigned line_count = 1; unsigned line_count = 1;
string line; string line;
if (!getline(fs8, line))
return 0;
// Look for utf-8 byte-order mark at the beginning
if (line.size() > 2) {
if (utf8::is_bom(line.c_str()))
cout << "There is a byte order mark at the beginning of the file\n";
}
// Play with all the lines in the file // Play with all the lines in the file
do { while (getline(fs8, line)) {
// check for invalid utf-8 (for a simple yes/no check, there is also utf8::is_valid function) // check for invalid utf-8 (for a simple yes/no check, there is also utf8::is_valid function)
string::iterator end_it = utf8::find_invalid(line.begin(), line.end()); string::iterator end_it = utf8::find_invalid(line.begin(), line.end());
if (end_it != line.end()) { if (end_it != line.end()) {
@ -55,9 +45,8 @@ int main(int argc, char** argv)
if (utf8line != string(line.begin(), end_it)) if (utf8line != string(line.begin(), end_it))
cout << "Error in UTF-16 conversion at line: " << line_count << "\n"; cout << "Error in UTF-16 conversion at line: " << line_count << "\n";
getline(fs8, line);
line_count++; line_count++;
} while (!fs8.eof()); }
return 0; return 0;
} }

34
v2_0/source/utf8.h Normal file
View file

@ -0,0 +1,34 @@
// Copyright 2006 Nemanja Trifunovic
/*
Permission is hereby granted, free of charge, to any person or organization
obtaining a copy of the software and accompanying documentation covered by
this license (the "Software") to use, reproduce, display, distribute,
execute, and transmit the Software, and to prepare derivative works of the
Software, and to permit third-parties to whom the Software is furnished to
do so, all subject to the following:
The copyright notices in the Software and this entire statement, including
the above license grant, this restriction and the following disclaimer,
must be included in all copies of the Software, in whole or in part, and
all derivative works of the Software, unless such copies or derivative
works are solely in the form of machine-executable object code generated by
a source language processor.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT
SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE
FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE,
ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
DEALINGS IN THE SOFTWARE.
*/
#ifndef UTF8_FOR_CPP_2675DCD0_9480_4c0c_B92A_CC14C027B731
#define UTF8_FOR_CPP_2675DCD0_9480_4c0c_B92A_CC14C027B731
#include "utf8/checked.h"
#include "utf8/unchecked.h"
#endif // header guard

327
v2_0/source/utf8/checked.h Normal file
View file

@ -0,0 +1,327 @@
// Copyright 2006 Nemanja Trifunovic
/*
Permission is hereby granted, free of charge, to any person or organization
obtaining a copy of the software and accompanying documentation covered by
this license (the "Software") to use, reproduce, display, distribute,
execute, and transmit the Software, and to prepare derivative works of the
Software, and to permit third-parties to whom the Software is furnished to
do so, all subject to the following:
The copyright notices in the Software and this entire statement, including
the above license grant, this restriction and the following disclaimer,
must be included in all copies of the Software, in whole or in part, and
all derivative works of the Software, unless such copies or derivative
works are solely in the form of machine-executable object code generated by
a source language processor.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT
SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE
FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE,
ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
DEALINGS IN THE SOFTWARE.
*/
#ifndef UTF8_FOR_CPP_CHECKED_H_2675DCD0_9480_4c0c_B92A_CC14C027B731
#define UTF8_FOR_CPP_CHECKED_H_2675DCD0_9480_4c0c_B92A_CC14C027B731
#include "core.h"
#include <stdexcept>
namespace utf8
{
// Common base of every exception type defined by the library; catching it
// (or std::exception) handles all of the classes below.
class exception : public ::std::exception {
};

// Exception types thrown by the library functions. Each one carries the
// offending value (where there is one) and a fixed what() message.

// Signals a value that is not a valid code point; code_point() returns it.
class invalid_code_point : public exception {
public:
    invalid_code_point(uint32_t codepoint) : cp(codepoint) {}
    virtual const char* what() const throw() { return "Invalid code point"; }
    uint32_t code_point() const { return cp; }
private:
    uint32_t cp;
};

// Signals malformed UTF-8; utf8_octet() returns the octet passed at the throw site.
class invalid_utf8 : public exception {
public:
    invalid_utf8 (uint8_t octet) : u8(octet) {}
    virtual const char* what() const throw() { return "Invalid UTF-8"; }
    uint8_t utf8_octet() const { return u8; }
private:
    uint8_t u8;
};

// Signals malformed UTF-16; utf16_word() returns the 16-bit unit passed at the throw site.
class invalid_utf16 : public exception {
public:
    invalid_utf16 (uint16_t word) : u16(word) {}
    virtual const char* what() const throw() { return "Invalid UTF-16"; }
    uint16_t utf16_word() const { return u16; }
private:
    uint16_t u16;
};

// Signals that an input range ended before an operation could complete.
class not_enough_room : public exception {
public:
    virtual const char* what() const throw() { return "Not enough space"; }
};
/// The library API - functions intended to be called by the users
// Encodes the code point cp as UTF-8 and writes the octets through result.
// Returns the output iterator advanced past the last octet written.
// Throws invalid_code_point when is_code_point_valid() rejects cp.
template <typename octet_iterator>
octet_iterator append(uint32_t cp, octet_iterator result)
{
    if (!utf8::internal::is_code_point_valid(cp))
        throw invalid_code_point(cp);

    if (cp < 0x80) {                        // one octet
        *(result++) = static_cast<uint8_t>(cp);
        return result;
    }
    if (cp < 0x800) {                       // two octets
        *(result++) = static_cast<uint8_t>(0xc0 | (cp >> 6));
        *(result++) = static_cast<uint8_t>(0x80 | (cp & 0x3f));
        return result;
    }
    if (cp < 0x10000) {                     // three octets
        *(result++) = static_cast<uint8_t>(0xe0 | (cp >> 12));
        *(result++) = static_cast<uint8_t>(0x80 | ((cp >> 6) & 0x3f));
        *(result++) = static_cast<uint8_t>(0x80 | (cp & 0x3f));
        return result;
    }
    // four octets
    *(result++) = static_cast<uint8_t>(0xf0 | (cp >> 18));
    *(result++) = static_cast<uint8_t>(0x80 | ((cp >> 12) & 0x3f));
    *(result++) = static_cast<uint8_t>(0x80 | ((cp >> 6) & 0x3f));
    *(result++) = static_cast<uint8_t>(0x80 | (cp & 0x3f));
    return result;
}
template <typename octet_iterator, typename output_iterator>
output_iterator replace_invalid(octet_iterator start, octet_iterator end, output_iterator out, uint32_t replacement)
{
while (start != end) {
octet_iterator sequence_start = start;
internal::utf_error err_code = utf8::internal::validate_next(start, end);
switch (err_code) {
case internal::UTF8_OK :
for (octet_iterator it = sequence_start; it != start; ++it)
*out++ = *it;
break;
case internal::NOT_ENOUGH_ROOM:
throw not_enough_room();
case internal::INVALID_LEAD:
out = utf8::append (replacement, out);
++start;
break;
case internal::INCOMPLETE_SEQUENCE:
case internal::OVERLONG_SEQUENCE:
case internal::INVALID_CODE_POINT:
out = utf8::append (replacement, out);
++start;
// just one replacement mark for the sequence
while (start != end && utf8::internal::is_trail(*start))
++start;
break;
}
}
return out;
}
template <typename octet_iterator, typename output_iterator>
inline output_iterator replace_invalid(octet_iterator start, octet_iterator end, output_iterator out)
{
static const uint32_t replacement_marker = utf8::internal::mask16(0xfffd);
return utf8::replace_invalid(start, end, out, replacement_marker);
}
// Decodes the sequence at it, advances it past the sequence and returns the code point.
// Throws not_enough_room, invalid_utf8 (carrying the octet at it) or invalid_code_point
// according to the error reported by validate_next().
template <typename octet_iterator>
uint32_t next(octet_iterator& it, octet_iterator end)
{
    uint32_t decoded = 0;
    const internal::utf_error status = utf8::internal::validate_next(it, end, decoded);
    switch (status) {
        case internal::NOT_ENOUGH_ROOM :
            throw not_enough_room();
        case internal::INVALID_CODE_POINT :
            throw invalid_code_point(decoded);
        case internal::INVALID_LEAD :
        case internal::INCOMPLETE_SEQUENCE :
        case internal::OVERLONG_SEQUENCE :
            throw invalid_utf8(*it);
        case internal::UTF8_OK :
            break;
    }
    return decoded;
}
// Returns the code point at it without moving the caller's iterator
// (it is taken by value, so next() advances only the local copy).
template <typename octet_iterator>
uint32_t peek_next(octet_iterator it, octet_iterator end)
{
    const uint32_t cp = utf8::next(it, end);
    return cp;
}
// Moves it back to the lead octet of the preceding sequence and returns its code point.
// Throws not_enough_room when it == start, invalid_utf8 when start is reached
// while still on a trail octet.
template <typename octet_iterator>
uint32_t prior(octet_iterator& it, octet_iterator start)
{
    // can't do much if it == start
    if (it == start)
        throw not_enough_room();

    const octet_iterator sequence_end = it;
    // Step back over trail octets until a non-trail octet is found
    for (--it; utf8::internal::is_trail(*it); --it) {
        if (it == start)
            throw invalid_utf8(*it); // error - no lead byte in the sequence
    }
    return utf8::peek_next(it, sequence_end);
}
/// Deprecated in versions that include "prior"
// Steps it back to the lead octet of the preceding sequence and returns its code point.
// Throws invalid_utf8 when pass_start is reached while still on a trail octet.
template <typename octet_iterator>
uint32_t previous(octet_iterator& it, octet_iterator pass_start)
{
    const octet_iterator sequence_end = it;
    for (--it; utf8::internal::is_trail(*it); --it) {
        if (it == pass_start)
            throw invalid_utf8(*it); // error - no lead byte in the sequence
    }
    // Decode through a copy so it stays on the lead octet.
    octet_iterator lead = it;
    return utf8::next(lead, sequence_end);
}
// Advances it by n code points, validating each one via next(); any exception
// thrown by next() propagates to the caller.
template <typename octet_iterator, typename distance_type>
void advance (octet_iterator& it, distance_type n, octet_iterator end)
{
    distance_type consumed = 0;
    while (consumed < n) {
        utf8::next(it, end);
        ++consumed;
    }
}
// Counts the code points in [first, last), validating each sequence via next().
template <typename octet_iterator>
typename std::iterator_traits<octet_iterator>::difference_type
distance (octet_iterator first, octet_iterator last)
{
    typedef typename std::iterator_traits<octet_iterator>::difference_type count_type;
    count_type code_points = 0;
    while (first < last) {
        utf8::next(first, last);
        ++code_points;
    }
    return code_points;
}
template <typename u16bit_iterator, typename octet_iterator>
octet_iterator utf16to8 (u16bit_iterator start, u16bit_iterator end, octet_iterator result)
{
while (start != end) {
uint32_t cp = utf8::internal::mask16(*start++);
// Take care of surrogate pairs first
if (utf8::internal::is_lead_surrogate(cp)) {
if (start != end) {
uint32_t trail_surrogate = utf8::internal::mask16(*start++);
if (utf8::internal::is_trail_surrogate(trail_surrogate))
cp = (cp << 10) + trail_surrogate + internal::SURROGATE_OFFSET;
else
throw invalid_utf16(static_cast<uint16_t>(trail_surrogate));
}
else
throw invalid_utf16(static_cast<uint16_t>(cp));
}
// Lone trail surrogate
else if (utf8::internal::is_trail_surrogate(cp))
throw invalid_utf16(static_cast<uint16_t>(cp));
result = utf8::append(cp, result);
}
return result;
}
template <typename u16bit_iterator, typename octet_iterator>
u16bit_iterator utf8to16 (octet_iterator start, octet_iterator end, u16bit_iterator result)
{
while (start != end) {
uint32_t cp = utf8::next(start, end);
if (cp > 0xffff) { //make a surrogate pair
*result++ = static_cast<uint16_t>((cp >> 10) + internal::LEAD_OFFSET);
*result++ = static_cast<uint16_t>((cp & 0x3ff) + internal::TRAIL_SURROGATE_MIN);
}
else
*result++ = static_cast<uint16_t>(cp);
}
return result;
}
// Encodes every code point of [start, end) as UTF-8 through result, using the
// checked append().
template <typename octet_iterator, typename u32bit_iterator>
octet_iterator utf32to8 (u32bit_iterator start, u32bit_iterator end, octet_iterator result)
{
    for (; start != end; ++start)
        result = utf8::append(*start, result);
    return result;
}
template <typename octet_iterator, typename u32bit_iterator>
u32bit_iterator utf8to32 (octet_iterator start, octet_iterator end, u32bit_iterator result)
{
while (start != end)
(*result++) = utf8::next(start, end);
return result;
}
// The iterator class
// Bidirectional iterator adaptor that walks an octet range one code point at a
// time, using the checked next()/prior() and the range bounds given at construction.
template <typename octet_iterator>
class iterator : public std::iterator <std::bidirectional_iterator_tag, uint32_t> {
    octet_iterator it;
    octet_iterator range_start;
    octet_iterator range_end;
    public:
    iterator () {}
    // Throws std::out_of_range when octet_it lies outside [first, last].
    explicit iterator (const octet_iterator& octet_it,
                       const octet_iterator& first,
                       const octet_iterator& last) :
        it(octet_it), range_start(first), range_end(last)
    {
        if (it < range_start || it > range_end)
            throw std::out_of_range("Invalid utf-8 iterator position");
    }
    // the default "big three" are OK

    // Underlying octet iterator at the current position.
    octet_iterator base () const { return it; }

    // Decodes the current code point without moving this iterator.
    uint32_t operator * () const
    {
        octet_iterator cursor = it;
        return utf8::next(cursor, range_end);
    }

    // Iterators are comparable only when built over the same range.
    bool operator == (const iterator& rhs) const
    {
        const bool same_range = (range_start == rhs.range_start) && (range_end == rhs.range_end);
        if (!same_range)
            throw std::logic_error("Comparing utf-8 iterators defined with different ranges");
        return (it == rhs.it);
    }
    bool operator != (const iterator& rhs) const
    {
        return !(*this == rhs);
    }

    iterator& operator ++ ()
    {
        utf8::next(it, range_end);
        return *this;
    }
    iterator operator ++ (int)
    {
        iterator before = *this;
        ++(*this);
        return before;
    }
    iterator& operator -- ()
    {
        utf8::prior(it, range_start);
        return *this;
    }
    iterator operator -- (int)
    {
        iterator before = *this;
        --(*this);
        return before;
    }
}; // class iterator
} // namespace utf8
#endif //header guard

329
v2_0/source/utf8/core.h Normal file
View file

@ -0,0 +1,329 @@
// Copyright 2006 Nemanja Trifunovic
/*
Permission is hereby granted, free of charge, to any person or organization
obtaining a copy of the software and accompanying documentation covered by
this license (the "Software") to use, reproduce, display, distribute,
execute, and transmit the Software, and to prepare derivative works of the
Software, and to permit third-parties to whom the Software is furnished to
do so, all subject to the following:
The copyright notices in the Software and this entire statement, including
the above license grant, this restriction and the following disclaimer,
must be included in all copies of the Software, in whole or in part, and
all derivative works of the Software, unless such copies or derivative
works are solely in the form of machine-executable object code generated by
a source language processor.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT
SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE
FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE,
ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
DEALINGS IN THE SOFTWARE.
*/
#ifndef UTF8_FOR_CPP_CORE_H_2675DCD0_9480_4c0c_B92A_CC14C027B731
#define UTF8_FOR_CPP_CORE_H_2675DCD0_9480_4c0c_B92A_CC14C027B731
#include <iterator>
namespace utf8
{
// The typedefs for 8-bit, 16-bit and 32-bit unsigned integers
// You may need to change them to match your system.
// These typedefs have the same names as ones from cstdint, or boost/cstdint
typedef unsigned char uint8_t;
typedef unsigned short uint16_t;
typedef unsigned int uint32_t;
// Helper code - not intended to be directly called by the library users. May be changed at any time
namespace internal
{
// Unicode constants
// Leading (high) surrogates: 0xd800 - 0xdbff
// Trailing (low) surrogates: 0xdc00 - 0xdfff
const uint16_t LEAD_SURROGATE_MIN = 0xd800u;
const uint16_t LEAD_SURROGATE_MAX = 0xdbffu;
const uint16_t TRAIL_SURROGATE_MIN = 0xdc00u;
const uint16_t TRAIL_SURROGATE_MAX = 0xdfffu;
// Added to (cp >> 10) to form the lead surrogate of a pair; evaluates to 0xd7c0.
const uint16_t LEAD_OFFSET = LEAD_SURROGATE_MIN - (0x10000 >> 10);
// Added to (lead << 10) + trail to recover the code point of a surrogate pair.
// The subtraction wraps around in unsigned arithmetic; with a 32-bit uint32_t the value is 0xfca02400.
const uint32_t SURROGATE_OFFSET = 0x10000u - (LEAD_SURROGATE_MIN << 10) - TRAIL_SURROGATE_MIN;
// Maximum valid value for a Unicode code point
const uint32_t CODE_POINT_MAX = 0x0010ffffu;
// Reduces oc to its low eight bits, so that octets stored in a signed char
// come out as values in the range 0..255.
template<typename octet_type>
inline uint8_t mask8(octet_type oc)
{
    return static_cast<uint8_t>(oc & 0xff);
}
// Reduces oc to its low sixteen bits, so that 16-bit units stored in a signed
// or wider type come out as values in the range 0..0xffff.
template<typename u16_type>
inline uint16_t mask16(u16_type oc)
{
    return static_cast<uint16_t>(oc & 0xffff);
}
// True when oc is a UTF-8 trail (continuation) octet, i.e. its top two bits are 10.
template<typename octet_type>
inline bool is_trail(octet_type oc)
{
    return (utf8::internal::mask8(oc) & 0xc0) == 0x80;
}
// True when cp lies in the lead (high) surrogate range
// LEAD_SURROGATE_MIN..LEAD_SURROGATE_MAX.
template <typename u16>
inline bool is_lead_surrogate(u16 cp)
{
    return !(cp < LEAD_SURROGATE_MIN || cp > LEAD_SURROGATE_MAX);
}
// True when cp lies in the trail (low) surrogate range
// TRAIL_SURROGATE_MIN..TRAIL_SURROGATE_MAX.
template <typename u16>
inline bool is_trail_surrogate(u16 cp)
{
    return !(cp < TRAIL_SURROGATE_MIN || cp > TRAIL_SURROGATE_MAX);
}
// True when cp is any surrogate, lead or trail
// (LEAD_SURROGATE_MIN..TRAIL_SURROGATE_MAX).
template <typename u16>
inline bool is_surrogate(u16 cp)
{
    return !(cp < LEAD_SURROGATE_MIN || cp > TRAIL_SURROGATE_MAX);
}
// True when cp does not exceed CODE_POINT_MAX and is not a surrogate.
template <typename u32>
inline bool is_code_point_valid(u32 cp)
{
    if (cp > CODE_POINT_MAX)
        return false;
    return !utf8::internal::is_surrogate(cp);
}
// Returns the length (1-4) of the UTF-8 sequence announced by the lead octet
// at lead_it, or 0 when that octet is not a valid lead octet.
template <typename octet_iterator>
inline typename std::iterator_traits<octet_iterator>::difference_type
sequence_length(octet_iterator lead_it)
{
    const uint8_t lead = utf8::internal::mask8(*lead_it);
    if ((lead & 0x80) == 0x00)      // 0xxxxxxx
        return 1;
    if ((lead & 0xe0) == 0xc0)      // 110xxxxx
        return 2;
    if ((lead & 0xf0) == 0xe0)      // 1110xxxx
        return 3;
    if ((lead & 0xf8) == 0xf0)      // 11110xxx
        return 4;
    return 0;
}
// True when cp was encoded with a different number of octets than the shortest
// form requires. Code points of 0x10000 and above are never reported as overlong.
template <typename octet_difference_type>
inline bool is_overlong_sequence(uint32_t cp, octet_difference_type length)
{
    if (cp < 0x80)
        return length != 1;
    if (cp < 0x800)
        return length != 2;
    if (cp < 0x10000)
        return length != 3;
    return false;
}
// Result codes of the decoding helpers in this namespace:
//   UTF8_OK             - sequence decoded and passed every check
//   NOT_ENOUGH_ROOM     - the input range ended before the sequence was complete
//   INVALID_LEAD        - sequence_length() returned 0 for the first octet
//   INCOMPLETE_SEQUENCE - an expected trail octet was not a trail octet
//   OVERLONG_SEQUENCE   - is_overlong_sequence() rejected the encoding length
//   INVALID_CODE_POINT  - is_code_point_valid() rejected the decoded value
enum utf_error {UTF8_OK, NOT_ENOUGH_ROOM, INVALID_LEAD, INCOMPLETE_SEQUENCE, OVERLONG_SEQUENCE, INVALID_CODE_POINT};
/// Helper for get_sequence_x
// Steps it to the next octet and checks that it exists and is a trail octet.
// Returns NOT_ENOUGH_ROOM at the end of the range, INCOMPLETE_SEQUENCE when the
// octet is not a trail octet, UTF8_OK otherwise. it stays advanced in every case.
template <typename octet_iterator>
utf_error increase_safely(octet_iterator& it, octet_iterator end)
{
    ++it;
    if (it == end)
        return NOT_ENOUGH_ROOM;
    return utf8::internal::is_trail(*it) ? UTF8_OK : INCOMPLETE_SEQUENCE;
}
#define UTF8_CPP_INCREASE_AND_RETURN_ON_ERROR(IT, END) {utf_error ret = increase_safely(IT, END); if (ret != UTF8_OK) return ret;}
/// get_sequence_x functions decode utf-8 sequences of the length x
// Decodes a one-octet sequence: code_point receives the octet at it.
// it is not advanced. Returns NOT_ENOUGH_ROOM when it == end.
template <typename octet_iterator>
utf_error get_sequence_1(octet_iterator& it, octet_iterator end, uint32_t& code_point)
{
    if (it != end) {
        code_point = utf8::internal::mask8(*it);
        return UTF8_OK;
    }
    return NOT_ENOUGH_ROOM;
}
// Decodes a two-octet sequence starting at it into code_point.
// On success it is left on the last octet of the sequence (not past it).
// Returns NOT_ENOUGH_ROOM when the range ends early and INCOMPLETE_SEQUENCE when
// the second octet is not a trail octet; in those cases it is left where
// increase_safely() stopped and code_point holds a partial value.
template <typename octet_iterator>
utf_error get_sequence_2(octet_iterator& it, octet_iterator end, uint32_t& code_point)
{
if (it == end)
return NOT_ENOUGH_ROOM;
code_point = utf8::internal::mask8(*it);
// Steps to the trail octet; the macro returns from this function on any error.
UTF8_CPP_INCREASE_AND_RETURN_ON_ERROR(it, end)
// Lead payload goes to bits 6-10 (the mask drops the marker bits); the trail supplies bits 0-5.
code_point = ((code_point << 6) & 0x7ff) + ((*it) & 0x3f);
return UTF8_OK;
}
// Decodes a three-octet sequence starting at it into code_point.
// On success it is left on the last octet of the sequence (not past it).
// Returns NOT_ENOUGH_ROOM when the range ends early and INCOMPLETE_SEQUENCE when
// an expected trail octet is missing; in those cases it is left where
// increase_safely() stopped and code_point holds a partial value.
template <typename octet_iterator>
utf_error get_sequence_3(octet_iterator& it, octet_iterator end, uint32_t& code_point)
{
if (it == end)
return NOT_ENOUGH_ROOM;
code_point = utf8::internal::mask8(*it);
// Each macro use steps to the next trail octet and returns from this function on error.
UTF8_CPP_INCREASE_AND_RETURN_ON_ERROR(it, end)
// Lead payload goes to bits 12-15, the first trail octet to bits 6-11.
code_point = ((code_point << 12) & 0xffff) + ((utf8::internal::mask8(*it) << 6) & 0xfff);
UTF8_CPP_INCREASE_AND_RETURN_ON_ERROR(it, end)
// The last trail octet supplies bits 0-5.
code_point += (*it) & 0x3f;
return UTF8_OK;
}
// Decodes a four-octet sequence starting at it into code_point.
// On success it is left on the last octet of the sequence (not past it).
// Returns NOT_ENOUGH_ROOM when the range ends early and INCOMPLETE_SEQUENCE when
// an expected trail octet is missing; in those cases it is left where
// increase_safely() stopped and code_point holds a partial value.
template <typename octet_iterator>
utf_error get_sequence_4(octet_iterator& it, octet_iterator end, uint32_t& code_point)
{
if (it == end)
return NOT_ENOUGH_ROOM;
code_point = utf8::internal::mask8(*it);
// Each macro use steps to the next trail octet and returns from this function on error.
UTF8_CPP_INCREASE_AND_RETURN_ON_ERROR(it, end)
// Lead payload goes to bits 18-20, the first trail octet to bits 12-17.
code_point = ((code_point << 18) & 0x1fffff) + ((utf8::internal::mask8(*it) << 12) & 0x3ffff);
UTF8_CPP_INCREASE_AND_RETURN_ON_ERROR(it, end)
// Second trail octet: bits 6-11.
code_point += (utf8::internal::mask8(*it) << 6) & 0xfff;
UTF8_CPP_INCREASE_AND_RETURN_ON_ERROR(it, end)
// Third trail octet: bits 0-5.
code_point += (*it) & 0x3f;
return UTF8_OK;
}
#undef UTF8_CPP_INCREASE_AND_RETURN_ON_ERROR
// Decodes and validates the UTF-8 sequence that starts at it.
//
// it          in/out: on success advanced past the sequence; on any failure
//             restored to the position it had on entry
// end         end of the input range
// code_point  out: the decoded value on success; on INVALID_CODE_POINT the rejected
//             value (so callers can report it); untouched for every other error
//
// Returns UTF8_OK on success, otherwise the utf_error describing the failure.
// An empty range (it == end) yields NOT_ENOUGH_ROOM.
template <typename octet_iterator>
utf_error validate_next(octet_iterator& it, octet_iterator end, uint32_t& code_point)
{
    // Nothing to decode: bail out before sequence_length() dereferences it.
    if (it == end)
        return NOT_ENOUGH_ROOM;

    // Save the original value of it so we can go back in case of failure
    // Of course, it does not make much sense with i.e. stream iterators
    octet_iterator original_it = it;

    uint32_t cp = 0;
    // Determine the sequence length based on the lead octet
    typedef typename std::iterator_traits<octet_iterator>::difference_type octet_difference_type;
    const octet_difference_type length = utf8::internal::sequence_length(it);

    // Get trail octets and calculate the code point
    utf_error err = UTF8_OK;
    switch (length) {
        case 0:
            return INVALID_LEAD;
        case 1:
            err = utf8::internal::get_sequence_1(it, end, cp);
            break;
        case 2:
            err = utf8::internal::get_sequence_2(it, end, cp);
            break;
        case 3:
            err = utf8::internal::get_sequence_3(it, end, cp);
            break;
        case 4:
            err = utf8::internal::get_sequence_4(it, end, cp);
            break;
    }

    if (err == UTF8_OK) {
        // Decoding succeeded. Now, security checks...
        if (!utf8::internal::is_code_point_valid(cp)) {
            // Hand the rejected value back; otherwise the caller only ever sees
            // whatever it initialised code_point with.
            code_point = cp;
            err = INVALID_CODE_POINT;
        }
        else if (utf8::internal::is_overlong_sequence(cp, length)) {
            err = OVERLONG_SEQUENCE;
        }
        else {
            // Passed! The get_sequence_x helpers leave it on the last octet,
            // so step past it and return here.
            code_point = cp;
            ++it;
            return UTF8_OK;
        }
    }

    // Failure branch - restore the original value of the iterator
    it = original_it;
    return err;
}
// Overload for callers that only need the validation result: forwards to the
// three-argument validate_next() and discards the decoded code point.
template <typename octet_iterator>
inline utf_error validate_next(octet_iterator& it, octet_iterator end) {
    uint32_t discarded;
    const utf_error status = utf8::internal::validate_next(it, end, discarded);
    return status;
}
} // namespace internal
/// The library API - functions intended to be called by the users
// Byte order mark
const uint8_t bom[] = {0xef, 0xbb, 0xbf};
// Returns an iterator to the first octet of the first invalid sequence in
// [start, end), or end when every sequence validates.
template <typename octet_iterator>
octet_iterator find_invalid(octet_iterator start, octet_iterator end)
{
    octet_iterator cursor = start;
    while (cursor != end) {
        // validate_next() advances cursor on success and leaves it in place on failure.
        if (utf8::internal::validate_next(cursor, end) != internal::UTF8_OK)
            break;
    }
    return cursor;
}
// True when find_invalid() finds no invalid sequence in [start, end).
template <typename octet_iterator>
inline bool is_valid(octet_iterator start, octet_iterator end)
{
    const octet_iterator first_bad = utf8::find_invalid(start, end);
    return first_bad == end;
}
// True when [it, end) begins with the three octets of the bom array.
// Ranges shorter than three octets yield false without reading past end.
template <typename octet_iterator>
inline bool starts_with_bom (octet_iterator it, octet_iterator end)
{
    if (it == end || utf8::internal::mask8(*it++) != bom[0])
        return false;
    if (it == end || utf8::internal::mask8(*it++) != bom[1])
        return false;
    return (it != end) && utf8::internal::mask8(*it) == bom[2];
}
//Deprecated in release 2.3
// True when the octets at it match the bom array. There is no end iterator, so up
// to three octets are read unconditionally (reading stops at the first mismatch).
template <typename octet_iterator>
inline bool is_bom (octet_iterator it)
{
    if (utf8::internal::mask8(*it++) != bom[0])
        return false;
    if (utf8::internal::mask8(*it++) != bom[1])
        return false;
    return utf8::internal::mask8(*it) == bom[2];
}
} // namespace utf8
#endif // header guard

View file

@ -0,0 +1,228 @@
// Copyright 2006 Nemanja Trifunovic
/*
Permission is hereby granted, free of charge, to any person or organization
obtaining a copy of the software and accompanying documentation covered by
this license (the "Software") to use, reproduce, display, distribute,
execute, and transmit the Software, and to prepare derivative works of the
Software, and to permit third-parties to whom the Software is furnished to
do so, all subject to the following:
The copyright notices in the Software and this entire statement, including
the above license grant, this restriction and the following disclaimer,
must be included in all copies of the Software, in whole or in part, and
all derivative works of the Software, unless such copies or derivative
works are solely in the form of machine-executable object code generated by
a source language processor.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT
SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE
FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE,
ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
DEALINGS IN THE SOFTWARE.
*/
#ifndef UTF8_FOR_CPP_UNCHECKED_H_2675DCD0_9480_4c0c_B92A_CC14C027B731
#define UTF8_FOR_CPP_UNCHECKED_H_2675DCD0_9480_4c0c_B92A_CC14C027B731
#include "core.h"
namespace utf8
{
namespace unchecked
{
// Encodes cp as UTF-8 and writes the octets through result, without validating cp.
// Returns the output iterator advanced past the last octet written.
template <typename octet_iterator>
octet_iterator append(uint32_t cp, octet_iterator result)
{
    // Build the sequence first, then emit it in order.
    uint8_t octets[4];
    int count;
    if (cp < 0x80) {                // one octet
        octets[0] = static_cast<uint8_t>(cp);
        count = 1;
    }
    else if (cp < 0x800) {          // two octets
        octets[0] = static_cast<uint8_t>(0xc0 | (cp >> 6));
        octets[1] = static_cast<uint8_t>(0x80 | (cp & 0x3f));
        count = 2;
    }
    else if (cp < 0x10000) {        // three octets
        octets[0] = static_cast<uint8_t>(0xe0 | (cp >> 12));
        octets[1] = static_cast<uint8_t>(0x80 | ((cp >> 6) & 0x3f));
        octets[2] = static_cast<uint8_t>(0x80 | (cp & 0x3f));
        count = 3;
    }
    else {                          // four octets
        octets[0] = static_cast<uint8_t>(0xf0 | (cp >> 18));
        octets[1] = static_cast<uint8_t>(0x80 | ((cp >> 12) & 0x3f));
        octets[2] = static_cast<uint8_t>(0x80 | ((cp >> 6) & 0x3f));
        octets[3] = static_cast<uint8_t>(0x80 | (cp & 0x3f));
        count = 4;
    }
    for (int i = 0; i < count; ++i)
        *(result++) = octets[i];
    return result;
}
// Decodes the sequence at it without any validation or bounds checking, advances
// it past the sequence and returns the code point.
// When sequence_length() returns a value with no matching case (e.g. 0), the
// first octet itself is returned and it advances by one.
template <typename octet_iterator>
uint32_t next(octet_iterator& it)
{
// Start from the raw lead octet; each case below strips the marker bits by masking.
uint32_t cp = utf8::internal::mask8(*it);
typename std::iterator_traits<octet_iterator>::difference_type length = utf8::internal::sequence_length(it);
switch (length) {
case 1:
break;
case 2:
it++;
// Lead payload in bits 6-10, trail octet in bits 0-5.
cp = ((cp << 6) & 0x7ff) + ((*it) & 0x3f);
break;
case 3:
++it;
// Lead payload in bits 12-15, first trail octet in bits 6-11.
cp = ((cp << 12) & 0xffff) + ((utf8::internal::mask8(*it) << 6) & 0xfff);
++it;
cp += (*it) & 0x3f;
break;
case 4:
++it;
// Lead payload in bits 18-20, first trail octet in bits 12-17.
cp = ((cp << 18) & 0x1fffff) + ((utf8::internal::mask8(*it) << 12) & 0x3ffff);
++it;
cp += (utf8::internal::mask8(*it) << 6) & 0xfff;
++it;
cp += (*it) & 0x3f;
break;
}
// The cases leave it on the last octet of the sequence; step past it.
++it;
return cp;
}
// Returns the code point at it without moving the caller's iterator
// (it is taken by value, so next() advances only the local copy).
template <typename octet_iterator>
uint32_t peek_next(octet_iterator it)
{
    const uint32_t cp = utf8::unchecked::next(it);
    return cp;
}
// Moves it back to the nearest preceding non-trail octet and returns the code
// point decoded from there. No bounds are checked.
template <typename octet_iterator>
uint32_t prior(octet_iterator& it)
{
    do {
        --it;
    } while (utf8::internal::is_trail(*it));

    // Decode through a copy so it stays on the lead octet.
    octet_iterator lead = it;
    return utf8::unchecked::next(lead);
}
// Deprecated in versions that include prior, but only for the sake of consistency (see utf8::previous)
// Simply forwards to utf8::unchecked::prior().
template <typename octet_iterator>
inline uint32_t previous(octet_iterator& it)
{
    const uint32_t cp = utf8::unchecked::prior(it);
    return cp;
}
// Advances it by n code points using the unchecked next(); no bounds are checked.
template <typename octet_iterator, typename distance_type>
void advance (octet_iterator& it, distance_type n)
{
    distance_type consumed = 0;
    while (consumed < n) {
        utf8::unchecked::next(it);
        ++consumed;
    }
}
// Counts the code points in [first, last) using the unchecked next().
template <typename octet_iterator>
typename std::iterator_traits<octet_iterator>::difference_type
distance (octet_iterator first, octet_iterator last)
{
    typedef typename std::iterator_traits<octet_iterator>::difference_type count_type;
    count_type code_points = 0;
    while (first < last) {
        utf8::unchecked::next(first);
        ++code_points;
    }
    return code_points;
}
template <typename u16bit_iterator, typename octet_iterator>
octet_iterator utf16to8 (u16bit_iterator start, u16bit_iterator end, octet_iterator result)
{
while (start != end) {
uint32_t cp = utf8::internal::mask16(*start++);
// Take care of surrogate pairs first
if (utf8::internal::is_lead_surrogate(cp)) {
uint32_t trail_surrogate = utf8::internal::mask16(*start++);
cp = (cp << 10) + trail_surrogate + internal::SURROGATE_OFFSET;
}
result = utf8::unchecked::append(cp, result);
}
return result;
}
template <typename u16bit_iterator, typename octet_iterator>
u16bit_iterator utf8to16 (octet_iterator start, octet_iterator end, u16bit_iterator result)
{
while (start < end) {
uint32_t cp = utf8::unchecked::next(start);
if (cp > 0xffff) { //make a surrogate pair
*result++ = static_cast<uint16_t>((cp >> 10) + internal::LEAD_OFFSET);
*result++ = static_cast<uint16_t>((cp & 0x3ff) + internal::TRAIL_SURROGATE_MIN);
}
else
*result++ = static_cast<uint16_t>(cp);
}
return result;
}
// Encodes every code point of [start, end) as UTF-8 through result, using the
// unchecked append().
template <typename octet_iterator, typename u32bit_iterator>
octet_iterator utf32to8 (u32bit_iterator start, u32bit_iterator end, octet_iterator result)
{
    for (; start != end; ++start)
        result = utf8::unchecked::append(*start, result);
    return result;
}
template <typename octet_iterator, typename u32bit_iterator>
u32bit_iterator utf8to32 (octet_iterator start, octet_iterator end, u32bit_iterator result)
{
while (start < end)
(*result++) = utf8::unchecked::next(start);
return result;
}
// The iterator class
// Bidirectional iterator adaptor that walks an octet sequence one code point at
// a time using the unchecked helpers; it stores no range bounds.
template <typename octet_iterator>
class iterator : public std::iterator <std::bidirectional_iterator_tag, uint32_t> {
    octet_iterator it;
    public:
    iterator () {}
    explicit iterator (const octet_iterator& octet_it): it(octet_it) {}
    // the default "big three" are OK

    // Underlying octet iterator at the current position.
    octet_iterator base () const { return it; }

    // Decodes the current code point without moving this iterator.
    uint32_t operator * () const
    {
        octet_iterator cursor = it;
        return utf8::unchecked::next(cursor);
    }

    bool operator == (const iterator& rhs) const
    {
        return (it == rhs.it);
    }
    bool operator != (const iterator& rhs) const
    {
        return !(*this == rhs);
    }

    // Skips the current sequence using only the length announced by its lead octet.
    iterator& operator ++ ()
    {
        ::std::advance(it, utf8::internal::sequence_length(it));
        return *this;
    }
    iterator operator ++ (int)
    {
        iterator before = *this;
        ++(*this);
        return before;
    }
    iterator& operator -- ()
    {
        utf8::unchecked::prior(it);
        return *this;
    }
    iterator operator -- (int)
    {
        iterator before = *this;
        --(*this);
        return before;
    }
}; // class iterator
} // namespace utf8::unchecked
} // namespace utf8
#endif // header guard

View file

@ -0,0 +1,5 @@
# Builds the negative-test driver; every library header is listed as a
# prerequisite so the test is rebuilt when any of them changes.
CC = g++
CFLAGS = -g -Wall -pedantic
# NOTE(review): the rule is named "negativetest" but the recipe writes a file called
# "negative", so make presumably never sees the target as up to date - confirm intent.
negativetest: negative.cpp ../../source/utf8.h ../../source/utf8/core.h ../../source/utf8/checked.h ../../source/utf8/unchecked.h
$(CC) $(CFLAGS) negative.cpp -onegative

View file

@ -0,0 +1,53 @@
#include "../../source/utf8.h"
using namespace utf8;
#include <string>
#include <iostream>
#include <fstream>
#include <algorithm>
using namespace std;
const unsigned INVALID_LINES[] = { 75, 76, 83, 84, 85, 93, 102, 103, 105, 106, 107, 108, 109, 110, 114, 115, 116, 117, 124, 125, 130, 135, 140, 145, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 169, 175, 176, 177, 207, 208, 209, 210, 211, 220, 221, 222, 223, 224, 232, 233, 234, 235, 236, 247, 248, 249, 250, 251, 252, 253, 257, 258, 259, 260, 261, 262, 263, 264};
const unsigned* INVALID_LINES_END = INVALID_LINES + sizeof(INVALID_LINES)/sizeof(unsigned);
int main(int argc, char** argv)
{
string test_file_path;
if (argc == 2)
test_file_path = argv[1];
else {
cout << "Wrong number of arguments" << endl;
exit(0);
}
// Open the test file
ifstream fs8(test_file_path.c_str());
if (!fs8.is_open()) {
cout << "Could not open " << test_file_path << endl;
return 0;
}
// Read it line by line
unsigned int line_count = 0;
char byte;
while (!fs8.eof()) {
string line;
while ((byte = static_cast<char>(fs8.get())) != '\n' && !fs8.eof())
line.push_back(byte);
line_count++;
bool expected_valid = (find(INVALID_LINES, INVALID_LINES_END, line_count) == INVALID_LINES_END);
// Print out lines that contain unexpected invalid UTF-8
if (!is_valid(line.begin(), line.end())) {
if (expected_valid)
cout << "Unexpected invalid utf-8 at line " << line_count << '\n';
// try fixing it:
string fixed_line;
replace_invalid(line.begin(), line.end(), back_inserter(fixed_line));
if (!is_valid(fixed_line.begin(), fixed_line.end()))
cout << "replace_invalid() resulted in an invalid utf-8 at line " << line_count << '\n';
}
else if (!expected_valid)
cout << "Invalid utf-8 NOT detected at line " << line_count << '\n';
}
}

View file

@ -0,0 +1,5 @@
# Compiler and flags used to build the iconv comparison benchmark.
CC = g++
CFLAGS = -O3
# Compiles iconvtest.cpp whenever it, timer.h or one of the listed utf8
# headers changes.
iconvtest: iconvtest.cpp ../../source/utf8.h timer.h ../../source/utf8/core.h ../../source/utf8/checked.h ../../source/utf8/unchecked.h
$(CC) $(CFLAGS) iconvtest.cpp -oiconvtest

View file

@ -1,4 +1,5 @@
#include <iconv.h> #include <iconv.h>
#include <string.h>
#include "../../source/utf8.h" #include "../../source/utf8.h"
#include "timer.h" #include "timer.h"
#include <fstream> #include <fstream>
@ -45,6 +46,7 @@ int main(int argc, char** argv)
cout << "utf8::utf8to16: "; cout << "utf8::utf8to16: ";
timer t(cout); timer t(cout);
utf8::utf8to16(buf, buf + length, utf16buf); utf8::utf8to16(buf, buf + length, utf16buf);
t.print_time();
} }
{ {
@ -53,6 +55,7 @@ int main(int argc, char** argv)
cout << "unchecked::utf8to16: "; cout << "unchecked::utf8to16: ";
timer t(cout); timer t(cout);
utf8::unchecked::utf8to16(buf, buf + length, utf16buf); utf8::unchecked::utf8to16(buf, buf + length, utf16buf);
t.print_time();
} }
// the UTF-16 result will not be larger than this (I hope :) ) // the UTF-16 result will not be larger than this (I hope :) )
@ -74,6 +77,7 @@ int main(int argc, char** argv)
{ {
timer t(cout); timer t(cout);
iconv(cd, &inbuf, &in_bytes_left, &outbuf, &out_bytes_left); iconv(cd, &inbuf, &in_bytes_left, &outbuf, &out_bytes_left);
t.print_time();
} }
iconv_close(cd); iconv_close(cd);
} }
@ -101,6 +105,7 @@ int main(int argc, char** argv)
{ {
timer t(cout); timer t(cout);
iconv(cd, &inbuf, &in_bytes_left, &outbuf, &out_bytes_left); iconv(cd, &inbuf, &in_bytes_left, &outbuf, &out_bytes_left);
t.print_time();
} }
iconv_close(cd); iconv_close(cd);
} }
@ -111,6 +116,7 @@ int main(int argc, char** argv)
cout << "unchecked::utf16to8: "; cout << "unchecked::utf16to8: ";
timer t(cout); timer t(cout);
utf8::unchecked::utf16to8(utf16buf, utf16buf + wlength, buf); utf8::unchecked::utf16to8(utf16buf, utf16buf + wlength, buf);
t.print_time();
} }
{ {
@ -118,6 +124,7 @@ int main(int argc, char** argv)
cout << "utf16to8: "; cout << "utf16to8: ";
timer t(cout); timer t(cout);
utf8::utf16to8(utf16buf, utf16buf + wlength, buf); utf8::utf16to8(utf16buf, utf16buf + wlength, buf);
t.print_time();
} }
delete [] buf; delete [] buf;

View file

@ -3,19 +3,20 @@
struct timer { struct timer {
timer(std::ostream& report) : report(report) timer(std::ostream& report) : report(report)
{start = std::clock();} {start = std::clock();}
~timer()
void print_time()
{ {
using namespace std; using namespace std;
end = clock(); clock_t now = clock();
unsigned milliseconds = (end - start)*1000 / CLOCKS_PER_SEC; unsigned milliseconds = (now - start)*1000 / CLOCKS_PER_SEC;
report << "Spent " << milliseconds << "ms here\n"; report << "Spent " << milliseconds << "ms here\n";
} }
std::clock_t start; std::clock_t start;
std::clock_t end;
std::ostream& report; std::ostream& report;
private: private:
// just to suppress a VC++ 8.0 warning // just to suppress a VC++ 8.0 warning
timer& operator = (const timer&) {}; timer& operator = (const timer&);
timer(const timer&);
}; };

View file

@ -44,6 +44,7 @@ int main(int argc, char** argv)
cout << "utf8::utf8to16: "; cout << "utf8::utf8to16: ";
timer t(cout); timer t(cout);
utf8::utf8to16(buf, buf + length, utf16buf); utf8::utf8to16(buf, buf + length, utf16buf);
t.print_time();
} }
{ {
@ -52,36 +53,38 @@ int main(int argc, char** argv)
cout << "unchecked::utf8to16: "; cout << "unchecked::utf8to16: ";
timer t(cout); timer t(cout);
utf8::unchecked::utf8to16(buf, buf + length, utf16buf); utf8::unchecked::utf8to16(buf, buf + length, utf16buf);
t.print_time();
} }
// the UTF-16 result will not be larger than this (I hope :) ) // the UTF-16 result will not be larger than this (I hope :) )
wchar_t* utf16iconvbuf = new wchar_t[wlength]; wchar_t* utf16iconvbuf = new wchar_t[wlength];
{ {
memset (utf16iconvbuf, 0 , wlength * sizeof(wchar_t)); memset (utf16iconvbuf, 0 , wlength * sizeof(wchar_t));
// win32 // win32
cout << "win32: "; cout << "win32: ";
{ {
timer t(cout); timer t(cout);
MultiByteToWideChar(CP_UTF8, MB_ERR_INVALID_CHARS, buf, length, utf16iconvbuf, int(wlength)); MultiByteToWideChar(CP_UTF8, MB_ERR_INVALID_CHARS, buf, length, utf16iconvbuf, int(wlength));
t.print_time();
} }
} }
// just check the correctness while we are here: // just check the correctness while we are here:
if (!equal(utf16buf, utf16buf + wlength, utf16iconvbuf)) if (!equal(utf16buf, utf16buf + wlength, utf16iconvbuf))
cout << "Different result!!!"; cout << "Different result!!!";
// the other way around // the other way around
cout << "UTF16 to UTF-8\n"; cout << "UTF16 to UTF-8\n";
{ {
//win32 //win32
memset(buf, 0, length); memset(buf, 0, length);
cout<< "win32: "; cout<< "win32: ";
{ {
timer t(cout); timer t(cout);
WideCharToMultiByte(CP_UTF8, 0, utf16buf, int(wlength), buf, length, NULL, NULL); WideCharToMultiByte(CP_UTF8, 0, utf16buf, int(wlength), buf, length, NULL, NULL);
t.print_time();
} }
} }
@ -91,15 +94,17 @@ int main(int argc, char** argv)
cout << "unchecked::utf16to8: "; cout << "unchecked::utf16to8: ";
timer t(cout); timer t(cout);
utf8::unchecked::utf16to8(utf16buf, utf16buf + wlength, buf); utf8::unchecked::utf16to8(utf16buf, utf16buf + wlength, buf);
t.print_time();
} }
{ {
memset (buf, 0 , length); memset (buf, 0 , length);
cout << "utf16to8: "; cout << "utf16to8: ";
timer t(cout); timer t(cout);
utf8::utf16to8(utf16buf, utf16buf + wlength, buf); utf8::utf16to8(utf16buf, utf16buf + wlength, buf);
t.print_time();
} }
delete [] buf; delete [] buf;
delete [] utf16buf; delete [] utf16buf;
} }

View file

@ -0,0 +1,6 @@
# Compiler and flags used to build the regression test driver.
CC = g++
CFLAGS = -g -Wall -pedantic
# Headers of every regression-test release that reg_tests_driver.cpp includes.
# All four release directories are listed so that touching any of their
# headers triggers a rebuild.
REG_FILES = r1_0Beta1/*.h r1_0Beta2/*.h r1_0Beta3/*.h r2_2_2/*.h
regressiontest: reg_tests_driver.cpp ../../source/utf8.h ../../source/utf8/core.h ../../source/utf8/checked.h ../../source/utf8/unchecked.h $(REG_FILES)
	$(CC) $(CFLAGS) reg_tests_driver.cpp -o regressiontest

View file

@ -50,13 +50,6 @@ check (!is_valid(udbff_dc00, udbff_dc00 + 6));
unsigned char udbff_dfff[] = {0xed, 0xaf, 0xbf, 0xed, 0xbf, 0xbf}; unsigned char udbff_dfff[] = {0xed, 0xaf, 0xbf, 0xed, 0xbf, 0xbf};
check (!is_valid(udbff_dfff, udbff_dfff + 6)); check (!is_valid(udbff_dfff, udbff_dfff + 6));
// Other illegal code points
unsigned char ufffe[] = {0xef, 0xbf, 0xbe};
check (!is_valid(ufffe, ufffe + 3));
unsigned char uffff[] = {0xef, 0xbf, 0xbf};
check (!is_valid(uffff, uffff + 3));
} }
// [ 1525236 ] utf8::is_valid does not detect overlong sequences // [ 1525236 ] utf8::is_valid does not detect overlong sequences

View file

@ -4,8 +4,8 @@ using namespace utf8;
// [ 1538338 ] unchecked::next does not work correctly for 4-byte sequences. // [ 1538338 ] unchecked::next does not work correctly for 4-byte sequences.
void id_1538338() void id_1538338()
{ {
char* four_bytes = "\xf0\x90\x8d\x86"; const char* four_bytes = "\xf0\x90\x8d\x86";
char* it = four_bytes; const char* it = four_bytes;
int cp = unchecked::next(it); int cp = unchecked::next(it);
check (cp == 0x10346); check (cp == 0x10346);
} }

View file

@ -0,0 +1,46 @@
#include "../../../source/utf8.h"
using namespace utf8;
// [ 2852872 ] invalid utf16 strings were parsed without any error
void id_2852872()
{
const unsigned short two_trail_surrogates[] = {0xdd00, 0xdd01, 0};
vector<char> utf8_result;
try
{
utf8::utf16to8(two_trail_surrogates, two_trail_surrogates+2, back_inserter(utf8_result));
// should throw in the previous line and never get here
check(false);
}
catch(utf8::invalid_utf16&)
{
// this is what we expect
}
catch(...)
{
// an unexpected exception happened
check(false);
}
}
// [ 2857454 ] dereference invalid iterator when lead surrogate was last element of the string
void id_2857454()
{
const unsigned short lead_surrogate_last[] = {0x65, 0xd800, 0};
vector<char> utf8_result;
try
{
utf8::utf16to8(lead_surrogate_last, lead_surrogate_last + 2, back_inserter(utf8_result));
// should throw in the previous line and never get here
check(false);
}
catch(utf8::invalid_utf16&)
{
// this is what we expect
}
catch(...)
{
// an unexpected exception happened
check(false);
}
}

View file

@ -1,4 +1,5 @@
#include <iostream> #include <iostream>
#include <vector>
using namespace std; using namespace std;
inline void check_impl (bool condition, const char* file, int line) inline void check_impl (bool condition, const char* file, int line)
@ -18,6 +19,8 @@ inline void check_impl (bool condition, const char* file, int line)
// Release 1.0 Beta 3 // Release 1.0 Beta 3
#include "r1_0Beta3/basic_functionality.h" #include "r1_0Beta3/basic_functionality.h"
// Release 2.2.2
#include "r2_2_2/basic_functionality.h"
int main() int main()
@ -37,4 +40,10 @@ int main()
// Release 1.0 Beta 3 // Release 1.0 Beta 3
//r1_0Beta3/basic_functionality.h //r1_0Beta3/basic_functionality.h
id_1538338(); id_1538338();
// Release 2.2.2
//r2_2_2/basic_functionality.h
id_2852872();
id_2857454();
} }

View file

@ -33,7 +33,7 @@ print REPORT "\n";
print REPORT "==================Negative Test ==================\n"; print REPORT "==================Negative Test ==================\n";
close($report_name); close($report_name);
chdir 'negative'; chdir 'negative';
`./negative >> ../$report_name`; `./negative ../../test_data/negative/utf8_invalid.txt >> ../$report_name`;
chdir '..'; chdir '..';
die if !open(REPORT, ">>$report_name"); die if !open(REPORT, ">>$report_name");
print REPORT "==================End of negative test==================\n"; print REPORT "==================End of negative test==================\n";

View file

@ -0,0 +1,5 @@
# Compiler and flags used to build the smoke test.
CC = g++
CFLAGS = -g -Wall
# Compiles test.cpp into "smoketest" whenever it or one of the listed utf8
# headers changes.
smoketest: test.cpp ../../source/utf8.h ../../source/utf8/core.h ../../source/utf8/checked.h ../../source/utf8/unchecked.h
$(CC) $(CFLAGS) test.cpp -osmoketest

View file

@ -1,7 +1,6 @@
#include <cstring>
#include <cassert> #include <cassert>
#include <vector> #include <vector>
#include <iterator>
#include "../../source/utf8.h" #include "../../source/utf8.h"
using namespace utf8; using namespace utf8;
using namespace std; using namespace std;
@ -11,26 +10,27 @@ int main()
//append //append
unsigned char u[5] = {0,0,0,0,0}; unsigned char u[5] = {0,0,0,0,0};
unsigned char* end = append(0x0448, u); append(0x0448, u);
assert (u[0] == 0xd1 && u[1] == 0x88 && u[2] == 0 && u[3] == 0 && u[4] == 0); assert (u[0] == 0xd1 && u[1] == 0x88 && u[2] == 0 && u[3] == 0 && u[4] == 0);
end = append(0x65e5, u); append(0x65e5, u);
assert (u[0] == 0xe6 && u[1] == 0x97 && u[2] == 0xa5 && u[3] == 0 && u[4] == 0); assert (u[0] == 0xe6 && u[1] == 0x97 && u[2] == 0xa5 && u[3] == 0 && u[4] == 0);
end = append(0x3044, u); append(0x3044, u);
assert (u[0] == 0xe3 && u[1] == 0x81 && u[2] == 0x84 && u[3] == 0 && u[4] == 0); assert (u[0] == 0xe3 && u[1] == 0x81 && u[2] == 0x84 && u[3] == 0 && u[4] == 0);
end = append(0x10346, u); append(0x10346, u);
assert (u[0] == 0xf0 && u[1] == 0x90 && u[2] == 0x8d && u[3] == 0x86 && u[4] == 0); assert (u[0] == 0xf0 && u[1] == 0x90 && u[2] == 0x8d && u[3] == 0x86 && u[4] == 0);
//next //next
char* twochars = "\xe6\x97\xa5\xd1\x88"; const char* twochars = "\xe6\x97\xa5\xd1\x88";
char* w = twochars; const char* w = twochars;
int cp = next(w, twochars + 6); int cp = next(w, twochars + 6);
assert (cp == 0x65e5); assert (cp == 0x65e5);
assert (w == twochars + 3); assert (w == twochars + 3);
char* threechars = "\xf0\x90\x8d\x86\xe6\x97\xa5\xd1\x88"; const char* threechars = "\xf0\x90\x8d\x86\xe6\x97\xa5\xd1\x88";
w = threechars; w = threechars;
cp = next(w, threechars + 9); cp = next(w, threechars + 9);
assert (cp == 0x10346); assert (cp == 0x10346);
@ -42,8 +42,30 @@ int main()
assert (cp == 0x0448); assert (cp == 0x0448);
assert (w == threechars + 9); assert (w == threechars + 9);
//peek_next
const char* const cw = twochars;
cp = peek_next(cw, cw + 6);
assert (cp == 0x65e5);
assert (cw == twochars);
//previous //prior
w = twochars + 3;
cp = prior (w, twochars);
assert (cp == 0x65e5);
assert (w == twochars);
w = threechars + 9;
cp = prior(w, threechars);
assert (cp == 0x0448);
assert (w == threechars + 7);
cp = prior(w, threechars);
assert (cp == 0x65e5);
assert (w == threechars + 4);
cp = prior(w, threechars);
assert (cp == 0x10346);
assert (w == threechars);
//previous (deprecated)
w = twochars + 3; w = twochars + 3;
cp = previous (w, twochars - 1); cp = previous (w, twochars - 1);
assert (cp == 0x65e5); assert (cp == 0x65e5);
@ -117,10 +139,43 @@ int main()
bvalid = is_valid(utf8_with_surrogates, utf8_with_surrogates + 9); bvalid = is_valid(utf8_with_surrogates, utf8_with_surrogates + 9);
assert (bvalid == true); assert (bvalid == true);
//is_bom //starts_with_bom
unsigned char byte_order_mark[] = {0xef, 0xbb, 0xbf}; unsigned char byte_order_mark[] = {0xef, 0xbb, 0xbf};
bool bbom = is_bom(byte_order_mark); bool bbom = starts_with_bom(byte_order_mark, byte_order_mark + sizeof(byte_order_mark));
assert (bbom == true); assert (bbom == true);
bool no_bbom = starts_with_bom(threechars, threechars + sizeof(threechars));
assert (no_bbom == false);
//is_bom
bool unsafe_bbom = is_bom(byte_order_mark);
assert (unsafe_bbom == true);
//replace_invalid
char invalid_sequence[] = "a\x80\xe0\xa0\xc0\xaf\xed\xa0\x80z";
vector<char> replace_invalid_result(50);
replace_invalid (invalid_sequence, invalid_sequence + sizeof(invalid_sequence), replace_invalid_result.begin(), '?');
bvalid = is_valid(replace_invalid_result.begin(), replace_invalid_result.end());
assert (bvalid);
const char* fixed_invalid_sequence = "a????z";
assert (std::equal(replace_invalid_result.begin(), replace_invalid_result.begin() + sizeof(fixed_invalid_sequence), fixed_invalid_sequence));
// iterator
utf8::iterator<const char*> it(threechars, threechars, threechars + 9);
utf8::iterator<const char*> it2 = it;
assert (it2 == it);
assert (*it == 0x10346);
assert (*(++it) == 0x65e5);
assert ((*it++) == 0x65e5);
assert (*it == 0x0448);
assert (it != it2);
utf8::iterator<const char*> endit (threechars + 9, threechars, threechars + 9);
assert (++it == endit);
assert (*(--it) == 0x0448);
assert ((*it--) == 0x0448);
assert (*it == 0x65e5);
assert (--it == utf8::iterator<const char*>(threechars, threechars, threechars + 9));
assert (*it == 0x10346);
////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////
//// Unchecked variants //// Unchecked variants
@ -128,13 +183,13 @@ int main()
//append //append
memset(u, 0, 5); memset(u, 0, 5);
end = unchecked::append(0x0448, u); append(0x0448, u);
assert (u[0] == 0xd1 && u[1] == 0x88 && u[2] == 0 && u[3] == 0 && u[4] == 0); assert (u[0] == 0xd1 && u[1] == 0x88 && u[2] == 0 && u[3] == 0 && u[4] == 0);
end = unchecked::append(0x65e5, u); append(0x65e5, u);
assert (u[0] == 0xe6 && u[1] == 0x97 && u[2] == 0xa5 && u[3] == 0 && u[4] == 0); assert (u[0] == 0xe6 && u[1] == 0x97 && u[2] == 0xa5 && u[3] == 0 && u[4] == 0);
end = unchecked::append(0x10346, u); append(0x10346, u);
assert (u[0] == 0xf0 && u[1] == 0x90 && u[2] == 0x8d && u[3] == 0x86 && u[4] == 0); assert (u[0] == 0xf0 && u[1] == 0x90 && u[2] == 0x8d && u[3] == 0x86 && u[4] == 0);
//next //next
@ -154,8 +209,14 @@ int main()
assert (cp == 0x0448); assert (cp == 0x0448);
assert (w == threechars + 9); assert (w == threechars + 9);
//peek_next
cp = unchecked::peek_next(cw);
assert (cp == 0x65e5);
assert (cw == twochars);
//previous (calls prior internally)
//previous
w = twochars + 3; w = twochars + 3;
cp = unchecked::previous (w); cp = unchecked::previous (w);
assert (cp == 0x65e5); assert (cp == 0x65e5);
@ -214,6 +275,23 @@ int main()
// try it with the return value; // try it with the return value;
utf16_end = utf8to16 (utf8_with_surrogates, utf8_with_surrogates + 9, &utf16result[0]); utf16_end = utf8to16 (utf8_with_surrogates, utf8_with_surrogates + 9, &utf16result[0]);
assert (utf16_end == &utf16result[0] + 4); assert (utf16_end == &utf16result[0] + 4);
// iterator
utf8::unchecked::iterator<const char*> un_it(threechars);
utf8::unchecked::iterator<const char*> un_it2 = un_it;
assert (un_it2 == un_it);
assert (*un_it == 0x10346);
assert (*(++un_it) == 0x65e5);
assert ((*un_it++) == 0x65e5);
assert (un_it != un_it2);
assert (*un_it == 0x0448);
utf8::unchecked::iterator<const char*> un_endit (threechars + 9);
assert (++un_it == un_endit);
assert (*(--un_it) == 0x0448);
assert ((*un_it--) == 0x0448);
assert (*un_it == 0x65e5);
assert (--un_it == utf8::unchecked::iterator<const char*>(threechars));
assert (*un_it == 0x10346);
} }

View file

@ -0,0 +1,5 @@
# Compiler and flags used to build the utf8reader test.
CC = g++
CFLAGS = -g -Wall -pedantic
# Compiles utf8reader.cpp whenever it or one of the listed utf8 headers changes.
# NOTE: the target is called "utf8readertest" while the recipe writes a binary
# named "utf8reader", so make never finds a file matching the target name.
utf8readertest: utf8reader.cpp ../../source/utf8.h ../../source/utf8/core.h ../../source/utf8/checked.h ../../source/utf8/unchecked.h
$(CC) $(CFLAGS) utf8reader.cpp -o utf8reader

View file

@ -59,7 +59,9 @@ int main(int argc, char** argv)
unsigned char_count = 0; unsigned char_count = 0;
string::iterator it = line_start; string::iterator it = line_start;
while (it != line_end) { while (it != line_end) {
next(it, line_end); unsigned int next_cp = peek_next(it, line_end);
if (next(it, line_end) != next_cp)
cout << "Line " << line_count << ": Error: peek_next gave a different result than next" << '\n';
char_count++; char_count++;
} }
if (char_count != utf32_line.size()) if (char_count != utf32_line.size())
@ -80,6 +82,18 @@ int main(int argc, char** argv)
if (char_count != 0) if (char_count != 0)
cout << "Line " << line_count << ": Error in iterating with previous - wrong number of characters" << '\n'; cout << "Line " << line_count << ": Error in iterating with previous - wrong number of characters" << '\n';
// Try utf8::iterator
utf8::iterator<string::iterator> u8it(line_start, line_start, line_end);
if (!utf32_line.empty() && *u8it != utf32_line.at(0))
cout << "Line " << line_count << ": Error in utf::iterator * operator" << '\n';
if (std::distance(u8it, utf8::iterator<string::iterator>(line_end, line_start, line_end)) != static_cast<int>(utf32_line.size()))
cout << "Line " << line_count << ": Error in using utf::iterator with std::distance - wrong number of characters" << '\n';
std::advance(u8it, utf32_line.size());
if (u8it != utf8::iterator<string::iterator>(line_end, line_start, line_end))
cout << "Line " << line_count << ": Error in using utf::iterator with std::advance" << '\n';
//======================== Now, the unchecked versions ====================== //======================== Now, the unchecked versions ======================
// Convert it to utf-16 and compare to the checked version // Convert it to utf-16 and compare to the checked version
vector<unsigned short> utf16_line_unchecked; vector<unsigned short> utf16_line_unchecked;
@ -109,7 +123,9 @@ int main(int argc, char** argv)
char_count = 0; char_count = 0;
it = line_start; it = line_start;
while (it != line_end) { while (it != line_end) {
unchecked::next(it); unsigned int next_cp = unchecked::peek_next(it);
if (unchecked::next(it) != next_cp)
cout << "Line " << line_count << ": Error: unchecked::peek_next gave a different result than unchecked::next" << '\n';;
char_count++; char_count++;
} }
if (char_count != utf32_line.size()) if (char_count != utf32_line.size())
@ -130,5 +146,15 @@ int main(int argc, char** argv)
if (char_count != 0) if (char_count != 0)
cout << "Line " << line_count << ": Error in iterating with unchecked::previous - wrong number of characters" << '\n'; cout << "Line " << line_count << ": Error in iterating with unchecked::previous - wrong number of characters" << '\n';
// Try utf8::unchecked::iterator
utf8::unchecked::iterator<string::iterator> un_u8it(line_start);
if (!utf32_line.empty() && *un_u8it != utf32_line.at(0))
cout << "Line " << line_count << ": Error in utf::unchecked::iterator * operator" << '\n';
if (std::distance(un_u8it, utf8::unchecked::iterator<string::iterator>(line_end)) != static_cast<int>(utf32_line.size()))
cout << "Line " << line_count << ": Error in using utf::unchecked::iterator with std::distance - wrong number of characters" << '\n';
std::advance(un_u8it, utf32_line.size());
if (un_u8it != utf8::unchecked::iterator<string::iterator>(line_end))
cout << "Line " << line_count << ": Error in using utf::unchecked::iterator with std::advance" << '\n';
} }
} }

147
v3_0/src/utf8.h Normal file
View file

@ -0,0 +1,147 @@
// Copyright 2006-2013 Nemanja Trifunovic
/*
Permission is hereby granted, free of charge, to any person or organization
obtaining a copy of the software and accompanying documentation covered by
this license (the "Software") to use, reproduce, display, distribute,
execute, and transmit the Software, and to prepare derivative works of the
Software, and to permit third-parties to whom the Software is furnished to
do so, all subject to the following:
The copyright notices in the Software and this entire statement, including
the above license grant, this restriction and the following disclaimer,
must be included in all copies of the Software, in whole or in part, and
all derivative works of the Software, unless such copies or derivative
works are solely in the form of machine-executable object code generated by
a source language processor.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT
SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE
FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE,
ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
DEALINGS IN THE SOFTWARE.
*/
#ifndef UTF8_FOR_CPP_2675DCD0_9480_4c0c_B92A_CC14C027B731
#define UTF8_FOR_CPP_2675DCD0_9480_4c0c_B92A_CC14C027B731
// By default, utf8 cpp requires C++ Standard Library strings and exceptions
// The following macros can be used to change the default behavior
// #define UTF_CPP_NO_STD_STRING
// #define UTF_CPP_NO_EXCEPTIONS
#ifndef UTF_CPP_NO_EXCEPTIONS
#include <stdexcept>
#ifndef UTF_CPP_NO_STD_STRING
#include <string>
#include <iterator>
#endif // #ifndef UTF_CPP_NO_STD_STRING
#endif // #ifndef UTF_CPP_NO_EXCEPTIONS
namespace utf8
{
    // Error codes - used internally and if exceptions disabled
    enum class utf_error {UTF8_OK, NOT_ENOUGH_ROOM, INVALID_LEAD,
                          INCOMPLETE_SEQUENCE, OVERLONG_SEQUENCE, INVALID_CODE_POINT};

#ifndef UTF_CPP_NO_EXCEPTIONS
    // Base for the exceptions that may be thrown from the library
    class exception : public ::std::exception {
    };

    // Exceptions that may be thrown from the library functions.

    // Thrown by the throwing append() overloads when asked to encode a value
    // that is_code_point_valid() rejects; carries the offending value.
    class invalid_code_point : public exception {
        char32_t cp;
    public:
        invalid_code_point(char32_t cp) : cp(cp) {}
        virtual const char* what() const noexcept override { return "Invalid code point"; }
        char32_t code_point() const {return cp;}
    };
#endif // #ifndef UTF_CPP_NO_EXCEPTIONS

    // Helper code - not intended to be directly called by the library users. May be changed at any time
    namespace internal
    {
        // Unicode constants
        // Leading (high) surrogates: 0xd800 - 0xdbff
        // Trailing (low) surrogates: 0xdc00 - 0xdfff
        const char32_t LEAD_SURROGATE_MIN  = 0x0000d800;
        const char32_t LEAD_SURROGATE_MAX  = 0x0000dbff;
        const char32_t TRAIL_SURROGATE_MIN = 0x0000dc00;
        const char32_t TRAIL_SURROGATE_MAX = 0x0000dfff;

        // Maximum valid value for a Unicode code point
        const char32_t CODE_POINT_MAX      = 0x0010ffff;

        // True when cp lies anywhere in the surrogate range (lead or trail).
        inline bool is_surrogate(char32_t cp)
        {
            return (cp >= LEAD_SURROGATE_MIN && cp <= TRAIL_SURROGATE_MAX);
        }

        // True when cp does not exceed CODE_POINT_MAX and is not a surrogate.
        inline bool is_code_point_valid(char32_t cp)
        {
            return (cp <= CODE_POINT_MAX && !utf8::internal::is_surrogate(cp));
        }
    } // namespace internal

    /// The library API - functions intended to be called by the users

    // Encodes cp as UTF-8 and writes the octets through result.
    //
    // cp     - the code point to encode
    // result - output iterator that receives the one to four octets
    // error  - always written: UTF8_OK on success, INVALID_CODE_POINT when cp
    //          fails is_code_point_valid(); in the latter case nothing is
    //          written through result
    //
    // Returns the iterator past the last octet written, or result unchanged
    // when cp is rejected.
    template <typename octet_iterator>
    octet_iterator append(char32_t cp, octet_iterator result, utf_error& error)
    {
        if (!utf8::internal::is_code_point_valid(cp)) {
            error = utf8::utf_error::INVALID_CODE_POINT;
            return result;
        }

        // Report success explicitly so a stale value left in error by an
        // earlier call cannot be mistaken for a failure of this one.
        error = utf8::utf_error::UTF8_OK;

        if (cp < 0x80)                        // one octet
            *(result++) = static_cast<char>(cp);
        else if (cp < 0x800) {                // two octets
            *(result++) = static_cast<char>((cp >> 6)            | 0xc0);
            *(result++) = static_cast<char>((cp & 0x3f)          | 0x80);
        }
        else if (cp < 0x10000) {              // three octets
            *(result++) = static_cast<char>((cp >> 12)           | 0xe0);
            *(result++) = static_cast<char>(((cp >> 6) & 0x3f)   | 0x80);
            *(result++) = static_cast<char>((cp & 0x3f)          | 0x80);
        }
        else {                                // four octets
            *(result++) = static_cast<char>((cp >> 18)           | 0xf0);
            *(result++) = static_cast<char>(((cp >> 12) & 0x3f)  | 0x80);
            *(result++) = static_cast<char>(((cp >> 6) & 0x3f)   | 0x80);
            *(result++) = static_cast<char>((cp & 0x3f)          | 0x80);
        }
        return result;
    }

#ifndef UTF_CPP_NO_EXCEPTIONS
    // Throwing variant of append(): encodes cp through result and returns the
    // iterator past the last octet written. Throws utf8::invalid_code_point
    // when cp is rejected by the error-code overload.
    template <typename octet_iterator>
    octet_iterator append(char32_t cp, octet_iterator result)
    {
        utf8::utf_error err {utf8::utf_error::UTF8_OK};
        // Keep the advanced iterator: result is passed by value, so without
        // the assignment the caller would get back the start position.
        result = utf8::append(cp, result, err);
        if (err != utf8::utf_error::UTF8_OK)
            throw utf8::invalid_code_point(cp);
        return result;
    }

#ifndef UTF_CPP_NO_STD_STRING
    // Appends the UTF-8 encoding of cp to the end of str. Throws
    // utf8::invalid_code_point when cp is rejected.
    inline void append(char32_t cp, std::string& str)
    {
        utf8::append(cp, std::back_inserter(str));
    }
#endif // #ifndef UTF_CPP_NO_STD_STRING

#endif // #ifndef UTF_CPP_NO_EXCEPTIONS
} // namespace utf8
#endif // header guard

6
v3_0/tests/Makefile Normal file
View file

@ -0,0 +1,6 @@
# Compiler and flags used to build the v3 unit tests (C++11 is required).
CC = g++
CFLAGS = -g -Wall --std=c++11
# Compiles unit.cpp into "unit", linking against boost_unit_test_framework,
# and then runs the resulting binary.
smoketest: unit.cpp ../src/utf8.h
$(CC) $(CFLAGS) unit.cpp -ounit -lboost_unit_test_framework
./unit

19
v3_0/tests/unit.cpp Normal file
View file

@ -0,0 +1,19 @@
#define BOOST_TEST_DYN_LINK
#define BOOST_TEST_MODULE UTF8_CPP_UNIT
#include <boost/test/unit_test.hpp>
#include "../src/utf8.h"
using namespace std;
BOOST_AUTO_TEST_CASE(append)
{
string s;
BOOST_CHECK_NO_THROW (utf8::append(U'\U00000448', s));
BOOST_CHECK (s.length() == 2 && s[0] == '\xd1' && s[1] == '\x88');
s.erase();
BOOST_CHECK_NO_THROW(utf8::append(U'\U000065e5', s));
BOOST_CHECK (s.length() == 3 && s[0] == '\xe6' && s[1] == '\x97' && s[2] == '\xa5');
}