diff --git a/v2_1/buildrelease.pl b/v2_1/buildrelease.pl
new file mode 100755
index 0000000..ed3a17b
--- /dev/null
+++ b/v2_1/buildrelease.pl
@@ -0,0 +1,18 @@
+#! /usr/bin/perl
+
+$release_files = 'source/utf8.h source/utf8/core.h source/utf8/checked.h source/utf8/unchecked.h doc/utf8cpp.html doc/ReleaseNotes';
+
+# First get the latest version
+`svn update`;
+
+# Then construct the name of the zip file
+$argc = @ARGV;
+if ($argc > 0) {
+ $zip_name = $ARGV[0];
+}
+else {
+ $zip_name = "utf8";
+}
+
+# Zip the files to an archive
+`zip $zip_name $release_files`;
diff --git a/v2_1/doc/ReleaseNotes b/v2_1/doc/ReleaseNotes
new file mode 100644
index 0000000..c061666
--- /dev/null
+++ b/v2_1/doc/ReleaseNotes
@@ -0,0 +1,18 @@
+utf8 cpp library
+Release 2.0
+
+This is the first production release of the version 2.0 of the library. Please note that the branch 1.x is still being developed, and will be maintained in the future.
+
+Changes from version 2.o Beta 1:
+- A minor fix in the utf::iterator
+- Updated documentation
+
+
+Changes from version 1.02:
+
+- Feature request [1569543]: Add utf8 iterator adapter class
+- Feature request [1569479]: Add replace_invalid function
+- Reorganized the source code: rather than a single header file, the library now resides in three separate ones. To maintain the backward compatibility, there is also a convenience header utf8.h that includes all the other ones.
+- The requironment for octet_iterator is bidirectional, rather than random access, although in practice it probably doesn't mean a lot.
+
+Files included in the release: utf8.h, core.h, checked.h, unchecked.h, utf8cpp.html, ReleaseNotes
diff --git a/v2_1/doc/utf8cpp.html b/v2_1/doc/utf8cpp.html
new file mode 100644
index 0000000..63e9afd
--- /dev/null
+++ b/v2_1/doc/utf8cpp.html
@@ -0,0 +1,1498 @@
+
+
+
+
+
+
+
+
+ UTF8-CPP: UTF-8 with C++ in a Portable Way
+
+
+
+
+
+ UTF8-CPP: UTF-8 with C++ in a Portable Way
+
+
+ The Sourceforge project page
+
+
+
+ Table of Contents
+
+
+
+
+ Introduction
+
+
+ Many C++ developers miss an easy and portable way of handling Unicode encoded
+ strings. C++ Standard is currently Unicode agnostic, and while some work is being
+ done to introduce Unicode to the next incarnation called C++0x, for the moment
+ nothing of the sort is available. In the meantime, developers use 3rd party
+ libraries like ICU, OS specific capabilities, or simply roll out their own
+ solutions.
+
+
+ In order to easily handle UTF-8 encoded Unicode strings, I have come up with a small
+ generic library. For anybody used to work with STL algorithms and iterators, it should be
+ easy and natural to use. The code is freely available for any purpose - check out
+ the license at the beginning of the utf8.h file. If you run into
+ bugs or performance issues, please let me know and I'll do my best to address them.
+
+
+ The purpose of this article is not to offer an introduction to Unicode in general,
+ and UTF-8 in particular. If you are not familiar with Unicode, be sure to check out
+ Unicode Home Page or some other source of
+ information for Unicode. Also, it is not my aim to advocate the use of UTF-8
+ encoded strings in C++ programs; if you want to handle UTF-8 encoded strings from
+ C++, I am sure you have good reasons for it.
+
+
+ Examples of use
+
+
+ To illustrate the use of this utf8 library, we shall open a file containing UTF-8
+ encoded text, check whether it starts with a byte order mark, read each line into a
+ std::string
, check it for validity, convert the text to UTF-16, and
+ back to UTF-8:
+
+
+#include <fstream>
+#include <iostream>
+#include <string>
+#include <vector>
+#include "utf8.h"
+using namespace std;
+int main()
+{
+ if (argc != 2) {
+ cout << "\nUsage: docsample filename\n";
+ return 0;
+ }
+ const char* test_file_path = argv[1];
+
+ ifstream fs8(test_file_path);
+ if (!fs8.is_open()) {
+ cout << "Could not open " << test_file_path << endl;
+ return 0;
+ }
+
+ unsigned line_count = 1;
+ string line;
+ if (!getline(fs8, line))
+ return 0;
+
+ if (line.size() > 2) {
+ if (utf8::is_bom(line.c_str()))
+ cout << "There is a byte order mark at the beginning of the file\n";
+ }
+
+ do {
+
+ string::iterator end_it = utf8::find_invalid(line.begin(), line.end());
+ if (end_it != line.end()) {
+ cout << "Invalid UTF-8 encoding detected at line " << line_count << "\n";
+ cout << "This part is fine: " << string(line.begin(), end_it) << "\n";
+ }
+
+ int length = utf8::distance(line.begin(), end_it);
+ cout << "Length of line " << line_count << " is " << length << "\n";
+
+ vector<unsigned short> utf16line;
+ utf8::utf8to16(line.begin(), end_it, back_inserter(utf16line));
+
+ string utf8line;
+ utf8::utf16to8(utf16line.begin(), utf16line.end(), back_inserter(utf8line));
+
+ if (utf8line != string(line.begin(), end_it))
+ cout << "Error in UTF-16 conversion at line: " << line_count << "\n";
+ getline(fs8, line);
+ line_count++;
+ } while (!fs8.eof());
+ return 0;
+}
+
+
+ In the previous code sample, we have seen the use of the following functions from
+ utf8
namespace: first we used is_bom
function to detect
+ UTF-8 byte order mark at the beginning of the file; then for each line we performed
+ a detection of invalid UTF-8 sequences with find_invalid
; the number
+ of characters (more precisely - the number of Unicode code points) in each line was
+ determined with a use of utf8::distance
; finally, we have converted
+ each line to UTF-16 encoding with utf8to16
and back to UTF-8 with
+ utf16to8
.
+
+
+ Reference
+
+
+ Functions From utf8 Namespace
+
+
+ utf8::append
+
+
+ Available in version 1.0 and later.
+
+
+ Encodes a 32 bit code point as a UTF-8 sequence of octets and appends the sequence
+ to a UTF-8 string.
+
+
+template <typename octet_iterator>
+octet_iterator append(uint32_t cp, octet_iterator result);
+
+
+
+ cp
: A 32 bit integer representing a code point to append to the
+ sequence.
+ result
: An output iterator to the place in the sequence where to
+ append the code point.
+ Return value: An iterator pointing to the place
+ after the newly appended sequence.
+
+
+ Example of use:
+
+
+unsigned char u[5] = {0,0,0,0,0};
+unsigned char* end = append(0x0448, u);
+assert (u[0] == 0xd1 && u[1] == 0x88 && u[2] == 0 && u[3] == 0 && u[4] == 0);
+
+
+ Note that append
does not allocate any memory - it is the burden of
+ the caller to make sure there is enough memory allocated for the operation. To make
+ things more interesting, append
can add anywhere between 1 and 4
+ octets to the sequence. In practice, you would most often want to use
+ std::back_inserter
to ensure that the necessary memory is allocated.
+
+
+ In case of an invalid code point, a utf8::invalid_code_point
exception
+ is thrown.
+
+
+ utf8::next
+
+
+ Available in version 1.0 and later.
+
+
+ Given the iterator to the beginning of the UTF-8 sequence, it returns the code
+ point and moves the iterator to the next position.
+
+
+template <typename octet_iterator>
+uint32_t next(octet_iterator& it, octet_iterator end);
+
+
+
+ it
: a reference to an iterator pointing to the beginning of an UTF-8
+ encoded code point. After the function returns, it is incremented to point to the
+ beginning of the next code point.
+ end
: end of the UTF-8 sequence to be processed. If it
+ gets equal to end
during the extraction of a code point, an
+ utf8::not_enough_room
exception is thrown.
+ Return value: the 32 bit representation of the
+ processed UTF-8 code point.
+
+
+ Example of use:
+
+
+char* twochars = "\xe6\x97\xa5\xd1\x88";
+char* w = twochars;
+int cp = next(w, twochars + 6);
+assert (cp == 0x65e5);
+assert (w == twochars + 3);
+
+
+ This function is typically used to iterate through a UTF-8 encoded string.
+
+
+ In case of an invalid UTF-8 seqence, a utf8::invalid_utf8
exception is
+ thrown.
+
+
+ utf8::prior
+
+
+ Available in version 1.02 and later.
+
+
+ Given a reference to an iterator pointing to an octet in a UTF-8 seqence, it
+ decreases the iterator until it hits the beginning of the previous UTF-8 encoded
+ code point and returns the 32 bits representation of the code point.
+
+
+template <typename octet_iterator>
+uint32_t prior(octet_iterator& it, octet_iterator start);
+
+
+
+ it
: a reference pointing to an octet within a UTF-8 encoded string.
+ After the function returns, it is decremented to point to the beginning of the
+ previous code point.
+ start
: an iterator to the beginning of the sequence where the search
+ for the beginning of a code point is performed. It is a
+ safety measure to prevent passing the beginning of the string in the search for a
+ UTF-8 lead octet.
+ Return value: the 32 bit representation of the
+ previous code point.
+
+
+ Example of use:
+
+
+char* twochars = "\xe6\x97\xa5\xd1\x88";
+unsigned char* w = twochars + 3;
+int cp = prior (w, twochars);
+assert (cp == 0x65e5);
+assert (w == twochars);
+
+
+ This function has two purposes: one is two iterate backwards through a UTF-8
+ encoded string. Note that it is usually a better idea to iterate forward instead,
+ since utf8::next
is faster. The second purpose is to find a beginning
+ of a UTF-8 sequence if we have a random position within a string.
+
+
+ it
will typically point to the beginning of
+ a code point, and start
will point to the
+ beginning of the string to ensure we don't go backwards too far. it
is
+ decreased until it points to a lead UTF-8 octet, and then the UTF-8 sequence
+ beginning with that octet is decoded to a 32 bit representation and returned.
+
+
+ In case pass_end
is reached before a UTF-8 lead octet is hit, or if an
+ invalid UTF-8 sequence is started by the lead octet, an invalid_utf8
+ exception is thrown.
+
+
+ utf8::previous
+
+
+ Deprecated in version 1.02 and later.
+
+
+ Given a reference to an iterator pointing to an octet in a UTF-8 seqence, it
+ decreases the iterator until it hits the beginning of the previous UTF-8 encoded
+ code point and returns the 32 bits representation of the code point.
+
+
+template <typename octet_iterator>
+uint32_t previous(octet_iterator& it, octet_iterator pass_start);
+
+
+
+ it
: a reference pointing to an octet within a UTF-8 encoded string.
+ After the function returns, it is decremented to point to the beginning of the
+ previous code point.
+ pass_start
: an iterator to the point in the sequence where the search
+ for the beginning of a code point is aborted if no result was reached. It is a
+ safety measure to prevent passing the beginning of the string in the search for a
+ UTF-8 lead octet.
+ Return value: the 32 bit representation of the
+ previous code point.
+
+
+ Example of use:
+
+
+char* twochars = "\xe6\x97\xa5\xd1\x88";
+unsigned char* w = twochars + 3;
+int cp = previous (w, twochars - 1);
+assert (cp == 0x65e5);
+assert (w == twochars);
+
+
+ utf8::previous
is deprecated, and utf8::prior
should
+ be used instead, although the existing code can continue using this function.
+ The problem is the parameter pass_start
that points to the position
+ just before the beginning of the sequence. Standard containers don't have the
+ concept of "pass start" and the function can not be used with their iterators.
+
+
+ it
will typically point to the beginning of
+ a code point, and pass_start
will point to the octet just before the
+ beginning of the string to ensure we don't go backwards too far. it
is
+ decreased until it points to a lead UTF-8 octet, and then the UTF-8 sequence
+ beginning with that octet is decoded to a 32 bit representation and returned.
+
+
+ In case pass_end
is reached before a UTF-8 lead octet is hit, or if an
+ invalid UTF-8 sequence is started by the lead octet, an invalid_utf8
+ exception is thrown
+
+
+ utf8::advance
+
+
+ Available in version 1.0 and later.
+
+
+ Advances an iterator by the specified number of code points within an UTF-8
+ sequence.
+
+
+template <typename octet_iterator, typename distance_type>
+void advance (octet_iterator& it, distance_type n, octet_iterator end);
+
+
+
+ it
: a reference to an iterator pointing to the beginning of an UTF-8
+ encoded code point. After the function returns, it is incremented to point to the
+ nth following code point.
+ n
: a positive integer that shows how many code points we want to
+ advance.
+ end
: end of the UTF-8 sequence to be processed. If it
+ gets equal to end
during the extraction of a code point, an
+ utf8::not_enough_room
exception is thrown.
+
+
+ Example of use:
+
+
+char* twochars = "\xe6\x97\xa5\xd1\x88";
+unsigned char* w = twochars;
+advance (w, 2, twochars + 6);
+assert (w == twochars + 5);
+
+
+ This function works only "forward". In case of a negative n
, there is
+ no effect.
+
+
+ In case of an invalid code point, a utf8::invalid_code_point
exception
+ is thrown.
+
+
+ utf8::distance
+
+
+ Available in version 1.0 and later.
+
+
+ Given the iterators to two UTF-8 encoded code points in a seqence, returns the
+ number of code points between them.
+
+
+template <typename octet_iterator>
+typename std::iterator_traits<octet_iterator>::difference_type distance (octet_iterator first, octet_iterator last);
+
+
+
+ first
: an iterator to a beginning of a UTF-8 encoded code point.
+ last
: an iterator to a "post-end" of the last UTF-8 encoded code
+ point in the sequence we are trying to determine the length. It can be the
+ beginning of a new code point, or not.
+ Return value the distance between the iterators,
+ in code points.
+
+
+ Example of use:
+
+
+char* twochars = "\xe6\x97\xa5\xd1\x88";
+size_t dist = utf8::distance(twochars, twochars + 5);
+assert (dist == 2);
+
+
+ This function is used to find the length (in code points) of a UTF-8 encoded
+ string. The reason it is called distance, rather than, say,
+ length is mainly because developers are used that length is an
+ O(1) function. Computing the length of an UTF-8 string is a linear operation, and
+ it looked better to model it after std::distance
algorithm.
+
+
+ In case of an invalid UTF-8 seqence, a utf8::invalid_utf8
exception is
+ thrown. If last
does not point to the past-of-end of a UTF-8 seqence,
+ a utf8::not_enough_room
exception is thrown.
+
+
+ utf8::utf16to8
+
+
+ Available in version 1.0 and later.
+
+
+ Converts a UTF-16 encoded string to UTF-8.
+
+
+template <typename u16bit_iterator, typename octet_iterator>
+octet_iterator utf16to8 (u16bit_iterator start, u16bit_iterator end, octet_iterator result);
+
+
+
+ start
: an iterator pointing to the beginning of the UTF-16 encoded
+ string to convert.
+ end
: an iterator pointing to pass-the-end of the UTF-16 encoded
+ string to convert.
+ result
: an output iterator to the place in the UTF-8 string where to
+ append the result of conversion.
+ Return value: An iterator pointing to the place
+ after the appended UTF-8 string.
+
+
+ Example of use:
+
+
+unsigned short utf16string[] = {0x41, 0x0448, 0x65e5, 0xd834, 0xdd1e};
+vector<unsigned char> utf8result;
+utf16to8(utf16string, utf16string + 5, back_inserter(utf8result));
+assert (utf8result.size() == 10);
+
+
+ In case of invalid UTF-16 sequence, a utf8::invalid_utf16
exception is
+ thrown.
+
+
+ utf8::utf8to16
+
+
+ Available in version 1.0 and later.
+
+
+ Converts an UTF-8 encoded string to UTF-16
+
+
+template <typename u16bit_iterator, typename octet_iterator>
+u16bit_iterator utf8to16 (octet_iterator start, octet_iterator end, u16bit_iterator result);
+
+
+
+ start
: an iterator pointing to the beginning of the UTF-8 encoded
+ string to convert. < br /> end
: an iterator pointing to
+ pass-the-end of the UTF-8 encoded string to convert.
+ result
: an output iterator to the place in the UTF-16 string where to
+ append the result of conversion.
+ Return value: An iterator pointing to the place
+ after the appended UTF-16 string.
+
+
+ Example of use:
+
+
+char utf8_with_surrogates[] = "\xe6\x97\xa5\xd1\x88\xf0\x9d\x84\x9e";
+vector <unsigned short> utf16result;
+utf8to16(utf8_with_surrogates, utf8_with_surrogates + 9, back_inserter(utf16result));
+assert (utf16result.size() == 4);
+assert (utf16result[2] == 0xd834);
+assert (utf16result[3] == 0xdd1e);
+
+
+ In case of an invalid UTF-8 seqence, a utf8::invalid_utf8
exception is
+ thrown. If end
does not point to the past-of-end of a UTF-8 seqence, a
+ utf8::not_enough_room
exception is thrown.
+
+
+ utf8::utf32to8
+
+
+ Available in version 1.0 and later.
+
+
+ Converts a UTF-32 encoded string to UTF-8.
+
+
+template <typename octet_iterator, typename u32bit_iterator>
+octet_iterator utf32to8 (u32bit_iterator start, u32bit_iterator end, octet_iterator result);
+
+
+
+ start
: an iterator pointing to the beginning of the UTF-32 encoded
+ string to convert.
+ end
: an iterator pointing to pass-the-end of the UTF-32 encoded
+ string to convert.
+ result
: an output iterator to the place in the UTF-8 string where to
+ append the result of conversion.
+ Return value: An iterator pointing to the place
+ after the appended UTF-8 string.
+
+
+ Example of use:
+
+
+int utf32string[] = {0x448, 0x65E5, 0x10346, 0};
+vector<unsigned char> utf8result;
+utf32to8(utf32string, utf32string + 3, back_inserter(utf8result));
+assert (utf8result.size() == 9);
+
+
+ In case of invalid UTF-32 string, a utf8::invalid_code_point
exception
+ is thrown.
+
+
+ utf8::utf8to32
+
+
+ Available in version 1.0 and later.
+
+
+ Converts a UTF-8 encoded string to UTF-32.
+
+
+template <typename octet_iterator, typename u32bit_iterator>
+u32bit_iterator utf8to32 (octet_iterator start, octet_iterator end, u32bit_iterator result);
+
+
+
+ start
: an iterator pointing to the beginning of the UTF-8 encoded
+ string to convert.
+ end
: an iterator pointing to pass-the-end of the UTF-8 encoded string
+ to convert.
+ result
: an output iterator to the place in the UTF-32 string where to
+ append the result of conversion.
+ Return value: An iterator pointing to the place
+ after the appended UTF-32 string.
+
+
+ Example of use:
+
+
+char* twochars = "\xe6\x97\xa5\xd1\x88";
+vector<int> utf32result;
+utf8to32(twochars, twochars + 5, back_inserter(utf32result));
+assert (utf32result.size() == 2);
+
+
+ In case of an invalid UTF-8 seqence, a utf8::invalid_utf8
exception is
+ thrown. If end
does not point to the past-of-end of a UTF-8 seqence, a
+ utf8::not_enough_room
exception is thrown.
+
+
+ utf8::find_invalid
+
+
+ Available in version 1.0 and later.
+
+
+ Detects an invalid sequence within a UTF-8 string.
+
+
+template <typename octet_iterator>
+octet_iterator find_invalid(octet_iterator start, octet_iterator end);
+
+
+ start
: an iterator pointing to the beginning of the UTF-8 string to
+ test for validity.
+ end
: an iterator pointing to pass-the-end of the UTF-8 string to test
+ for validity.
+ Return value: an iterator pointing to the first
+ invalid octet in the UTF-8 string. In case none were found, equals
+ end
.
+
+
+ Example of use:
+
+
+char utf_invalid[] = "\xe6\x97\xa5\xd1\x88\xfa";
+char* invalid = find_invalid(utf_invalid, utf_invalid + 6);
+assert (invalid == utf_invalid + 5);
+
+
+ This function is typically used to make sure a UTF-8 string is valid before
+ processing it with other functions. It is especially important to call it if before
+ doing any of the unchecked operations on it.
+
+
+ utf8::is_valid
+
+
+ Available in version 1.0 and later.
+
+
+ Checks whether a sequence of octets is a valid UTF-8 string.
+
+
+template <typename octet_iterator>
+bool is_valid(octet_iterator start, octet_iterator end);
+
+
+
+ start
: an iterator pointing to the beginning of the UTF-8 string to
+ test for validity.
+ end
: an iterator pointing to pass-the-end of the UTF-8 string to test
+ for validity.
+ Return value: true
if the sequence
+ is a valid UTF-8 string; false
if not.
+
+ Example of use:
+
+char utf_invalid[] = "\xe6\x97\xa5\xd1\x88\xfa";
+bool bvalid = is_valid(utf_invalid, utf_invalid + 6);
+assert (bvalid == false);
+
+
+ is_valid
is a shorthand for find_invalid(start, end) ==
+ end;
. You may want to use it to make sure that a byte seqence is a valid
+ UTF-8 string without the need to know where it fails if it is not valid.
+
+
+ utf8::replace_invalid
+
+
+ Available in version 2.0 and later.
+
+
+ Replaces all invalid UTF-8 sequences within a string with a replacement marker.
+
+
+template <typename octet_iterator, typename output_iterator>
+output_iterator replace_invalid(octet_iterator start, octet_iterator end, output_iterator out, uint32_t replacement);
+template <typename octet_iterator, typename output_iterator>
+output_iterator replace_invalid(octet_iterator start, octet_iterator end, output_iterator out);
+
+
+
+ start
: an iterator pointing to the beginning of the UTF-8 string to
+ look for invalid UTF-8 sequences.
+ end
: an iterator pointing to pass-the-end of the UTF-8 string to look
+ for invalid UTF-8 sequences.
+ out
: An output iterator to the range where the result of replacement
+ is stored.
+ replacement
: A Unicode code point for the replacement marker. The
+ version without this parameter assumes the value 0xfffd
+ Return value: An iterator pointing to the place
+ after the UTF-8 string with replaced invalid sequences.
+
+
+ Example of use:
+
+
+char invalid_sequence[] = "a\x80\xe0\xa0\xc0\xaf\xed\xa0\x80z";
+vector<char> replace_invalid_result;
+replace_invalid (invalid_sequence, invalid_sequence + sizeof(invalid_sequence), back_inserter(replace_invalid_result), '?');
+bvalid = is_valid(replace_invalid_result.begin(), replace_invalid_result.end());
+assert (bvalid);
+char* fixed_invalid_sequence = "a????z";
+assert (std::equal(replace_invalid_result.begin(), replace_invalid_result.end(), fixed_invalid_sequence));
+
+
+ replace_invalid
does not perform in-place replacement of invalid
+ sequences. Rather, it produces a copy of the original string with the invalid
+ sequences replaced with a replacement marker. Therefore, out
must not
+ be in the [start, end]
range.
+
+
+ If end
does not point to the past-of-end of a UTF-8 sequence, a
+ utf8::not_enough_room
exception is thrown.
+
+
+ utf8::is_bom
+
+
+ Available in version 1.0 and later.
+
+
+ Checks whether a sequence of three octets is a UTF-8 byte order mark (BOM)
+
+
+template <typename octet_iterator>
+bool is_bom (octet_iterator it);
+
+
+ it
: beginning of the 3-octet sequence to check
+ Return value: true
if the sequence
+ is UTF-8 byte order mark; false
if not.
+
+
+ Example of use:
+
+
+unsigned char byte_order_mark[] = {0xef, 0xbb, 0xbf};
+bool bbom = is_bom(byte_order_mark);
+assert (bbom == true);
+
+
+ The typical use of this function is to check the first three bytes of a file. If
+ they form the UTF-8 BOM, we want to skip them before processing the actual UTF-8
+ encoded text.
+
+
+ Types From utf8 Namespace
+
+
+ utf8::iterator
+
+
+ Available in version 2.0 and later.
+
+
+ Adapts the underlying octet iterator to iterate over the sequence of code points,
+ rather than raw octets.
+
+
+template <typename octet_iterator>
+class iterator;
+
+
+ Member functions
+
+ iterator();
- the deafult constructor; the underlying
octet_iterator
is
+ constructed with its default constructor.
+ explicit iterator (const octet_iterator& octet_it,
+ const octet_iterator& range_start,
+ const octet_iterator& range_end);
- a constructor
+ that initializes the underlying
octet_iterator
with octet_it
+ and sets the range in which the iterator is considered valid.
+ octet_iterator base () const;
- returns the
+ underlying
octet_iterator
.
+ uint32_t operator * () const;
- decodes the utf-8 sequence
+ the underlying
octet_iterator
is pointing to and returns the code point.
+ bool operator == (const iterator& rhs)
+ const;
- returns true
+ if the two underlaying iterators are equal.
+
bool operator != (const iterator& rhs)
+ const;
- returns true
+ if the two underlaying iterators are not equal.
+
iterator& operator ++ ();
- the prefix increment - moves
+ the iterator to the next UTF-8 encoded code point.
+
iterator operator ++ (int);
-
+ the postfix increment - moves the iterator to the next UTF-8 encoded code point and returns the current one.
+
iterator& operator -- ();
- the prefix decrement - moves
+ the iterator to the previous UTF-8 encoded code point.
+
iterator operator -- (int);
-
+ the postfix decrement - moves the iterator to the previous UTF-8 encoded code point and returns the current one.
+
+
+ Example of use:
+
+
+char* threechars = "\xf0\x90\x8d\x86\xe6\x97\xa5\xd1\x88";
+utf8::iterator<char*> it(threechars, threechars, threechars + 9);
+utf8::iterator<char*> it2 = it;
+assert (it2 == it);
+assert (*it == 0x10346);
+assert (*(++it) == 0x65e5);
+assert ((*it++) == 0x65e5);
+assert (*it == 0x0448);
+assert (it != it2);
+utf8::iterator<char*> endit (threechars + 9, threechars, threechars + 9);
+assert (++it == endit);
+assert (*(--it) == 0x0448);
+assert ((*it--) == 0x0448);
+assert (*it == 0x65e5);
+assert (--it == utf8::iterator<char*>(threechars, threechars, threechars + 9));
+assert (*it == 0x10346);
+
+
+ The purpose of utf8::iterator
adapter is to enable easy iteration as well as the use of STL
+ algorithms with UTF-8 encoded strings. Increment and decrement operators are implemented in terms of
+ utf8::next()
and utf8::prior()
functions.
+
+
+ Note that utf8::iterator
adapter is a checked iterator. It operates on the range specified in
+ the constructor; any attempt to go out of that range will result in an exception. Even the comparison operators
+ require both iterator object to be constructed against the same range - otherwise an exception is thrown. Typically,
+ the range will be determined by sequence container functions begin
and end
, i.e.:
+
+
+std::string s = "example";
+utf8::iterator i (s.begin(), s.begin(), s.end());
+
+
+ Functions From utf8::unchecked Namespace
+
+
+ utf8::unchecked::append
+
+
+ Available in version 1.0 and later.
+
+
+ Encodes a 32 bit code point as a UTF-8 sequence of octets and appends the sequence
+ to a UTF-8 string.
+
+
+template <typename octet_iterator>
+octet_iterator append(uint32_t cp, octet_iterator result);
+
+
+
+ cp
: A 32 bit integer representing a code point to append to the
+ sequence.
+ result
: An output iterator to the place in the sequence where to
+ append the code point.
+ Return value: An iterator pointing to the place
+ after the newly appended sequence.
+
+
+ Example of use:
+
+
+unsigned char u[5] = {0,0,0,0,0};
+unsigned char* end = unchecked::append(0x0448, u);
+assert (u[0] == 0xd1 && u[1] == 0x88 && u[2] == 0 && u[3] == 0 && u[4] == 0);
+
+
+ This is a faster but less safe version of utf8::append
. It does not
+ check for validity of the supplied code point, and may produce an invalid UTF-8
+ sequence.
+
+
+ utf8::unchecked::next
+
+
+ Available in version 1.0 and later.
+
+
+ Given the iterator to the beginning of a UTF-8 sequence, it returns the code point
+ and moves the iterator to the next position.
+
+
+template <typename octet_iterator>
+uint32_t next(octet_iterator& it);
+
+
+
+ it
: a reference to an iterator pointing to the beginning of an UTF-8
+ encoded code point. After the function returns, it is incremented to point to the
+ beginning of the next code point.
+ Return value: the 32 bit representation of the
+ processed UTF-8 code point.
+
+
+ Example of use:
+
+
+char* twochars = "\xe6\x97\xa5\xd1\x88";
+char* w = twochars;
+int cp = unchecked::next(w);
+assert (cp == 0x65e5);
+assert (w == twochars + 3);
+
+
+ This is a faster but less safe version of utf8::next
. It does not
+ check for validity of the supplied UTF-8 sequence.
+
+
+ utf8::unchecked::prior
+
+
+ Available in version 1.02 and later.
+
+
+ Given a reference to an iterator pointing to an octet in a UTF-8 seqence, it
+ decreases the iterator until it hits the beginning of the previous UTF-8 encoded
+ code point and returns the 32 bits representation of the code point.
+
+
+template <typename octet_iterator>
+uint32_t prior(octet_iterator& it);
+
+
+
+ it
: a reference pointing to an octet within a UTF-8 encoded string.
+ After the function returns, it is decremented to point to the beginning of the
+ previous code point.
+ Return value: the 32 bit representation of the
+ previous code point.
+
+
+ Example of use:
+
+
+char* twochars = "\xe6\x97\xa5\xd1\x88";
+char* w = twochars + 3;
+int cp = unchecked::prior (w);
+assert (cp == 0x65e5);
+assert (w == twochars);
+
+
+ This is a faster but less safe version of utf8::prior
. It does not
+ check for validity of the supplied UTF-8 sequence and offers no boundary checking.
+
+
+ utf8::unchecked::previous (deprecated, see utf8::unchecked::prior)
+
+
+ Deprecated in version 1.02 and later.
+
+
+ Given a reference to an iterator pointing to an octet in a UTF-8 seqence, it
+ decreases the iterator until it hits the beginning of the previous UTF-8 encoded
+ code point and returns the 32 bits representation of the code point.
+
+
+template <typename octet_iterator>
+uint32_t previous(octet_iterator& it);
+
+
+
+ it
: a reference pointing to an octet within a UTF-8 encoded string.
+ After the function returns, it is decremented to point to the beginning of the
+ previous code point.
+ Return value: the 32 bit representation of the
+ previous code point.
+
+
+ Example of use:
+
+
+char* twochars = "\xe6\x97\xa5\xd1\x88";
+char* w = twochars + 3;
+int cp = unchecked::previous (w);
+assert (cp == 0x65e5);
+assert (w == twochars);
+
+
+ The reason this function is deprecated is just the consistency with the "checked"
+ versions, where prior
should be used instead of previous
.
+ In fact, unchecked::previous
behaves exactly the same as
+ unchecked::prior
+
+
+ This is a faster but less safe version of utf8::previous
. It does not
+ check for validity of the supplied UTF-8 sequence and offers no boundary checking.
+
+
+ utf8::unchecked::advance
+
+
+ Available in version 1.0 and later.
+
+
+ Advances an iterator by the specified number of code points within an UTF-8
+ sequence.
+
+
+template <typename octet_iterator, typename distance_type>
+void advance (octet_iterator& it, distance_type n);
+
+
+
+ it
: a reference to an iterator pointing to the beginning of an UTF-8
+ encoded code point. After the function returns, it is incremented to point to the
+ nth following code point.
+ n
: a positive integer that shows how many code points we want to
+ advance.
+
+
+ Example of use:
+
+
+char* twochars = "\xe6\x97\xa5\xd1\x88";
+char* w = twochars;
+unchecked::advance (w, 2);
+assert (w == twochars + 5);
+
+
+ This function works only "forward". In case of a negative n
, there is
+ no effect.
+
+
+ This is a faster but less safe version of utf8::advance
. It does not
+ check for validity of the supplied UTF-8 sequence and offers no boundary checking.
+
+
+ utf8::unchecked::distance
+
+
+ Available in version 1.0 and later.
+
+
+ Given the iterators to two UTF-8 encoded code points in a seqence, returns the
+ number of code points between them.
+
+
+template <typename octet_iterator>
+typename std::iterator_traits<octet_iterator>::difference_type distance (octet_iterator first, octet_iterator last);
+
+
+ first
: an iterator to a beginning of a UTF-8 encoded code point.
+ last
: an iterator to a "post-end" of the last UTF-8 encoded code
+ point in the sequence we are trying to determine the length. It can be the
+ beginning of a new code point, or not.
+ Return value the distance between the iterators,
+ in code points.
+
+
+ Example of use:
+
+
+char* twochars = "\xe6\x97\xa5\xd1\x88";
+size_t dist = utf8::unchecked::distance(twochars, twochars + 5);
+assert (dist == 2);
+
+
+ This is a faster but less safe version of utf8::distance
. It does not
+ check for validity of the supplied UTF-8 sequence.
+
+
+ utf8::unchecked::utf16to8
+
+
+ Available in version 1.0 and later.
+
+
+ Converts a UTF-16 encoded string to UTF-8.
+
+
+template <typename u16bit_iterator, typename octet_iterator>
+octet_iterator utf16to8 (u16bit_iterator start, u16bit_iterator end, octet_iterator result);
+
+
+
+ start
: an iterator pointing to the beginning of the UTF-16 encoded
+ string to convert.
+ end
: an iterator pointing to pass-the-end of the UTF-16 encoded
+ string to convert.
+ result
: an output iterator to the place in the UTF-8 string where to
+ append the result of conversion.
+ Return value: An iterator pointing to the place
+ after the appended UTF-8 string.
+
+
+ Example of use:
+
+
+unsigned short utf16string[] = {0x41, 0x0448, 0x65e5, 0xd834, 0xdd1e};
+vector<unsigned char> utf8result;
+unchecked::utf16to8(utf16string, utf16string + 5, back_inserter(utf8result));
+assert (utf8result.size() == 10);
+
+
+ This is a faster but less safe version of utf8::utf16to8
. It does not
+ check for validity of the supplied UTF-16 sequence.
+
+
+ utf8::unchecked::utf8to16
+
+
+ Available in version 1.0 and later.
+
+
+ Converts an UTF-8 encoded string to UTF-16
+
+
+template <typename u16bit_iterator, typename octet_iterator>
+u16bit_iterator utf8to16 (octet_iterator start, octet_iterator end, u16bit_iterator result);
+
+
+
+ start
: an iterator pointing to the beginning of the UTF-8 encoded
+ string to convert. < br /> end
: an iterator pointing to
+ pass-the-end of the UTF-8 encoded string to convert.
+ result
: an output iterator to the place in the UTF-16 string where to
+ append the result of conversion.
+ Return value: An iterator pointing to the place
+ after the appended UTF-16 string.
+
+
+ Example of use:
+
+
+char utf8_with_surrogates[] = "\xe6\x97\xa5\xd1\x88\xf0\x9d\x84\x9e";
+vector <unsigned short> utf16result;
+unchecked::utf8to16(utf8_with_surrogates, utf8_with_surrogates + 9, back_inserter(utf16result));
+assert (utf16result.size() == 4);
+assert (utf16result[2] == 0xd834);
+assert (utf16result[3] == 0xdd1e);
+
+
+ This is a faster but less safe version of utf8::utf8to16
. It does not
+ check for validity of the supplied UTF-8 sequence.
+
+
+ utf8::unchecked::utf32to8
+
+
+ Available in version 1.0 and later.
+
+
+ Converts a UTF-32 encoded string to UTF-8.
+
+
+template <typename octet_iterator, typename u32bit_iterator>
+octet_iterator utf32to8 (u32bit_iterator start, u32bit_iterator end, octet_iterator result);
+
+
+
+ start
: an iterator pointing to the beginning of the UTF-32 encoded
+ string to convert.
+ end
: an iterator pointing to pass-the-end of the UTF-32 encoded
+ string to convert.
+ result
: an output iterator to the place in the UTF-8 string where to
+ append the result of conversion.
+ Return value: An iterator pointing to the place
+ after the appended UTF-8 string.
+
+
+ Example of use:
+
+
+int utf32string[] = {0x448, 0x65e5, 0x10346, 0};
+vector<unsigned char> utf8result;
+utf32to8(utf32string, utf32string + 3, back_inserter(utf8result));
+assert (utf8result.size() == 9);
+
+
+ This is a faster but less safe version of utf8::utf32to8
. It does not
+ check for validity of the supplied UTF-32 sequence.
+
+
+ utf8::unchecked::utf8to32
+
+
+ Available in version 1.0 and later.
+
+
+ Converts a UTF-8 encoded string to UTF-32.
+
+
+template <typename octet_iterator, typename u32bit_iterator>
+u32bit_iterator utf8to32 (octet_iterator start, octet_iterator end, u32bit_iterator result);
+
+
+
+ start
: an iterator pointing to the beginning of the UTF-8 encoded
+ string to convert.
+ end
: an iterator pointing to pass-the-end of the UTF-8 encoded string
+ to convert.
+ result
: an output iterator to the place in the UTF-32 string where to
+ append the result of conversion.
+ Return value: An iterator pointing to the place
+ after the appended UTF-32 string.
+
+
+ Example of use:
+
+
+char* twochars = "\xe6\x97\xa5\xd1\x88";
+vector<int> utf32result;
+unchecked::utf8to32(twochars, twochars + 5, back_inserter(utf32result));
+assert (utf32result.size() == 2);
+
+
+ This is a faster but less safe version of utf8::utf8to32
. It does not
+ check for validity of the supplied UTF-8 sequence.
+
+
+ Types From utf8::unchecked Namespace
+
+
+ utf8::iterator
+
+
+ Available in version 2.0 and later.
+
+
+ Adapts the underlying octet iterator to iterate over the sequence of code points,
+ rather than raw octets.
+
+
+template <typename octet_iterator>
+class iterator;
+
+
+ Member functions
+
+ iterator();
- the deafult constructor; the underlying
octet_iterator
is
+ constructed with its default constructor.
+ explicit iterator (const octet_iterator& octet_it);
+
- a constructor
+ that initializes the underlying
octet_iterator
with octet_it
+ octet_iterator base () const;
- returns the
+ underlying
octet_iterator
.
+ uint32_t operator * () const;
- decodes the utf-8 sequence
+ the underlying
octet_iterator
is pointing to and returns the code point.
+ bool operator == (const iterator& rhs)
+ const;
- returns true
+ if the two underlaying iterators are equal.
+
bool operator != (const iterator& rhs)
+ const;
- returns true
+ if the two underlaying iterators are not equal.
+
iterator& operator ++ ();
- the prefix increment - moves
+ the iterator to the next UTF-8 encoded code point.
+
iterator operator ++ (int);
-
+ the postfix increment - moves the iterator to the next UTF-8 encoded code point and returns the current one.
+
iterator& operator -- ();
- the prefix decrement - moves
+ the iterator to the previous UTF-8 encoded code point.
+
iterator operator -- (int);
-
+ the postfix decrement - moves the iterator to the previous UTF-8 encoded code point and returns the current one.
+
+
+ Example of use:
+
+
+char* threechars = "\xf0\x90\x8d\x86\xe6\x97\xa5\xd1\x88";
+utf8::unchecked::iterator<char*> un_it(threechars);
+utf8::unchecked::iterator<char*> un_it2 = un_it;
+assert (un_it2 == un_it);
+assert (*un_it == 0x10346);
+assert (*(++un_it) == 0x65e5);
+assert ((*un_it++) == 0x65e5);
+assert (*un_it == 0x0448);
+assert (un_it != un_it2);
+utf8::::unchecked::iterator<char*> un_endit (threechars + 9);
+assert (++un_it == un_endit);
+assert (*(--un_it) == 0x0448);
+assert ((*un_it--) == 0x0448);
+assert (*un_it == 0x65e5);
+assert (--un_it == utf8::unchecked::iterator<char*>(threechars));
+assert (*un_it == 0x10346);
+
+
+ This is an unchecked version of utf8::iterator
. It is faster in many cases, but offers
+ no validity or range checks.
+
+
+ Points of interest
+
+
+ Design goals and decisions
+
+
+ The library was designed to be:
+
+
+ -
+ Generic: for better or worse, there are many C++ string classes out there, and
+ the library should work with as many of them as possible.
+
+ -
+ Portable: the library should be portable both accross different platforms and
+ compilers. The only non-portable code is a small section that declares unsigned
+ integers of different sizes: three typedefs. They can be changed by the users of
+ the library if they don't match their platform. The default setting should work
+ for Windows (both 32 and 64 bit), and most 32 bit and 64 bit Unix derivatives.
+
+ -
+ Lightweight: follow the "pay only for what you use" guidline.
+
+ -
+ Unintrusive: avoid forcing any particular design or even programming style on the
+ user. This is a library, not a framework.
+
+
+
+ Alternatives
+
+
+ In case you want to look into other means of working with UTF-8 strings from C++,
+ here is the list of solutions I am aware of:
+
+
+ -
+ ICU Library. It is very powerful,
+ complete, feature-rich, mature, and widely used. Also big, intrusive,
+ non-generic, and doesn't play well with the Standard Library. I definitelly
+ recommend looking at ICU even if you don't plan to use it.
+
+ -
+ Glib::ustring.
+ A class specifically made to work with UTF-8 strings, and also feel like
+
std::string
. If you prefer to have yet another string class in your
+ code, it may be worth a look. Be aware of the licensing issues, though.
+
+ -
+ Platform dependent solutions: Windows and POSIX have functions to convert strings
+ from one encoding to another. That is only a subset of what my library offers,
+ but if that is all you need it may be good enough, especially given the fact that
+ these functions are mature and tested in production.
+
+
+
+ Conclusion
+
+
+ Until Unicode becomes officially recognized by the C++ Standard Library, we need to
+ use other means to work with UTF-8 strings. Template functions I describe in this
+ article may be a good step in this direction.
+
+
+ Links
+
+
+ -
+ The Unicode Consortium.
+
+ -
+ ICU Library.
+
+ -
+ UTF-8 at Wikipedia
+
+ -
+ UTF-8 and Unicode FAQ for
+ Unix/Linux
+
+
+
+
diff --git a/v2_1/samples/Makefile b/v2_1/samples/Makefile
new file mode 100644
index 0000000..6cdd3c8
--- /dev/null
+++ b/v2_1/samples/Makefile
@@ -0,0 +1,5 @@
+CC = g++
+CFLAGS = -g -Wall -pedantic
+
+docsample: docsample.cpp ../source/utf8.h
+ $(CC) $(CFLAGS) docsample.cpp -odocsample
diff --git a/v2_1/samples/docsample.cpp b/v2_1/samples/docsample.cpp
new file mode 100644
index 0000000..fc8f6f3
--- /dev/null
+++ b/v2_1/samples/docsample.cpp
@@ -0,0 +1,63 @@
+#include "../source/utf8.h"
+#include
+#include
+#include
+#include
+
+
+using namespace std;
+
+int main(int argc, char** argv)
+{
+ if (argc != 2) {
+ cout << "\nUsage: docsample filename\n";
+ return 0;
+ }
+ const char* test_file_path = argv[1];
+ // Open the test file (must be UTF-8 encoded)
+ ifstream fs8(test_file_path);
+ if (!fs8.is_open()) {
+ cout << "Could not open " << test_file_path << endl;
+ return 0;
+ }
+
+ // Read the first line of the file
+ unsigned line_count = 1;
+ string line;
+ if (!getline(fs8, line))
+ return 0;
+
+ // Look for utf-8 byte-order mark at the beginning
+ if (line.size() > 2) {
+ if (utf8::is_bom(line.c_str()))
+ cout << "There is a byte order mark at the beginning of the file\n";
+ }
+
+ // Play with all the lines in the file
+ do {
+ // check for invalid utf-8 (for a simple yes/no check, there is also utf8::is_valid function)
+ string::iterator end_it = utf8::find_invalid(line.begin(), line.end());
+ if (end_it != line.end()) {
+ cout << "Invalid UTF-8 encoding detected at line " << line_count << "\n";
+ cout << "This part is fine: " << string(line.begin(), end_it) << "\n";
+ }
+ // Get the line length (at least for the valid part)
+ int length = utf8::distance(line.begin(), end_it);
+ cout << "Length of line " << line_count << " is " << length << "\n";
+
+ // Convert it to utf-16
+ vector utf16line;
+ utf8::utf8to16(line.begin(), end_it, back_inserter(utf16line));
+ // And back to utf-8;
+ string utf8line;
+ utf8::utf16to8(utf16line.begin(), utf16line.end(), back_inserter(utf8line));
+ // Confirm that the conversion went OK:
+ if (utf8line != string(line.begin(), end_it))
+ cout << "Error in UTF-16 conversion at line: " << line_count << "\n";
+
+ getline(fs8, line);
+ line_count++;
+ } while (!fs8.eof());
+
+ return 0;
+}
diff --git a/v2_1/source/utf8.h b/v2_1/source/utf8.h
new file mode 100644
index 0000000..4e44514
--- /dev/null
+++ b/v2_1/source/utf8.h
@@ -0,0 +1,34 @@
+// Copyright 2006 Nemanja Trifunovic
+
+/*
+Permission is hereby granted, free of charge, to any person or organization
+obtaining a copy of the software and accompanying documentation covered by
+this license (the "Software") to use, reproduce, display, distribute,
+execute, and transmit the Software, and to prepare derivative works of the
+Software, and to permit third-parties to whom the Software is furnished to
+do so, all subject to the following:
+
+The copyright notices in the Software and this entire statement, including
+the above license grant, this restriction and the following disclaimer,
+must be included in all copies of the Software, in whole or in part, and
+all derivative works of the Software, unless such copies or derivative
+works are solely in the form of machine-executable object code generated by
+a source language processor.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT
+SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE
+FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE,
+ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+DEALINGS IN THE SOFTWARE.
+*/
+
+
+#ifndef UTF8_FOR_CPP_2675DCD0_9480_4c0c_B92A_CC14C027B731
+#define UTF8_FOR_CPP_2675DCD0_9480_4c0c_B92A_CC14C027B731
+
+#include "utf8/checked.h"
+#include "utf8/unchecked.h"
+
+#endif // header guard
diff --git a/v2_1/source/utf8/checked.h b/v2_1/source/utf8/checked.h
new file mode 100644
index 0000000..dc342ff
--- /dev/null
+++ b/v2_1/source/utf8/checked.h
@@ -0,0 +1,312 @@
+// Copyright 2006 Nemanja Trifunovic
+
+/*
+Permission is hereby granted, free of charge, to any person or organization
+obtaining a copy of the software and accompanying documentation covered by
+this license (the "Software") to use, reproduce, display, distribute,
+execute, and transmit the Software, and to prepare derivative works of the
+Software, and to permit third-parties to whom the Software is furnished to
+do so, all subject to the following:
+
+The copyright notices in the Software and this entire statement, including
+the above license grant, this restriction and the following disclaimer,
+must be included in all copies of the Software, in whole or in part, and
+all derivative works of the Software, unless such copies or derivative
+works are solely in the form of machine-executable object code generated by
+a source language processor.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT
+SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE
+FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE,
+ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+DEALINGS IN THE SOFTWARE.
+*/
+
+
+#ifndef UTF8_FOR_CPP_CHECKED_H_2675DCD0_9480_4c0c_B92A_CC14C027B731
+#define UTF8_FOR_CPP_CHECKED_H_2675DCD0_9480_4c0c_B92A_CC14C027B731
+
+#include "core.h"
+#include
+
+namespace utf8
+{
+ // Exceptions that may be thrown from the library functions.
+ class invalid_code_point : public std::exception {
+ uint32_t cp;
+ public:
+ invalid_code_point(uint32_t cp) : cp(cp) {}
+ virtual const char* what() const throw() { return "Invalid code point"; }
+ uint32_t code_point() const {return cp;}
+ };
+
+ class invalid_utf8 : public std::exception {
+ uint8_t u8;
+ public:
+ invalid_utf8 (uint8_t u) : u8(u) {}
+ virtual const char* what() const throw() { return "Invalid UTF-8"; }
+ uint8_t utf8_octet() const {return u8;}
+ };
+
+ class invalid_utf16 : public std::exception {
+ uint16_t u16;
+ public:
+ invalid_utf16 (uint16_t u) : u16(u) {}
+ virtual const char* what() const throw() { return "Invalid UTF-16"; }
+ uint16_t utf16_word() const {return u16;}
+ };
+
+ class not_enough_room : public std::exception {
+ public:
+ virtual const char* what() const throw() { return "Not enough space"; }
+ };
+
+ /// The library API - functions intended to be called by the users
+
+ template
+ output_iterator replace_invalid(octet_iterator start, octet_iterator end, output_iterator out, uint32_t replacement)
+ {
+ while (start != end) {
+ octet_iterator sequence_start = start;
+ internal::utf_error err_code = internal::validate_next(start, end);
+ switch (err_code) {
+ case internal::OK :
+ for (octet_iterator it = sequence_start; it != start; ++it)
+ *out++ = *it;
+ break;
+ case internal::NOT_ENOUGH_ROOM:
+ throw not_enough_room();
+ case internal::INVALID_LEAD:
+ append (replacement, out);
+ ++start;
+ break;
+ case internal::INCOMPLETE_SEQUENCE:
+ case internal::OVERLONG_SEQUENCE:
+ case internal::INVALID_CODE_POINT:
+ append (replacement, out);
+ ++start;
+ // just one replacement mark for the sequence
+ while (internal::is_trail(*start) && start != end)
+ ++start;
+ break;
+ }
+ }
+ return out;
+ }
+
+ template
+ inline output_iterator replace_invalid(octet_iterator start, octet_iterator end, output_iterator out)
+ {
+ static const uint32_t replacement_marker = internal::mask16(0xfffd);
+ return replace_invalid(start, end, out, replacement_marker);
+ }
+
+ template
+ octet_iterator append(uint32_t cp, octet_iterator result)
+ {
+ if (!internal::is_code_point_valid(cp))
+ throw invalid_code_point(cp);
+
+ if (cp < 0x80) // one octet
+ *(result++) = static_cast(cp);
+ else if (cp < 0x800) { // two octets
+ *(result++) = static_cast((cp >> 6) | 0xc0);
+ *(result++) = static_cast((cp & 0x3f) | 0x80);
+ }
+ else if (cp < 0x10000) { // three octets
+ *(result++) = static_cast((cp >> 12) | 0xe0);
+ *(result++) = static_cast((cp >> 6) & 0x3f | 0x80);
+ *(result++) = static_cast((cp & 0x3f) | 0x80);
+ }
+ else if (cp <= internal::CODE_POINT_MAX) { // four octets
+ *(result++) = static_cast((cp >> 18) | 0xf0);
+ *(result++) = static_cast((cp >> 12)& 0x3f | 0x80);
+ *(result++) = static_cast((cp >> 6) & 0x3f | 0x80);
+ *(result++) = static_cast((cp & 0x3f) | 0x80);
+ }
+ else
+ throw invalid_code_point(cp);
+
+ return result;
+ }
+
+ template
+ uint32_t next(octet_iterator& it, octet_iterator end)
+ {
+ uint32_t cp = 0;
+ internal::utf_error err_code = internal::validate_next(it, end, &cp);
+ switch (err_code) {
+ case internal::OK :
+ break;
+ case internal::NOT_ENOUGH_ROOM :
+ throw not_enough_room();
+ case internal::INVALID_LEAD :
+ case internal::INCOMPLETE_SEQUENCE :
+ case internal::OVERLONG_SEQUENCE :
+ throw invalid_utf8(*it);
+ case internal::INVALID_CODE_POINT :
+ throw invalid_code_point(cp);
+ }
+ return cp;
+ }
+
+ template
+ uint32_t prior(octet_iterator& it, octet_iterator start)
+ {
+ octet_iterator end = it;
+ while (internal::is_trail(*(--it)))
+ if (it < start)
+ throw invalid_utf8(*it); // error - no lead byte in the sequence
+ octet_iterator temp = it;
+ return next(temp, end);
+ }
+
+ /// Deprecated in versions that include "prior"
+ template
+ uint32_t previous(octet_iterator& it, octet_iterator pass_start)
+ {
+ octet_iterator end = it;
+ while (internal::is_trail(*(--it)))
+ if (it == pass_start)
+ throw invalid_utf8(*it); // error - no lead byte in the sequence
+ octet_iterator temp = it;
+ return next(temp, end);
+ }
+
+ template
+ void advance (octet_iterator& it, distance_type n, octet_iterator end)
+ {
+ for (distance_type i = 0; i < n; ++i)
+ next(it, end);
+ }
+
+ template
+ typename std::iterator_traits::difference_type
+ distance (octet_iterator first, octet_iterator last)
+ {
+ typename std::iterator_traits::difference_type dist;
+ for (dist = 0; first < last; ++dist)
+ next(first, last);
+ return dist;
+ }
+
+ template
+ octet_iterator utf16to8 (u16bit_iterator start, u16bit_iterator end, octet_iterator result)
+ {
+ while (start != end) {
+ uint32_t cp = internal::mask16(*start++);
+ // Take care of surrogate pairs first
+ if (internal::is_surrogate(cp)) {
+ if (start != end) {
+ uint32_t trail_surrogate = internal::mask16(*start++);
+ if (trail_surrogate >= internal::TRAIL_SURROGATE_MIN && trail_surrogate <= internal::TRAIL_SURROGATE_MAX)
+ cp = (cp << 10) + trail_surrogate + internal::SURROGATE_OFFSET;
+ else
+ throw invalid_utf16(static_cast(trail_surrogate));
+ }
+ else
+ throw invalid_utf16(static_cast(*start));
+
+ }
+ result = append(cp, result);
+ }
+ return result;
+ }
+
+ template
+ u16bit_iterator utf8to16 (octet_iterator start, octet_iterator end, u16bit_iterator result)
+ {
+ while (start != end) {
+ uint32_t cp = next(start, end);
+ if (cp > 0xffff) { //make a surrogate pair
+ *result++ = static_cast((cp >> 10) + internal::LEAD_OFFSET);
+ *result++ = static_cast((cp & 0x3ff) + internal::TRAIL_SURROGATE_MIN);
+ }
+ else
+ *result++ = static_cast(cp);
+ }
+ return result;
+ }
+
+ template
+ octet_iterator utf32to8 (u32bit_iterator start, u32bit_iterator end, octet_iterator result)
+ {
+ while (start != end)
+ result = append(*(start++), result);
+
+ return result;
+ }
+
+ template
+ u32bit_iterator utf8to32 (octet_iterator start, octet_iterator end, u32bit_iterator result)
+ {
+ while (start < end)
+ (*result++) = next(start, end);
+
+ return result;
+ }
+
+ // The iterator class
+ template
+ class iterator : public std::iterator {
+ octet_iterator it;
+ octet_iterator range_start;
+ octet_iterator range_end;
+ public:
+ iterator () {};
+ explicit iterator (const octet_iterator& octet_it,
+ const octet_iterator& range_start,
+ const octet_iterator& range_end) :
+ it(octet_it), range_start(range_start), range_end(range_end)
+ {
+ if (it < range_start || it > range_end)
+ throw std::out_of_range("Invalid utf-8 iterator position");
+ }
+ // the default "big three" are OK
+ octet_iterator base () const { return it; }
+ uint32_t operator * () const
+ {
+ octet_iterator temp = it;
+ return next(temp, range_end);
+ }
+ bool operator == (const iterator& rhs) const
+ {
+ if (range_start != rhs.range_start || range_end != rhs.range_end)
+ throw std::logic_error("Comparing utf-8 iterators defined with different ranges");
+ return (it == rhs.it);
+ }
+ bool operator != (const iterator& rhs) const
+ {
+ return !(operator == (rhs));
+ }
+ iterator& operator ++ ()
+ {
+ next(it, range_end);
+ return *this;
+ }
+ iterator operator ++ (int)
+ {
+ iterator temp = *this;
+ next(it, range_end);
+ return temp;
+ }
+ iterator& operator -- ()
+ {
+ prior(it, range_start);
+ return *this;
+ }
+ iterator operator -- (int)
+ {
+ iterator temp = *this;
+ prior(it, range_start);
+ return temp;
+ }
+ }; // class iterator
+
+} // namespace utf8
+
+#endif //header guard
+
+
diff --git a/v2_1/source/utf8/core.h b/v2_1/source/utf8/core.h
new file mode 100644
index 0000000..ca8c94a
--- /dev/null
+++ b/v2_1/source/utf8/core.h
@@ -0,0 +1,259 @@
+// Copyright 2006 Nemanja Trifunovic
+
+/*
+Permission is hereby granted, free of charge, to any person or organization
+obtaining a copy of the software and accompanying documentation covered by
+this license (the "Software") to use, reproduce, display, distribute,
+execute, and transmit the Software, and to prepare derivative works of the
+Software, and to permit third-parties to whom the Software is furnished to
+do so, all subject to the following:
+
+The copyright notices in the Software and this entire statement, including
+the above license grant, this restriction and the following disclaimer,
+must be included in all copies of the Software, in whole or in part, and
+all derivative works of the Software, unless such copies or derivative
+works are solely in the form of machine-executable object code generated by
+a source language processor.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT
+SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE
+FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE,
+ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+DEALINGS IN THE SOFTWARE.
+*/
+
+
+#ifndef UTF8_FOR_CPP_CORE_H_2675DCD0_9480_4c0c_B92A_CC14C027B731
+#define UTF8_FOR_CPP_CORE_H_2675DCD0_9480_4c0c_B92A_CC14C027B731
+
+#include
+
+namespace utf8
+{
+ // The typedefs for 8-bit, 16-bit and 32-bit unsigned integers
+ // You may need to change them to match your system.
+ // These typedefs have the same names as ones from cstdint, or boost/cstdint
+ typedef unsigned char uint8_t;
+ typedef unsigned short uint16_t;
+ typedef unsigned int uint32_t;
+
+// Helper code - not intended to be directly called by the library users. May be changed at any time
+namespace internal
+{
+ // Unicode constants
+ // Leading (high) surrogates: 0xd800 - 0xdbff
+ // Trailing (low) surrogates: 0xdc00 - 0xdfff
+ const uint16_t LEAD_SURROGATE_MIN = 0xd800u;
+ const uint16_t LEAD_SURROGATE_MAX = 0xdbffu;
+ const uint16_t TRAIL_SURROGATE_MIN = 0xdc00u;
+ const uint16_t TRAIL_SURROGATE_MAX = 0xdfffu;
+ const uint16_t LEAD_OFFSET = LEAD_SURROGATE_MIN - (0x10000 >> 10);
+ const uint32_t SURROGATE_OFFSET = 0x10000u - (LEAD_SURROGATE_MIN << 10) - TRAIL_SURROGATE_MIN;
+
+ // Maximum valid value for a Unicode code point
+ const uint32_t CODE_POINT_MAX = 0x0010ffffu;
+
+ template
+ inline uint8_t mask8(octet_type oc)
+ {
+ return static_cast(0xff & oc);
+ }
+ template
+ inline uint16_t mask16(u16_type oc)
+ {
+ return static_cast(0xffff & oc);
+ }
+ template
+ inline bool is_trail(octet_type oc)
+ {
+ return ((mask8(oc) >> 6) == 0x2);
+ }
+
+ template
+ inline bool is_surrogate(u16 cp)
+ {
+ return (cp >= LEAD_SURROGATE_MIN && cp <= TRAIL_SURROGATE_MAX);
+ }
+
+ template
+ inline bool is_code_point_valid(u32 cp)
+ {
+ return (cp <= CODE_POINT_MAX && !is_surrogate(cp) && cp != 0xfffe && cp != 0xffff);
+ }
+
+ template
+ inline typename std::iterator_traits::difference_type
+ sequence_length(octet_iterator lead_it)
+ {
+ uint8_t lead = mask8(*lead_it);
+ if (lead < 0x80)
+ return 1;
+ else if ((lead >> 5) == 0x6)
+ return 2;
+ else if ((lead >> 4) == 0xe)
+ return 3;
+ else if ((lead >> 3) == 0x1e)
+ return 4;
+ else
+ return 0;
+ }
+
+ enum utf_error {OK, NOT_ENOUGH_ROOM, INVALID_LEAD, INCOMPLETE_SEQUENCE, OVERLONG_SEQUENCE, INVALID_CODE_POINT};
+
+ template
+ utf_error validate_next(octet_iterator& it, octet_iterator end, uint32_t* code_point)
+ {
+ uint32_t cp = mask8(*it);
+ // Check the lead octet
+ typedef typename std::iterator_traits::difference_type octet_difference_type;
+ octet_difference_type length = sequence_length(it);
+
+ // "Shortcut" for ASCII characters
+ if (length == 1) {
+ if (end - it > 0) {
+ if (code_point)
+ *code_point = cp;
+ ++it;
+ return OK;
+ }
+ else
+ return NOT_ENOUGH_ROOM;
+ }
+
+ // Do we have enough memory?
+ if (std::distance(it, end) < length)
+ return NOT_ENOUGH_ROOM;
+
+ // Check trail octets and calculate the code point
+ switch (length) {
+ case 0:
+ return INVALID_LEAD;
+ break;
+ case 2:
+ if (is_trail(*(++it))) {
+ cp = ((cp << 6) & 0x7ff) + ((*it) & 0x3f);
+ }
+ else {
+ --it;
+ return INCOMPLETE_SEQUENCE;
+ }
+ break;
+ case 3:
+ if (is_trail(*(++it))) {
+ cp = ((cp << 12) & 0xffff) + ((mask8(*it) << 6) & 0xfff);
+ if (is_trail(*(++it))) {
+ cp += (*it) & 0x3f;
+ }
+ else {
+ std::advance(it, -2);
+ return INCOMPLETE_SEQUENCE;
+ }
+ }
+ else {
+ --it;
+ return INCOMPLETE_SEQUENCE;
+ }
+ break;
+ case 4:
+ if (is_trail(*(++it))) {
+ cp = ((cp << 18) & 0x1fffff) + ((mask8(*it) << 12) & 0x3ffff);
+ if (is_trail(*(++it))) {
+ cp += (mask8(*it) << 6) & 0xfff;
+ if (is_trail(*(++it))) {
+ cp += (*it) & 0x3f;
+ }
+ else {
+ std::advance(it, -3);
+ return INCOMPLETE_SEQUENCE;
+ }
+ }
+ else {
+ std::advance(it, -2);
+ return INCOMPLETE_SEQUENCE;
+ }
+ }
+ else {
+ --it;
+ return INCOMPLETE_SEQUENCE;
+ }
+ break;
+ }
+ // Is the code point valid?
+ if (!is_code_point_valid(cp)) {
+ for (octet_difference_type i = 0; i < length - 1; ++i)
+ --it;
+ return INVALID_CODE_POINT;
+ }
+
+ if (code_point)
+ *code_point = cp;
+
+ if (cp < 0x80) {
+ if (length != 1) {
+ std::advance(it, -(length-1));
+ return OVERLONG_SEQUENCE;
+ }
+ }
+ else if (cp < 0x800) {
+ if (length != 2) {
+ std::advance(it, -(length-1));
+ return OVERLONG_SEQUENCE;
+ }
+ }
+ else if (cp < 0x10000) {
+ if (length != 3) {
+ std::advance(it, -(length-1));
+ return OVERLONG_SEQUENCE;
+ }
+ }
+
+ ++it;
+ return OK;
+ }
+
+ template
+ inline utf_error validate_next(octet_iterator& it, octet_iterator end) {
+ return validate_next(it, end, 0);
+ }
+
+} // namespace internal
+
+ /// The library API - functions intended to be called by the users
+
+ // Byte order mark
+ const uint8_t bom[] = {0xef, 0xbb, 0xbf};
+
+ template
+ octet_iterator find_invalid(octet_iterator start, octet_iterator end)
+ {
+ octet_iterator result = start;
+ while (result != end) {
+ internal::utf_error err_code = internal::validate_next(result, end);
+ if (err_code != internal::OK)
+ return result;
+ }
+ return result;
+ }
+
+ template
+ inline bool is_valid(octet_iterator start, octet_iterator end)
+ {
+ return (find_invalid(start, end) == end);
+ }
+
+ template
+ inline bool is_bom (octet_iterator it)
+ {
+ return (
+ (internal::mask8(*it++)) == bom[0] &&
+ (internal::mask8(*it++)) == bom[1] &&
+ (internal::mask8(*it)) == bom[2]
+ );
+ }
+} // namespace utf8
+
+#endif // header guard
+
+
diff --git a/v2_1/source/utf8/unchecked.h b/v2_1/source/utf8/unchecked.h
new file mode 100644
index 0000000..2277d28
--- /dev/null
+++ b/v2_1/source/utf8/unchecked.h
@@ -0,0 +1,221 @@
+// Copyright 2006 Nemanja Trifunovic
+
+/*
+Permission is hereby granted, free of charge, to any person or organization
+obtaining a copy of the software and accompanying documentation covered by
+this license (the "Software") to use, reproduce, display, distribute,
+execute, and transmit the Software, and to prepare derivative works of the
+Software, and to permit third-parties to whom the Software is furnished to
+do so, all subject to the following:
+
+The copyright notices in the Software and this entire statement, including
+the above license grant, this restriction and the following disclaimer,
+must be included in all copies of the Software, in whole or in part, and
+all derivative works of the Software, unless such copies or derivative
+works are solely in the form of machine-executable object code generated by
+a source language processor.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT
+SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE
+FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE,
+ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+DEALINGS IN THE SOFTWARE.
+*/
+
+
+#ifndef UTF8_FOR_CPP_UNCHECKED_H_2675DCD0_9480_4c0c_B92A_CC14C027B731
+#define UTF8_FOR_CPP_UNCHECKED_H_2675DCD0_9480_4c0c_B92A_CC14C027B731
+
+#include "core.h"
+
+namespace utf8
+{
+ namespace unchecked
+ {
+ template
+ octet_iterator append(uint32_t cp, octet_iterator result)
+ {
+ if (cp < 0x80) // one octet
+ *(result++) = static_cast(cp);
+ else if (cp < 0x800) { // two octets
+ *(result++) = static_cast((cp >> 6) | 0xc0);
+ *(result++) = static_cast((cp & 0x3f) | 0x80);
+ }
+ else if (cp < 0x10000) { // three octets
+ *(result++) = static_cast((cp >> 12) | 0xe0);
+ *(result++) = static_cast((cp >> 6) & 0x3f | 0x80);
+ *(result++) = static_cast((cp & 0x3f) | 0x80);
+ }
+ else { // four octets
+ *(result++) = static_cast((cp >> 18) | 0xf0);
+ *(result++) = static_cast((cp >> 12)& 0x3f | 0x80);
+ *(result++) = static_cast((cp >> 6) & 0x3f | 0x80);
+ *(result++) = static_cast((cp & 0x3f) | 0x80);
+ }
+ return result;
+ }
+ template
+ uint32_t next(octet_iterator& it)
+ {
+ uint32_t cp = internal::mask8(*it);
+ typename std::iterator_traits::difference_type length = utf8::internal::sequence_length(it);
+ switch (length) {
+ case 1:
+ break;
+ case 2:
+ it++;
+ cp = ((cp << 6) & 0x7ff) + ((*it) & 0x3f);
+ break;
+ case 3:
+ ++it;
+ cp = ((cp << 12) & 0xffff) + ((internal::mask8(*it) << 6) & 0xfff);
+ ++it;
+ cp += (*it) & 0x3f;
+ break;
+ case 4:
+ ++it;
+ cp = ((cp << 18) & 0x1fffff) + ((internal::mask8(*it) << 12) & 0x3ffff);
+ ++it;
+ cp += (internal::mask8(*it) << 6) & 0xfff;
+ ++it;
+ cp += (*it) & 0x3f;
+ break;
+ }
+ ++it;
+ return cp;
+ }
+
+ template
+ uint32_t prior(octet_iterator& it)
+ {
+ while (internal::is_trail(*(--it))) ;
+ octet_iterator temp = it;
+ return next(temp);
+ }
+
+ // Deprecated in versions that include prior, but only for the sake of consistency (see utf8::previous)
+ template
+ inline uint32_t previous(octet_iterator& it)
+ {
+ return prior(it);
+ }
+
+ template
+ void advance (octet_iterator& it, distance_type n)
+ {
+ for (distance_type i = 0; i < n; ++i)
+ next(it);
+ }
+
+ template
+ typename std::iterator_traits::difference_type
+ distance (octet_iterator first, octet_iterator last)
+ {
+ typename std::iterator_traits::difference_type dist;
+ for (dist = 0; first < last; ++dist)
+ next(first);
+ return dist;
+ }
+
+ template
+ octet_iterator utf16to8 (u16bit_iterator start, u16bit_iterator end, octet_iterator result)
+ {
+ while (start != end) {
+ uint32_t cp = internal::mask16(*start++);
+ // Take care of surrogate pairs first
+ if (internal::is_surrogate(cp)) {
+ uint32_t trail_surrogate = internal::mask16(*start++);
+ cp = (cp << 10) + trail_surrogate + internal::SURROGATE_OFFSET;
+ }
+ result = append(cp, result);
+ }
+ return result;
+ }
+
+ template
+ u16bit_iterator utf8to16 (octet_iterator start, octet_iterator end, u16bit_iterator result)
+ {
+ while (start != end) {
+ uint32_t cp = next(start);
+ if (cp > 0xffff) { //make a surrogate pair
+ *result++ = static_cast((cp >> 10) + internal::LEAD_OFFSET);
+ *result++ = static_cast((cp & 0x3ff) + internal::TRAIL_SURROGATE_MIN);
+ }
+ else
+ *result++ = static_cast(cp);
+ }
+ return result;
+ }
+
+ template
+ octet_iterator utf32to8 (u32bit_iterator start, u32bit_iterator end, octet_iterator result)
+ {
+ while (start != end)
+ result = append(*(start++), result);
+
+ return result;
+ }
+
+ template
+ u32bit_iterator utf8to32 (octet_iterator start, octet_iterator end, u32bit_iterator result)
+ {
+ while (start < end)
+ (*result++) = next(start);
+
+ return result;
+ }
+
+ // The iterator class
+ template
+ class iterator : public std::iterator {
+ octet_iterator it;
+ public:
+ iterator () {};
+ explicit iterator (const octet_iterator& octet_it): it(octet_it) {}
+ // the default "big three" are OK
+ octet_iterator base () const { return it; }
+ uint32_t operator * () const
+ {
+ octet_iterator temp = it;
+ return next(temp);
+ }
+ bool operator == (const iterator& rhs) const
+ {
+ return (it == rhs.it);
+ }
+ bool operator != (const iterator& rhs) const
+ {
+ return !(operator == (rhs));
+ }
+ iterator& operator ++ ()
+ {
+ std::advance(it, internal::sequence_length(it));
+ return *this;
+ }
+ iterator operator ++ (int)
+ {
+ iterator temp = *this;
+ std::advance(it, internal::sequence_length(it));
+ return temp;
+ }
+ iterator& operator -- ()
+ {
+ prior(it);
+ return *this;
+ }
+ iterator operator -- (int)
+ {
+ iterator temp = *this;
+ prior(it);
+ return temp;
+ }
+ }; // class iterator
+
+ } // namespace utf8::unchecked
+} // namespace utf8
+
+
+#endif // header guard
+
diff --git a/v2_1/test_data/negative/utf8_invalid.txt b/v2_1/test_data/negative/utf8_invalid.txt
new file mode 100755
index 0000000..abd16f7
Binary files /dev/null and b/v2_1/test_data/negative/utf8_invalid.txt differ
diff --git a/v2_1/test_data/utf8samples/UTF-8-demo.txt b/v2_1/test_data/utf8samples/UTF-8-demo.txt
new file mode 100644
index 0000000..4363f27
--- /dev/null
+++ b/v2_1/test_data/utf8samples/UTF-8-demo.txt
@@ -0,0 +1,212 @@
+
+UTF-8 encoded sample plain-text file
+‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾
+
+Markus Kuhn [ˈmaʳkʊs kuːn] — 2002-07-25
+
+
+The ASCII compatible UTF-8 encoding used in this plain-text file
+is defined in Unicode, ISO 10646-1, and RFC 2279.
+
+
+Using Unicode/UTF-8, you can write in emails and source code things such as
+
+Mathematics and sciences:
+
+ ∮ E⋅da = Q, n → ∞, ∑ f(i) = ∏ g(i), ⎧⎡⎛┌─────┐⎞⎤⎫
+ ⎪⎢⎜│a²+b³ ⎟⎥⎪
+ ∀x∈ℝ: ⌈x⌉ = −⌊−x⌋, α ∧ ¬β = ¬(¬α ∨ β), ⎪⎢⎜│───── ⎟⎥⎪
+ ⎪⎢⎜⎷ c₈ ⎟⎥⎪
+ ℕ ⊆ ℕ₀ ⊂ ℤ ⊂ ℚ ⊂ ℝ ⊂ ℂ, ⎨⎢⎜ ⎟⎥⎬
+ ⎪⎢⎜ ∞ ⎟⎥⎪
+ ⊥ < a ≠ b ≡ c ≤ d ≪ ⊤ ⇒ (⟦A⟧ ⇔ ⟪B⟫), ⎪⎢⎜ ⎲ ⎟⎥⎪
+ ⎪⎢⎜ ⎳aⁱ-bⁱ⎟⎥⎪
+ 2H₂ + O₂ ⇌ 2H₂O, R = 4.7 kΩ, ⌀ 200 mm ⎩⎣⎝i=1 ⎠⎦⎭
+
+Linguistics and dictionaries:
+
+ ði ıntəˈnæʃənəl fəˈnɛtık əsoʊsiˈeıʃn
+ Y [ˈʏpsilɔn], Yen [jɛn], Yoga [ˈjoːgɑ]
+
+APL:
+
+ ((V⍳V)=⍳⍴V)/V←,V ⌷←⍳→⍴∆∇⊃‾⍎⍕⌈
+
+Nicer typography in plain text files:
+
+ ╔══════════════════════════════════════════╗
+ ║ ║
+ ║ • ‘single’ and “double” quotes ║
+ ║ ║
+ ║ • Curly apostrophes: “We’ve been here” ║
+ ║ ║
+ ║ • Latin-1 apostrophe and accents: '´` ║
+ ║ ║
+ ║ • ‚deutsche‘ „Anführungszeichen“ ║
+ ║ ║
+ ║ • †, ‡, ‰, •, 3–4, —, −5/+5, ™, … ║
+ ║ ║
+ ║ • ASCII safety test: 1lI|, 0OD, 8B ║
+ ║ ╭─────────╮ ║
+ ║ • the euro symbol: │ 14.95 € │ ║
+ ║ ╰─────────╯ ║
+ ╚══════════════════════════════════════════╝
+
+Combining characters:
+
+ STARGΛ̊TE SG-1, a = v̇ = r̈, a⃑ ⊥ b⃑
+
+Greek (in Polytonic):
+
+ The Greek anthem:
+
+ Σὲ γνωρίζω ἀπὸ τὴν κόψη
+ τοῦ σπαθιοῦ τὴν τρομερή,
+ σὲ γνωρίζω ἀπὸ τὴν ὄψη
+ ποὺ μὲ βία μετράει τὴ γῆ.
+
+ ᾿Απ᾿ τὰ κόκκαλα βγαλμένη
+ τῶν ῾Ελλήνων τὰ ἱερά
+ καὶ σὰν πρῶτα ἀνδρειωμένη
+ χαῖρε, ὦ χαῖρε, ᾿Ελευθεριά!
+
+ From a speech of Demosthenes in the 4th century BC:
+
+ Οὐχὶ ταὐτὰ παρίσταταί μοι γιγνώσκειν, ὦ ἄνδρες ᾿Αθηναῖοι,
+ ὅταν τ᾿ εἰς τὰ πράγματα ἀποβλέψω καὶ ὅταν πρὸς τοὺς
+ λόγους οὓς ἀκούω· τοὺς μὲν γὰρ λόγους περὶ τοῦ
+ τιμωρήσασθαι Φίλιππον ὁρῶ γιγνομένους, τὰ δὲ πράγματ᾿
+ εἰς τοῦτο προήκοντα, ὥσθ᾿ ὅπως μὴ πεισόμεθ᾿ αὐτοὶ
+ πρότερον κακῶς σκέψασθαι δέον. οὐδέν οὖν ἄλλο μοι δοκοῦσιν
+ οἱ τὰ τοιαῦτα λέγοντες ἢ τὴν ὑπόθεσιν, περὶ ἧς βουλεύεσθαι,
+ οὐχὶ τὴν οὖσαν παριστάντες ὑμῖν ἁμαρτάνειν. ἐγὼ δέ, ὅτι μέν
+ ποτ᾿ ἐξῆν τῇ πόλει καὶ τὰ αὑτῆς ἔχειν ἀσφαλῶς καὶ Φίλιππον
+ τιμωρήσασθαι, καὶ μάλ᾿ ἀκριβῶς οἶδα· ἐπ᾿ ἐμοῦ γάρ, οὐ πάλαι
+ γέγονεν ταῦτ᾿ ἀμφότερα· νῦν μέντοι πέπεισμαι τοῦθ᾿ ἱκανὸν
+ προλαβεῖν ἡμῖν εἶναι τὴν πρώτην, ὅπως τοὺς συμμάχους
+ σώσομεν. ἐὰν γὰρ τοῦτο βεβαίως ὑπάρξῃ, τότε καὶ περὶ τοῦ
+ τίνα τιμωρήσεταί τις καὶ ὃν τρόπον ἐξέσται σκοπεῖν· πρὶν δὲ
+ τὴν ἀρχὴν ὀρθῶς ὑποθέσθαι, μάταιον ἡγοῦμαι περὶ τῆς
+ τελευτῆς ὁντινοῦν ποιεῖσθαι λόγον.
+
+ Δημοσθένους, Γ´ ᾿Ολυνθιακὸς
+
+Georgian:
+
+ From a Unicode conference invitation:
+
+ გთხოვთ ახლავე გაიაროთ რეგისტრაცია Unicode-ის მეათე საერთაშორისო
+ კონფერენციაზე დასასწრებად, რომელიც გაიმართება 10-12 მარტს,
+ ქ. მაინცში, გერმანიაში. კონფერენცია შეჰკრებს ერთად მსოფლიოს
+ ექსპერტებს ისეთ დარგებში როგორიცაა ინტერნეტი და Unicode-ი,
+ ინტერნაციონალიზაცია და ლოკალიზაცია, Unicode-ის გამოყენება
+ ოპერაციულ სისტემებსა, და გამოყენებით პროგრამებში, შრიფტებში,
+ ტექსტების დამუშავებასა და მრავალენოვან კომპიუტერულ სისტემებში.
+
+Russian:
+
+ From a Unicode conference invitation:
+
+ Зарегистрируйтесь сейчас на Десятую Международную Конференцию по
+ Unicode, которая состоится 10-12 марта 1997 года в Майнце в Германии.
+ Конференция соберет широкий круг экспертов по вопросам глобального
+ Интернета и Unicode, локализации и интернационализации, воплощению и
+ применению Unicode в различных операционных системах и программных
+ приложениях, шрифтах, верстке и многоязычных компьютерных системах.
+
+Thai (UCS Level 2):
+
+ Excerpt from a poetry on The Romance of The Three Kingdoms (a Chinese
+ classic 'San Gua'):
+
+ [----------------------------|------------------------]
+ ๏ แผ่นดินฮั่นเสื่อมโทรมแสนสังเวช พระปกเกศกองบู๊กู้ขึ้นใหม่
+ สิบสองกษัตริย์ก่อนหน้าแลถัดไป สององค์ไซร้โง่เขลาเบาปัญญา
+ ทรงนับถือขันทีเป็นที่พึ่ง บ้านเมืองจึงวิปริตเป็นนักหนา
+ โฮจิ๋นเรียกทัพทั่วหัวเมืองมา หมายจะฆ่ามดชั่วตัวสำคัญ
+ เหมือนขับไสไล่เสือจากเคหา รับหมาป่าเข้ามาเลยอาสัญ
+ ฝ่ายอ้องอุ้นยุแยกให้แตกกัน ใช้สาวนั้นเป็นชนวนชื่นชวนใจ
+ พลันลิฉุยกุยกีกลับก่อเหตุ ช่างอาเพศจริงหนาฟ้าร้องไห้
+ ต้องรบราฆ่าฟันจนบรรลัย ฤๅหาใครค้ำชูกู้บรรลังก์ ฯ
+
+ (The above is a two-column text. If combining characters are handled
+ correctly, the lines of the second column should be aligned with the
+ | character above.)
+
+Ethiopian:
+
+ Proverbs in the Amharic language:
+
+ ሰማይ አይታረስ ንጉሥ አይከሰስ።
+ ብላ ካለኝ እንደአባቴ በቆመጠኝ።
+ ጌጥ ያለቤቱ ቁምጥና ነው።
+ ደሀ በሕልሙ ቅቤ ባይጠጣ ንጣት በገደለው።
+ የአፍ ወለምታ በቅቤ አይታሽም።
+ አይጥ በበላ ዳዋ ተመታ።
+ ሲተረጉሙ ይደረግሙ።
+ ቀስ በቀስ፥ ዕንቁላል በእግሩ ይሄዳል።
+ ድር ቢያብር አንበሳ ያስር።
+ ሰው እንደቤቱ እንጅ እንደ ጉረቤቱ አይተዳደርም።
+ እግዜር የከፈተውን ጉሮሮ ሳይዘጋው አይድርም።
+ የጎረቤት ሌባ፥ ቢያዩት ይስቅ ባያዩት ያጠልቅ።
+ ሥራ ከመፍታት ልጄን ላፋታት።
+ ዓባይ ማደሪያ የለው፥ ግንድ ይዞ ይዞራል።
+ የእስላም አገሩ መካ የአሞራ አገሩ ዋርካ።
+ ተንጋሎ ቢተፉ ተመልሶ ባፉ።
+ ወዳጅህ ማር ቢሆን ጨርስህ አትላሰው።
+ እግርህን በፍራሽህ ልክ ዘርጋ።
+
+Runes:
+
+ ᚻᛖ ᚳᚹᚫᚦ ᚦᚫᛏ ᚻᛖ ᛒᚢᛞᛖ ᚩᚾ ᚦᚫᛗ ᛚᚪᚾᛞᛖ ᚾᚩᚱᚦᚹᛖᚪᚱᛞᚢᛗ ᚹᛁᚦ ᚦᚪ ᚹᛖᛥᚫ
+
+ (Old English, which transcribed into Latin reads 'He cwaeth that he
+ bude thaem lande northweardum with tha Westsae.' and means 'He said
+ that he lived in the northern land near the Western Sea.')
+
+Braille:
+
+ ⡌⠁⠧⠑ ⠼⠁⠒ ⡍⠜⠇⠑⠹⠰⠎ ⡣⠕⠌
+
+ ⡍⠜⠇⠑⠹ ⠺⠁⠎ ⠙⠑⠁⠙⠒ ⠞⠕ ⠃⠑⠛⠔ ⠺⠊⠹⠲ ⡹⠻⠑ ⠊⠎ ⠝⠕ ⠙⠳⠃⠞
+ ⠱⠁⠞⠑⠧⠻ ⠁⠃⠳⠞ ⠹⠁⠞⠲ ⡹⠑ ⠗⠑⠛⠊⠌⠻ ⠕⠋ ⠙⠊⠎ ⠃⠥⠗⠊⠁⠇ ⠺⠁⠎
+ ⠎⠊⠛⠝⠫ ⠃⠹ ⠹⠑ ⠊⠇⠻⠛⠹⠍⠁⠝⠂ ⠹⠑ ⠊⠇⠻⠅⠂ ⠹⠑ ⠥⠝⠙⠻⠞⠁⠅⠻⠂
+ ⠁⠝⠙ ⠹⠑ ⠡⠊⠑⠋ ⠍⠳⠗⠝⠻⠲ ⡎⠊⠗⠕⠕⠛⠑ ⠎⠊⠛⠝⠫ ⠊⠞⠲ ⡁⠝⠙
+ ⡎⠊⠗⠕⠕⠛⠑⠰⠎ ⠝⠁⠍⠑ ⠺⠁⠎ ⠛⠕⠕⠙ ⠥⠏⠕⠝ ⠰⡡⠁⠝⠛⠑⠂ ⠋⠕⠗ ⠁⠝⠹⠹⠔⠛ ⠙⠑
+ ⠡⠕⠎⠑ ⠞⠕ ⠏⠥⠞ ⠙⠊⠎ ⠙⠁⠝⠙ ⠞⠕⠲
+
+ ⡕⠇⠙ ⡍⠜⠇⠑⠹ ⠺⠁⠎ ⠁⠎ ⠙⠑⠁⠙ ⠁⠎ ⠁ ⠙⠕⠕⠗⠤⠝⠁⠊⠇⠲
+
+ ⡍⠔⠙⠖ ⡊ ⠙⠕⠝⠰⠞ ⠍⠑⠁⠝ ⠞⠕ ⠎⠁⠹ ⠹⠁⠞ ⡊ ⠅⠝⠪⠂ ⠕⠋ ⠍⠹
+ ⠪⠝ ⠅⠝⠪⠇⠫⠛⠑⠂ ⠱⠁⠞ ⠹⠻⠑ ⠊⠎ ⠏⠜⠞⠊⠊⠥⠇⠜⠇⠹ ⠙⠑⠁⠙ ⠁⠃⠳⠞
+ ⠁ ⠙⠕⠕⠗⠤⠝⠁⠊⠇⠲ ⡊ ⠍⠊⠣⠞ ⠙⠁⠧⠑ ⠃⠑⠲ ⠔⠊⠇⠔⠫⠂ ⠍⠹⠎⠑⠇⠋⠂ ⠞⠕
+ ⠗⠑⠛⠜⠙ ⠁ ⠊⠕⠋⠋⠔⠤⠝⠁⠊⠇ ⠁⠎ ⠹⠑ ⠙⠑⠁⠙⠑⠌ ⠏⠊⠑⠊⠑ ⠕⠋ ⠊⠗⠕⠝⠍⠕⠝⠛⠻⠹
+ ⠔ ⠹⠑ ⠞⠗⠁⠙⠑⠲ ⡃⠥⠞ ⠹⠑ ⠺⠊⠎⠙⠕⠍ ⠕⠋ ⠳⠗ ⠁⠝⠊⠑⠌⠕⠗⠎
+ ⠊⠎ ⠔ ⠹⠑ ⠎⠊⠍⠊⠇⠑⠆ ⠁⠝⠙ ⠍⠹ ⠥⠝⠙⠁⠇⠇⠪⠫ ⠙⠁⠝⠙⠎
+ ⠩⠁⠇⠇ ⠝⠕⠞ ⠙⠊⠌⠥⠗⠃ ⠊⠞⠂ ⠕⠗ ⠹⠑ ⡊⠳⠝⠞⠗⠹⠰⠎ ⠙⠕⠝⠑ ⠋⠕⠗⠲ ⡹⠳
+ ⠺⠊⠇⠇ ⠹⠻⠑⠋⠕⠗⠑ ⠏⠻⠍⠊⠞ ⠍⠑ ⠞⠕ ⠗⠑⠏⠑⠁⠞⠂ ⠑⠍⠏⠙⠁⠞⠊⠊⠁⠇⠇⠹⠂ ⠹⠁⠞
+ ⡍⠜⠇⠑⠹ ⠺⠁⠎ ⠁⠎ ⠙⠑⠁⠙ ⠁⠎ ⠁ ⠙⠕⠕⠗⠤⠝⠁⠊⠇⠲
+
+ (The first couple of paragraphs of "A Christmas Carol" by Dickens)
+
+Compact font selection example text:
+
+ ABCDEFGHIJKLMNOPQRSTUVWXYZ /0123456789
+ abcdefghijklmnopqrstuvwxyz £©µÀÆÖÞßéöÿ
+ –—‘“”„†•…‰™œŠŸž€ ΑΒΓΔΩαβγδω АБВГДабвгд
+ ∀∂∈ℝ∧∪≡∞ ↑↗↨↻⇣ ┐┼╔╘░►☺♀ fi�⑀₂ἠḂӥẄɐː⍎אԱა
+
+Greetings in various languages:
+
+ Hello world, Καλημέρα κόσμε, コンニチハ
+
+Box drawing alignment tests: █
+ ▉
+ ╔══╦══╗ ┌──┬──┐ ╭──┬──╮ ╭──┬──╮ ┏━━┳━━┓ ┎┒┏┑ ╷ ╻ ┏┯┓ ┌┰┐ ▊ ╱╲╱╲╳╳╳
+ ║┌─╨─┐║ │╔═╧═╗│ │╒═╪═╕│ │╓─╁─╖│ ┃┌─╂─┐┃ ┗╃╄┙ ╶┼╴╺╋╸┠┼┨ ┝╋┥ ▋ ╲╱╲╱╳╳╳
+ ║│╲ ╱│║ │║ ║│ ││ │ ││ │║ ┃ ║│ ┃│ ╿ │┃ ┍╅╆┓ ╵ ╹ ┗┷┛ └┸┘ ▌ ╱╲╱╲╳╳╳
+ ╠╡ ╳ ╞╣ ├╢ ╟┤ ├┼─┼─┼┤ ├╫─╂─╫┤ ┣┿╾┼╼┿┫ ┕┛┖┚ ┌┄┄┐ ╎ ┏┅┅┓ ┋ ▍ ╲╱╲╱╳╳╳
+ ║│╱ ╲│║ │║ ║│ ││ │ ││ │║ ┃ ║│ ┃│ ╽ │┃ ░░▒▒▓▓██ ┊ ┆ ╎ ╏ ┇ ┋ ▎
+ ║└─╥─┘║ │╚═╤═╝│ │╘═╪═╛│ │╙─╀─╜│ ┃└─╂─┘┃ ░░▒▒▓▓██ ┊ ┆ ╎ ╏ ┇ ┋ ▏
+ ╚══╩══╝ └──┴──┘ ╰──┴──╯ ╰──┴──╯ ┗━━┻━━┛ ▗▄▖▛▀▜ └╌╌┘ ╎ ┗╍╍┛ ┋ ▁▂▃▄▅▆▇█
+ ▝▀▘▙▄▟
diff --git a/v2_1/test_data/utf8samples/Unicode_transcriptions.html b/v2_1/test_data/utf8samples/Unicode_transcriptions.html
new file mode 100644
index 0000000..69b29ff
--- /dev/null
+++ b/v2_1/test_data/utf8samples/Unicode_transcriptions.html
@@ -0,0 +1,167 @@
+? *Unicode Transcriptions* Notes <#Notes>
+
+Glyphs | Samples
+ | Charts
+ | UTF
+ | Forms
+ |
+Home .
+
+
+Name Text Image
+Arabic (Arabic) يونِكود ?
+Arabic (Persian) یونیکُد / ?/
+Armenian Յունիկօդ
+Bengali য়ূনিকোড
+Bopomofo ㄊㄨㄥ˅ ㄧˋ ㄇㄚ˅
+ㄨㄢˋ ㄍㄨㄛˊ ㄇㄚ˅
+Braille
+Buhid
+Canadian Aboriginal ᔫᗂᑰᑦ
+Cherokee ᏳᏂᎪᏛ
+Cypriot
+Cyrillic (Russian) Юникод ?
+Deseret (English) ???????
+Devanagari (Hindi) यूनिकोड ?
+Ethiopic ዩኒኮድ
+Georgian უნიკოდი ?
+Gothic
+Greek Γιούνικοντ
+Gujarati યૂનિકોડ
+Gurmukhi ਯੂਨਿਕੋਡ
+Han (Chinese) 统一码 ?
+統一碼 ?
+万国码 ?
+萬國碼 ?
+Hangul 유니코드
+Hanunoo
+Hebrew יוניקוד
+Hebrew (pointed) יוּנִיקוׁד
+Hebrew (Yiddish) יוניקאָד ?
+Hiragana (Japanese) ゆにこおど
+Katakana (Japanese) ユニコード ?
+Kannada ಯೂನಿಕೋಡ್
+Khmer យូនីគោដ
+Lao
+Latin Unicode Unicode
+Latin (IPA <#English_Pronunciation>) ˈjunɪˌkoːd ?
+Latin (Am. Dict. <#American_Dictionary>) Ūnĭcōde̽ ?
+Limbu
+Linear B
+Malayalam യൂനികോഡ്
+Mongolian
+Myanmar
+Ogham ᚔᚒᚅᚔᚉᚑᚇ / /
+Old Italic
+Oriya ୟୂନିକୋଡ
+Osmanya
+Runic (Anglo-Saxon) ᛡᚢᚾᛁᚳᚩᛞ
+Shavian
+Sinhala යණනිකෞද්
+Syriac ܝܘܢܝܩܘܕ
+Tagbanwa
+Tagalog
+Tai Le
+Tamil யூனிகோட்
+Telugu యూనికోడ్
+Thaana
+Thai ยูนืโคด
+Tibetan (Dzongkha) ཨུ་ནི་ཀོཌྲ།
+Ugaritic
+Yi
+
+
+ Notes:
+
+There are different ways to transcribe the word “Unicode”, depending on
+the language and script. In some cases there is only one language that
+customarily uses a given script; in others there are many languages. The
+goal here is at a minimum to collect at least one transcription for each
+script in a language customarily written in that script, with more
+languages if possible. If the transcription is the same for multiple
+languages in a script, then a single representative language is used.
+
+Still missing are transcriptions for the items above in RED (in at least
+one language). I would appreciate any other transcriptions, or
+corrections for the ones listed here. Send to mark3@macchiato.com
+, using the directions below:
+
+ * *Supplying Missing Items*
+ o Most Latin-script languages will follow the spelling, and
+ change the pronunciation. For any that would not, it would
+ be good to have the alternate spelling.
+ o For non-Latin scripts the goal is to match the English
+ pronunciation — /*not*/ spelling. Above is the IPA <#IPA>
+ (in phonemic transcription) that should be matched as
+ closely as possible (without sounding affected in the target
+ language)
+ o Text would be best in either the UTF-8 text, or the code
+ points in hex HTML. E.g. either of the following:
+ + "Юникод"
+ + "Юникод"
+ + Note: for / supplementary characters/
+ ,
+ there should be one hex number per code point, not two
+ surrogates
+ :
+ # 𐀀 /*not*/ &xDC00;
+ o If you have a good font, I'd also appreciate a GIF. It
+ should be *96 x 24* bits, with the text centered, in black
+ on white (plus grays if smoothed).
+ * *Other Comments*
+ o Because some browsers won't handle the text, both text and
+ GIF image are supplied. If you can’t read the text columns,
+ see Display Problems
+ .
+ o The Chinese versions (inc. Bopomofo) are translations, not
+ transcriptions, since "transcription in Chinese is pretty
+ lame" [J. Becker].
+ o There are other "translations" of Unicode that may be in
+ use, such as the Vietnamese "Thống Nhất Mã".
+ o For sample pages in different languages on the Unicode site,
+ see What is Unicode?
+
+ o Americans are not generally used to IPA, and find a variety
+ of different systems in their dictionaries. This one leaves
+ the base letters as they are, and uses diacritics for
+ pronunciation.
+ * *Etymology of /Unicode/*
+ o Coined by J. Becker. Not related to previous usages, such as:
+ + A telegraphic code in which one word or set of letters
+ represents a sentence or phrase; a telegram or message
+ in this. (late 19th century, OED)
+ o According to my references, the prefix "uni" is directly
+ from Latin while the word "code" is through French.
+ o The original Indo-European apparently would have been
+ *oino-kau-do ("one strike give"): *kau apparently being
+ related to such English words as: hew, haggle, hoe, hag,
+ hay, hack, caudad, caudal, caudate, caudex, coda, codex,
+ codicil, coward, incus, and Kovač (personal name: "smith").
+ + I will leave the exact derivations to the exegetes,
+ but I like the association with "haggle" myself.
+ * *Contributions*
+ o This draws on contributions or comments from:
+ + Dixon Au
+ + Joe Becker
+ + Maurice Bauhahn
+ + Abel Cheung
+ + Peter Constable
+ + Michael Everson
+ + Christopher John Fynn
+ + Michael Kaplan
+ + George Kiraz
+ + Abdul Malik
+ + Siva Nataraja
+ + Roozbeh Pournader
+ + Jonathan Rosenne
+ + Jungshik Shin
+
+------------------------------------------------------------------------
+
+
+Terms of Use . Last updated:
+MED - 04/20/2003 15:30:33.
+
+
+
+
diff --git a/v2_1/test_data/utf8samples/big.txt b/v2_1/test_data/utf8samples/big.txt
new file mode 100644
index 0000000..1038482
--- /dev/null
+++ b/v2_1/test_data/utf8samples/big.txt
@@ -0,0 +1,420160 @@
+Sentences that contain all letters commonly used in a language
+--------------------------------------------------------------
+
+Markus Kuhn -- 2001-09-02
+
+This file is UTF-8 encoded.
+
+
+Danish (da)
+---------
+
+ Quizdeltagerne spiste jordbær med fløde, mens cirkusklovnen
+ Wolther spillede på xylofon.
+ (= Quiz contestants were eating strawbery with cream while Wolther
+ the circus clown played on xylophone.)
+
+German (de)
+-----------
+
+ Falsches Üben von Xylophonmusik quält jeden größeren Zwerg
+ (= Wrongful practicing of xylophone music tortures every larger dwarf)
+
+ Zwölf Boxkämpfer jagten Eva quer über den Sylter Deich
+ (= Twelve boxing fighters hunted Eva across the dike of Sylt)
+
+ Heizölrückstoßabdämpfung
+ (= fuel oil recoil absorber)
+ (jqvwxy missing, but all non-ASCII letters in one word)
+
+English (en)
+------------
+
+ The quick brown fox jumps over the lazy dog
+
+Spanish (es)
+------------
+
+ El pingüino Wenceslao hizo kilómetros bajo exhaustiva lluvia y
+ frío, añoraba a su querido cachorro.
+ (Contains every letter and every accent, but not every combination
+ of vowel + acute.)
+
+French (fr)
+-----------
+
+ Portez ce vieux whisky au juge blond qui fume sur son île intérieure, à
+ côté de l'alcôve ovoïde, où les bûches se consument dans l'âtre, ce
+ qui lui permet de penser à la cænogenèse de l'être dont il est question
+ dans la cause ambiguë entendue à Moÿ, dans un capharnaüm qui,
+ pense-t-il, diminue çà et là la qualité de son œuvre.
+
+ l'île exiguë
+ Où l'obèse jury mûr
+ Fête l'haï volapük,
+ Âne ex aéquo au whist,
+ Ôtez ce vœu déçu.
+
+ Le cœur déçu mais l'âme plutôt naïve, Louÿs rêva de crapaüter en
+ canoë au delà des îles, près du mälström où brûlent les novæ.
+
+Irish Gaelic (ga)
+-----------------
+
+ D'fhuascail Íosa, Úrmhac na hÓighe Beannaithe, pór Éava agus Ádhaimh
+
+Hungarian (hu)
+--------------
+
+ Árvíztűrő tükörfúrógép
+ (= flood-proof mirror-drilling machine, only all non-ASCII letters)
+
+Icelandic (is)
+--------------
+
+ Kæmi ný öxi hér ykist þjófum nú bæði víl og ádrepa
+
+ Sævör grét áðan því úlpan var ónýt
+ (some ASCII letters missing)
+
+Japanese (jp)
+-------------
+
+ Hiragana: (Iroha)
+
+ いろはにほへとちりぬるを
+ わかよたれそつねならむ
+ うゐのおくやまけふこえて
+ あさきゆめみしゑひもせす
+
+ Katakana:
+
+ イロハニホヘト チリヌルヲ ワカヨタレソ ツネナラム
+ ウヰノオクヤマ ケフコエテ アサキユメミシ ヱヒモセスン
+
+Hebrew (iw)
+-----------
+
+ ? דג סקרן שט בים מאוכזב ולפתע מצא לו חברה איך הקליטה
+
+Polish (pl)
+-----------
+
+ Pchnąć w tę łódź jeża lub ośm skrzyń fig
+ (= To push a hedgehog or eight bins of figs in this boat)
+
+Russian (ru)
+------------
+
+ В чащах юга жил бы цитрус? Да, но фальшивый экземпляр!
+ (= Would a citrus live in the bushes of south? Yes, but only a fake one!)
+
+Thai (th)
+---------
+
+ [--------------------------|------------------------]
+ ๏ เป็นมนุษย์สุดประเสริฐเลิศคุณค่า กว่าบรรดาฝูงสัตว์เดรัจฉาน
+ จงฝ่าฟันพัฒนาวิชาการ อย่าล้างผลาญฤๅเข่นฆ่าบีฑาใคร
+ ไม่ถือโทษโกรธแช่งซัดฮึดฮัดด่า หัดอภัยเหมือนกีฬาอัชฌาสัย
+ ปฏิบัติประพฤติกฎกำหนดใจ พูดจาให้จ๊ะๆ จ๋าๆ น่าฟังเอย ฯ
+
+ [The copyright for the Thai example is owned by The Computer
+ Association of Thailand under the Royal Patronage of His Majesty the
+ King.]
+
+Please let me know if you find others! Special thanks to the people
+from all over the world who contributed these sentences.
+? *Unicode Transcriptions* Notes <#Notes>
+
+Glyphs | Samples
+ | Charts
+ | UTF
+ | Forms
+ |
+Home .
+
+
+Name Text Image
+Arabic (Arabic) يونِكود ?
+Arabic (Persian) یونیکُد / ?/
+Armenian Յունիկօդ
+Bengali য়ূনিকোড
+Bopomofo ㄊㄨㄥ˅ ㄧˋ ㄇㄚ˅
+ㄨㄢˋ ㄍㄨㄛˊ ㄇㄚ˅
+Braille
+Buhid
+Canadian Aboriginal ᔫᗂᑰᑦ
+Cherokee ᏳᏂᎪᏛ
+Cypriot
+Cyrillic (Russian) Юникод ?
+Deseret (English) ???????
+Devanagari (Hindi) यूनिकोड ?
+Ethiopic ዩኒኮድ
+Georgian უნიკოდი ?
+Gothic
+Greek Γιούνικοντ
+Gujarati યૂનિકોડ
+Gurmukhi ਯੂਨਿਕੋਡ
+Han (Chinese) 统一码 ?
+統一碼 ?
+万国码 ?
+萬國碼 ?
+Hangul 유니코드
+Hanunoo
+Hebrew יוניקוד
+Hebrew (pointed) יוּנִיקוׁד
+Hebrew (Yiddish) יוניקאָד ?
+Hiragana (Japanese) ゆにこおど
+Katakana (Japanese) ユニコード ?
+Kannada ಯೂನಿಕೋಡ್
+Khmer យូនីគោដ
+Lao
+Latin Unicode Unicode
+Latin (IPA <#English_Pronunciation>) ˈjunɪˌkoːd ?
+Latin (Am. Dict. <#American_Dictionary>) Ūnĭcōde̽ ?
+Limbu
+Linear B
+Malayalam യൂനികോഡ്
+Mongolian
+Myanmar
+Ogham ᚔᚒᚅᚔᚉᚑᚇ / /
+Old Italic
+Oriya ୟୂନିକୋଡ
+Osmanya
+Runic (Anglo-Saxon) ᛡᚢᚾᛁᚳᚩᛞ
+Shavian
+Sinhala යණනිකෞද්
+Syriac ܝܘܢܝܩܘܕ
+Tagbanwa
+Tagalog
+Tai Le
+Tamil யூனிகோட்
+Telugu యూనికోడ్
+Thaana
+Thai ยูนืโคด
+Tibetan (Dzongkha) ཨུ་ནི་ཀོཌྲ།
+Ugaritic
+Yi
+
+
+ Notes:
+
+There are different ways to transcribe the word “Unicode”, depending on
+the language and script. In some cases there is only one language that
+customarily uses a given script; in others there are many languages. The
+goal here is at a minimum to collect at least one transcription for each
+script in a language customarily written in that script, with more
+languages if possible. If the transcription is the same for multiple
+languages in a script, then a single representative language is used.
+
+Still missing are transcriptions for the items above in RED (in at least
+one language). I would appreciate any other transcriptions, or
+corrections for the ones listed here. Send to mark3@macchiato.com
+, using the directions below:
+
+ * *Supplying Missing Items*
+ o Most Latin-script languages will follow the spelling, and
+ change the pronunciation. For any that would not, it would
+ be good to have the alternate spelling.
+ o For non-Latin scripts the goal is to match the English
+ pronunciation — /*not*/ spelling. Above is the IPA <#IPA>
+ (in phonemic transcription) that should be matched as
+ closely as possible (without sounding affected in the target
+ language)
+ o Text would be best in either the UTF-8 text, or the code
+ points in hex HTML. E.g. either of the following:
+ + "Юникод"
+ + "Юникод"
+ + Note: for / supplementary characters/
+ ,
+ there should be one hex number per code point, not two
+ surrogates
+ :
+ # 𐀀 /*not*/ &xDC00;
+ o If you have a good font, I'd also appreciate a GIF. It
+ should be *96 x 24* bits, with the text centered, in black
+ on white (plus grays if smoothed).
+ * *Other Comments*
+ o Because some browsers won't handle the text, both text and
+ GIF image are supplied. If you can’t read the text columns,
+ see Display Problems
+ .
+ o The Chinese versions (inc. Bopomofo) are translations, not
+ transcriptions, since "transcription in Chinese is pretty
+ lame" [J. Becker].
+ o There are other "translations" of Unicode that may be in
+ use, such as the Vietnamese "Thống Nhất Mã".
+ o For sample pages in different languages on the Unicode site,
+ see What is Unicode?
+
+ o Americans are not generally used to IPA, and find a variety
+ of different systems in their dictionaries. This one leaves
+ the base letters as they are, and uses diacritics for
+ pronunciation.
+ * *Etymology of /Unicode/*
+ o Coined by J. Becker. Not related to previous usages, such as:
+ + A telegraphic code in which one word or set of letters
+ represents a sentence or phrase; a telegram or message
+ in this. (late 19th century, OED)
+ o According to my references, the prefix "uni" is directly
+ from Latin while the word "code" is through French.
+ o The original Indo-European apparently would have been
+ *oino-kau-do ("one strike give"): *kau apparently being
+ related to such English words as: hew, haggle, hoe, hag,
+ hay, hack, caudad, caudal, caudate, caudex, coda, codex,
+ codicil, coward, incus, and Kovač (personal name: "smith").
+ + I will leave the exact derivations to the exegetes,
+ but I like the association with "haggle" myself.
+ * *Contributions*
+ o This draws on contributions or comments from:
+ + Dixon Au
+ + Joe Becker
+ + Maurice Bauhahn
+ + Abel Cheung
+ + Peter Constable
+ + Michael Everson
+ + Christopher John Fynn
+ + Michael Kaplan
+ + George Kiraz
+ + Abdul Malik
+ + Siva Nataraja
+ + Roozbeh Pournader
+ + Jonathan Rosenne
+ + Jungshik Shin
+
+------------------------------------------------------------------------
+
+
+Terms of Use . Last updated:
+MED - 04/20/2003 15:30:33.
+
+
+
+
+
+UTF-8 encoded sample plain-text file
+‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾
+
+Markus Kuhn [ˈmaʳkʊs kuːn] — 2002-07-25
+
+
+The ASCII compatible UTF-8 encoding used in this plain-text file
+is defined in Unicode, ISO 10646-1, and RFC 2279.
+
+
+Using Unicode/UTF-8, you can write in emails and source code things such as
+
+Mathematics and sciences:
+
+ ∮ E⋅da = Q, n → ∞, ∑ f(i) = ∏ g(i), ⎧⎡⎛┌─────┐⎞⎤⎫
+ ⎪⎢⎜│a²+b³ ⎟⎥⎪
+ ∀x∈ℝ: ⌈x⌉ = −⌊−x⌋, α ∧ ¬β = ¬(¬α ∨ β), ⎪⎢⎜│───── ⎟⎥⎪
+ ⎪⎢⎜⎷ c₈ ⎟⎥⎪
+ ℕ ⊆ ℕ₀ ⊂ ℤ ⊂ ℚ ⊂ ℝ ⊂ ℂ, ⎨⎢⎜ ⎟⎥⎬
+ ⎪⎢⎜ ∞ ⎟⎥⎪
+ ⊥ < a ≠ b ≡ c ≤ d ≪ ⊤ ⇒ (⟦A⟧ ⇔ ⟪B⟫), ⎪⎢⎜ ⎲ ⎟⎥⎪
+ ⎪⎢⎜ ⎳aⁱ-bⁱ⎟⎥⎪
+ 2H₂ + O₂ ⇌ 2H₂O, R = 4.7 kΩ, ⌀ 200 mm ⎩⎣⎝i=1 ⎠⎦⎭
+
+Linguistics and dictionaries:
+
+ ði ıntəˈnæʃənəl fəˈnɛtık əsoʊsiˈeıʃn
+ Y [ˈʏpsilɔn], Yen [jɛn], Yoga [ˈjoːgɑ]
+
+APL:
+
+ ((V⍳V)=⍳⍴V)/V←,V ⌷←⍳→⍴∆∇⊃‾⍎⍕⌈
+
+Nicer typography in plain text files:
+
+ ╔══════════════════════════════════════════╗
+ ║ ║
+ ║ • ‘single’ and “double” quotes ║
+ ║ ║
+ ║ • Curly apostrophes: “We’ve been here” ║
+ ║ ║
+ ║ • Latin-1 apostrophe and accents: '´` ║
+ ║ ║
+ ║ • ‚deutsche‘ „Anführungszeichen“ ║
+ ║ ║
+ ║ • †, ‡, ‰, •, 3–4, —, −5/+5, ™, … ║
+ ║ ║
+ ║ • ASCII safety test: 1lI|, 0OD, 8B ║
+ ║ ╭─────────╮ ║
+ ║ • the euro symbol: │ 14.95 € │ ║
+ ║ ╰─────────╯ ║
+ ╚══════════════════════════════════════════╝
+
+Combining characters:
+
+ STARGΛ̊TE SG-1, a = v̇ = r̈, a⃑ ⊥ b⃑
+
+Greek (in Polytonic):
+
+ The Greek anthem:
+
+ Σὲ γνωρίζω ἀπὸ τὴν κόψη
+ τοῦ σπαθιοῦ τὴν τρομερή,
+ σὲ γνωρίζω ἀπὸ τὴν ὄψη
+ ποὺ μὲ βία μετράει τὴ γῆ.
+
+ ᾿Απ᾿ τὰ κόκκαλα βγαλμένη
+ τῶν ῾Ελλήνων τὰ ἱερά
+ καὶ σὰν πρῶτα ἀνδρειωμένη
+ χαῖρε, ὦ χαῖρε, ᾿Ελευθεριά!
+
+ From a speech of Demosthenes in the 4th century BC:
+
+ Οὐχὶ ταὐτὰ παρίσταταί μοι γιγνώσκειν, ὦ ἄνδρες ᾿Αθηναῖοι,
+ ὅταν τ᾿ εἰς τὰ πράγματα ἀποβλέψω καὶ ὅταν πρὸς τοὺς
+ λόγους οὓς ἀκούω· τοὺς μὲν γὰρ λόγους περὶ τοῦ
+ τιμωρήσασθαι Φίλιππον ὁρῶ γιγνομένους, τὰ δὲ πράγματ᾿
+ εἰς τοῦτο προήκοντα, ὥσθ᾿ ὅπως μὴ πεισόμεθ᾿ αὐτοὶ
+ πρότερον κακῶς σκέψασθαι δέον. οὐδέν οὖν ἄλλο μοι δοκοῦσιν
+ οἱ τὰ τοιαῦτα λέγοντες ἢ τὴν ὑπόθεσιν, περὶ ἧς βουλεύεσθαι,
+ οὐχὶ τὴν οὖσαν παριστάντες ὑμῖν ἁμαρτάνειν. ἐγὼ δέ, ὅτι μέν
+ ποτ᾿ ἐξῆν τῇ πόλει καὶ τὰ αὑτῆς ἔχειν ἀσφαλῶς καὶ Φίλιππον
+ τιμωρήσασθαι, καὶ μάλ᾿ ἀκριβῶς οἶδα· ἐπ᾿ ἐμοῦ γάρ, οὐ πάλαι
+ γέγονεν ταῦτ᾿ ἀμφότερα· νῦν μέντοι πέπεισμαι τοῦθ᾿ ἱκανὸν
+ προλαβεῖν ἡμῖν εἶναι τὴν πρώτην, ὅπως τοὺς συμμάχους
+ σώσομεν. ἐὰν γὰρ τοῦτο βεβαίως ὑπάρξῃ, τότε καὶ περὶ τοῦ
+ τίνα τιμωρήσεταί τις καὶ ὃν τρόπον ἐξέσται σκοπεῖν· πρὶν δὲ
+ τὴν ἀρχὴν ὀρθῶς ὑποθέσθαι, μάταιον ἡγοῦμαι περὶ τῆς
+ τελευτῆς ὁντινοῦν ποιεῖσθαι λόγον.
+
+ Δημοσθένους, Γ´ ᾿Ολυνθιακὸς
+
+Georgian:
+
+ From a Unicode conference invitation:
+
+ გთხოვთ ახლავე გაიაროთ რეგისტრაცია Unicode-ის მეათე საერთაშორისო
+ კონფერენციაზე დასასწრებად, რომელიც გაიმართება 10-12 მარტს,
+ ქ. მაინცში, გერმანიაში. კონფერენცია შეჰკრებს ერთად მსოფლიოს
+ ექსპერტებს ისეთ დარგებში როგორიცაა ინტერნეტი და Unicode-ი,
+ ინტერნაციონალიზაცია და ლოკალიზაცია, Unicode-ის გამოყენება
+ ოპერაციულ სისტემებსა, და გამოყენებით პროგრამებში, შრიფტებში,
+ ტექსტების დამუშავებასა და მრავალენოვან კომპიუტერულ სისტემებში.
+
+Russian:
+
+ From a Unicode conference invitation:
+
+ Зарегистрируйтесь сейчас на Десятую Международную Конференцию по
+ Unicode, которая состоится 10-12 марта 1997 года в Майнце в Германии.
+ Конференция соберет широкий круг экспертов по вопросам глобального
+ Интернета и Unicode, локализации и интернационализации, воплощению и
+ применению Unicode в различных операционных системах и программных
+ приложениях, шрифтах, верстке и многоязычных компьютерных системах.
+
+Thai (UCS Level 2):
+
+ Excerpt from a poetry on The Romance of The Three Kingdoms (a Chinese
+ classic 'San Gua'):
+
+ [----------------------------|------------------------]
+ ๏ แผ่นดินฮั่นเสื่อมโทรมแสนสังเวช พระปกเกศกองบู๊กู้ขึ้นใหม่
+ สิบสองกษัตริย์ก่อนหน้าแลถัดไป สององค์ไซร้โง่เขลาเบาปัญญา
+ ทรงนับถือขันทีเป็นที่พึ่ง บ้านเมืองจึงวิปริตเป็นนักหนา
+ โฮจิ๋นเรียกทัพทั่วหัวเมืองมา หมายจะฆ่ามดชั่วตัวสำคัญ
+ เหมือนขับไสไล่เสือจากเคหา รับหมาป่าเข้ามาเลยอาสัญ
+ ฝ่ายอ้องอุ้นยุแยกให้แตกกัน ใช้สาวนั้นเป็นชนวนชื่นชวนใจ
+ พลันลิฉุยกุยกีกลับก่อเหตุ ช่างอาเพศจริงหนาฟ้าร้องไห้
+ ต้องรบราฆ่าฟันจนบรรลัย ฤๅหาใครค้ำชูกู้บรรลังก์ ฯ
+
+ (The above is a two-column text. If combining characters are handled
+ correctly, the lines of the second column should be aligned with the
+ | character above.)
+
+Ethiopian:
+
+ Proverbs in the Amharic language:
+
+ ሰማይ አይታረስ ንጉሥ አይከሰስ።
+ ብላ ካለኝ እንደአባቴ በቆመጠኝ።
+ ጌጥ ያለቤቱ ቁምጥና ነው።
+ ደሀ በሕልሙ ቅቤ ባይጠጣ ንጣት በገደለው።
+ የአፍ ወለምታ በቅቤ አይታሽም።
+ አይጥ በበላ ዳዋ ተመታ።
+ ሲተረጉሙ ይደረግሙ።
+ ቀስ በቀስ፥ ዕንቁላል በእግሩ ይሄዳል።
+ ድር ቢያብር አንበሳ ያስር።
+ ሰው እንደቤቱ እንጅ እንደ ጉረቤቱ አይተዳደርም።
+ እግዜር የከፈተውን ጉሮሮ ሳይዘጋው አይድርም።
+ የጎረቤት ሌባ፥ ቢያዩት ይስቅ ባያዩት ያጠልቅ።
+ ሥራ ከመፍታት ልጄን ላፋታት።
+ ዓባይ ማደሪያ የለው፥ ግንድ ይዞ ይዞራል።
+ የእስላም አገሩ መካ የአሞራ አገሩ ዋርካ።
+ ተንጋሎ ቢተፉ ተመልሶ ባፉ።
+ ወዳጅህ ማር ቢሆን ጨርስህ አትላሰው።
+ እግርህን በፍራሽህ ልክ ዘርጋ።
+
+Runes:
+
+ ᚻᛖ ᚳᚹᚫᚦ ᚦᚫᛏ ᚻᛖ ᛒᚢᛞᛖ ᚩᚾ ᚦᚫᛗ ᛚᚪᚾᛞᛖ ᚾᚩᚱᚦᚹᛖᚪᚱᛞᚢᛗ ᚹᛁᚦ ᚦᚪ ᚹᛖᛥᚫ
+
+ (Old English, which transcribed into Latin reads 'He cwaeth that he
+ bude thaem lande northweardum with tha Westsae.' and means 'He said
+ that he lived in the northern land near the Western Sea.')
+
+Braille:
+
+ ⡌⠁⠧⠑ ⠼⠁⠒ ⡍⠜⠇⠑⠹⠰⠎ ⡣⠕⠌
+
+ ⡍⠜⠇⠑⠹ ⠺⠁⠎ ⠙⠑⠁⠙⠒ ⠞⠕ ⠃⠑⠛⠔ ⠺⠊⠹⠲ ⡹⠻⠑ ⠊⠎ ⠝⠕ ⠙⠳⠃⠞
+ ⠱⠁⠞⠑⠧⠻ ⠁⠃⠳⠞ ⠹⠁⠞⠲ ⡹⠑ ⠗⠑⠛⠊⠌⠻ ⠕⠋ ⠙⠊⠎ ⠃⠥⠗⠊⠁⠇ ⠺⠁⠎
+ ⠎⠊⠛⠝⠫ ⠃⠹ ⠹⠑ ⠊⠇⠻⠛⠹⠍⠁⠝⠂ ⠹⠑ ⠊⠇⠻⠅⠂ ⠹⠑ ⠥⠝⠙⠻⠞⠁⠅⠻⠂
+ ⠁⠝⠙ ⠹⠑ ⠡⠊⠑⠋ ⠍⠳⠗⠝⠻⠲ ⡎⠊⠗⠕⠕⠛⠑ ⠎⠊⠛⠝⠫ ⠊⠞⠲ ⡁⠝⠙
+ ⡎⠊⠗⠕⠕⠛⠑⠰⠎ ⠝⠁⠍⠑ ⠺⠁⠎ ⠛⠕⠕⠙ ⠥⠏⠕⠝ ⠰⡡⠁⠝⠛⠑⠂ ⠋⠕⠗ ⠁⠝⠹⠹⠔⠛ ⠙⠑
+ ⠡⠕⠎⠑ ⠞⠕ ⠏⠥⠞ ⠙⠊⠎ ⠙⠁⠝⠙ ⠞⠕⠲
+
+ ⡕⠇⠙ ⡍⠜⠇⠑⠹ ⠺⠁⠎ ⠁⠎ ⠙⠑⠁⠙ ⠁⠎ ⠁ ⠙⠕⠕⠗⠤⠝⠁⠊⠇⠲
+
+ ⡍⠔⠙⠖ ⡊ ⠙⠕⠝⠰⠞ ⠍⠑⠁⠝ ⠞⠕ ⠎⠁⠹ ⠹⠁⠞ ⡊ ⠅⠝⠪⠂ ⠕⠋ ⠍⠹
+ ⠪⠝ ⠅⠝⠪⠇⠫⠛⠑⠂ ⠱⠁⠞ ⠹⠻⠑ ⠊⠎ ⠏⠜⠞⠊⠊⠥⠇⠜⠇⠹ ⠙⠑⠁⠙ ⠁⠃⠳⠞
+ ⠁ ⠙⠕⠕⠗⠤⠝⠁⠊⠇⠲ ⡊ ⠍⠊⠣⠞ ⠙⠁⠧⠑ ⠃⠑⠲ ⠔⠊⠇⠔⠫⠂ ⠍⠹⠎⠑⠇⠋⠂ ⠞⠕
+ ⠗⠑⠛⠜⠙ ⠁ ⠊⠕⠋⠋⠔⠤⠝⠁⠊⠇ ⠁⠎ ⠹⠑ ⠙⠑⠁⠙⠑⠌ ⠏⠊⠑⠊⠑ ⠕⠋ ⠊⠗⠕⠝⠍⠕⠝⠛⠻⠹
+ ⠔ ⠹⠑ ⠞⠗⠁⠙⠑⠲ ⡃⠥⠞ ⠹⠑ ⠺⠊⠎⠙⠕⠍ ⠕⠋ ⠳⠗ ⠁⠝⠊⠑⠌⠕⠗⠎
+ ⠊⠎ ⠔ ⠹⠑ ⠎⠊⠍⠊⠇⠑⠆ ⠁⠝⠙ ⠍⠹ ⠥⠝⠙⠁⠇⠇⠪⠫ ⠙⠁⠝⠙⠎
+ ⠩⠁⠇⠇ ⠝⠕⠞ ⠙⠊⠌⠥⠗⠃ ⠊⠞⠂ ⠕⠗ ⠹⠑ ⡊⠳⠝⠞⠗⠹⠰⠎ ⠙⠕⠝⠑ ⠋⠕⠗⠲ ⡹⠳
+ ⠺⠊⠇⠇ ⠹⠻⠑⠋⠕⠗⠑ ⠏⠻⠍⠊⠞ ⠍⠑ ⠞⠕ ⠗⠑⠏⠑⠁⠞⠂ ⠑⠍⠏⠙⠁⠞⠊⠊⠁⠇⠇⠹⠂ ⠹⠁⠞
+ ⡍⠜⠇⠑⠹ ⠺⠁⠎ ⠁⠎ ⠙⠑⠁⠙ ⠁⠎ ⠁ ⠙⠕⠕⠗⠤⠝⠁⠊⠇⠲
+
+ (The first couple of paragraphs of "A Christmas Carol" by Dickens)
+
+Compact font selection example text:
+
+ ABCDEFGHIJKLMNOPQRSTUVWXYZ /0123456789
+ abcdefghijklmnopqrstuvwxyz £©µÀÆÖÞßéöÿ
+ –—‘“”„†•…‰™œŠŸž€ ΑΒΓΔΩαβγδω АБВГДабвгд
+ ∀∂∈ℝ∧∪≡∞ ↑↗↨↻⇣ ┐┼╔╘░►☺♀ fi�⑀₂ἠḂӥẄɐː⍎אԱა
+
+Greetings in various languages:
+
+ Hello world, Καλημέρα κόσμε, コンニチハ
+
+Box drawing alignment tests: █
+ ▉
+ ╔══╦══╗ ┌──┬──┐ ╭──┬──╮ ╭──┬──╮ ┏━━┳━━┓ ┎┒┏┑ ╷ ╻ ┏┯┓ ┌┰┐ ▊ ╱╲╱╲╳╳╳
+ ║┌─╨─┐║ │╔═╧═╗│ │╒═╪═╕│ │╓─╁─╖│ ┃┌─╂─┐┃ ┗╃╄┙ ╶┼╴╺╋╸┠┼┨ ┝╋┥ ▋ ╲╱╲╱╳╳╳
+ ║│╲ ╱│║ │║ ║│ ││ │ ││ │║ ┃ ║│ ┃│ ╿ │┃ ┍╅╆┓ ╵ ╹ ┗┷┛ └┸┘ ▌ ╱╲╱╲╳╳╳
+ ╠╡ ╳ ╞╣ ├╢ ╟┤ ├┼─┼─┼┤ ├╫─╂─╫┤ ┣┿╾┼╼┿┫ ┕┛┖┚ ┌┄┄┐ ╎ ┏┅┅┓ ┋ ▍ ╲╱╲╱╳╳╳
+ ║│╱ ╲│║ │║ ║│ ││ │ ││ │║ ┃ ║│ ┃│ ╽ │┃ ░░▒▒▓▓██ ┊ ┆ ╎ ╏ ┇ ┋ ▎
+ ║└─╥─┘║ │╚═╤═╝│ │╘═╪═╛│ │╙─╀─╜│ ┃└─╂─┘┃ ░░▒▒▓▓██ ┊ ┆ ╎ ╏ ┇ ┋ ▏
+ ╚══╩══╝ └──┴──┘ ╰──┴──╯ ╰──┴──╯ ┗━━┻━━┛ ▗▄▖▛▀▜ └╌╌┘ ╎ ┗╍╍┛ ┋ ▁▂▃▄▅▆▇█
+ ▝▀▘▙▄▟
+Sentences that contain all letters commonly used in a language
+--------------------------------------------------------------
+
+Markus Kuhn -- 2001-09-02
+
+This file is UTF-8 encoded.
+
+
+Danish (da)
+---------
+
+ Quizdeltagerne spiste jordbær med fløde, mens cirkusklovnen
+ Wolther spillede på xylofon.
+ (= Quiz contestants were eating strawbery with cream while Wolther
+ the circus clown played on xylophone.)
+
+German (de)
+-----------
+
+ Falsches Üben von Xylophonmusik quält jeden größeren Zwerg
+ (= Wrongful practicing of xylophone music tortures every larger dwarf)
+
+ Zwölf Boxkämpfer jagten Eva quer über den Sylter Deich
+ (= Twelve boxing fighters hunted Eva across the dike of Sylt)
+
+ Heizölrückstoßabdämpfung
+ (= fuel oil recoil absorber)
+ (jqvwxy missing, but all non-ASCII letters in one word)
+
+English (en)
+------------
+
+ The quick brown fox jumps over the lazy dog
+
+Spanish (es)
+------------
+
+ El pingüino Wenceslao hizo kilómetros bajo exhaustiva lluvia y
+ frío, añoraba a su querido cachorro.
+ (Contains every letter and every accent, but not every combination
+ of vowel + acute.)
+
+French (fr)
+-----------
+
+ Portez ce vieux whisky au juge blond qui fume sur son île intérieure, à
+ côté de l'alcôve ovoïde, où les bûches se consument dans l'âtre, ce
+ qui lui permet de penser à la cænogenèse de l'être dont il est question
+ dans la cause ambiguë entendue à Moÿ, dans un capharnaüm qui,
+ pense-t-il, diminue çà et là la qualité de son œuvre.
+
+ l'île exiguë
+ Où l'obèse jury mûr
+ Fête l'haï volapük,
+ Âne ex aéquo au whist,
+ Ôtez ce vœu déçu.
+
+ Le cœur déçu mais l'âme plutôt naïve, Louÿs rêva de crapaüter en
+ canoë au delà des îles, près du mälström où brûlent les novæ.
+
+Irish Gaelic (ga)
+-----------------
+
+ D'fhuascail Íosa, Úrmhac na hÓighe Beannaithe, pór Éava agus Ádhaimh
+
+Hungarian (hu)
+--------------
+
+ Árvíztűrő tükörfúrógép
+ (= flood-proof mirror-drilling machine, only all non-ASCII letters)
+
+Icelandic (is)
+--------------
+
+ Kæmi ný öxi hér ykist þjófum nú bæði víl og ádrepa
+
+ Sævör grét áðan því úlpan var ónýt
+ (some ASCII letters missing)
+
+Japanese (jp)
+-------------
+
+ Hiragana: (Iroha)
+
+ いろはにほへとちりぬるを
+ わかよたれそつねならむ
+ うゐのおくやまけふこえて
+ あさきゆめみしゑひもせす
+
+ Katakana:
+
+ イロハニホヘト チリヌルヲ ワカヨタレソ ツネナラム
+ ウヰノオクヤマ ケフコエテ アサキユメミシ ヱヒモセスン
+
+Hebrew (iw)
+-----------
+
+ ? דג סקרן שט בים מאוכזב ולפתע מצא לו חברה איך הקליטה
+
+Polish (pl)
+-----------
+
+ Pchnąć w tę łódź jeża lub ośm skrzyń fig
+ (= To push a hedgehog or eight bins of figs in this boat)
+
+Russian (ru)
+------------
+
+ В чащах юга жил бы цитрус? Да, но фальшивый экземпляр!
+ (= Would a citrus live in the bushes of south? Yes, but only a fake one!)
+
+Thai (th)
+---------
+
+ [--------------------------|------------------------]
+ ๏ เป็นมนุษย์สุดประเสริฐเลิศคุณค่า กว่าบรรดาฝูงสัตว์เดรัจฉาน
+ จงฝ่าฟันพัฒนาวิชาการ อย่าล้างผลาญฤๅเข่นฆ่าบีฑาใคร
+ ไม่ถือโทษโกรธแช่งซัดฮึดฮัดด่า หัดอภัยเหมือนกีฬาอัชฌาสัย
+ ปฏิบัติประพฤติกฎกำหนดใจ พูดจาให้จ๊ะๆ จ๋าๆ น่าฟังเอย ฯ
+
+ [The copyright for the Thai example is owned by The Computer
+ Association of Thailand under the Royal Patronage of His Majesty the
+ King.]
+
+Please let me know if you find others! Special thanks to the people
+from all over the world who contributed these sentences.
+? *Unicode Transcriptions* Notes <#Notes>
+
+Glyphs | Samples
+ | Charts
+ | UTF
+ | Forms
+ |
+Home .
+
+
+Name Text Image
+Arabic (Arabic) يونِكود ?
+Arabic (Persian) یونیکُد / ?/
+Armenian Յունիկօդ
+Bengali য়ূনিকোড
+Bopomofo ㄊㄨㄥ˅ ㄧˋ ㄇㄚ˅
+ㄨㄢˋ ㄍㄨㄛˊ ㄇㄚ˅
+Braille
+Buhid
+Canadian Aboriginal ᔫᗂᑰᑦ
+Cherokee ᏳᏂᎪᏛ
+Cypriot
+Cyrillic (Russian) Юникод ?
+Deseret (English) ???????
+Devanagari (Hindi) यूनिकोड ?
+Ethiopic ዩኒኮድ
+Georgian უნიკოდი ?
+Gothic
+Greek Γιούνικοντ
+Gujarati યૂનિકોડ
+Gurmukhi ਯੂਨਿਕੋਡ
+Han (Chinese) 统一码 ?
+統一碼 ?
+万国码 ?
+萬國碼 ?
+Hangul 유니코드
+Hanunoo
+Hebrew יוניקוד
+Hebrew (pointed) יוּנִיקוׁד
+Hebrew (Yiddish) יוניקאָד ?
+Hiragana (Japanese) ゆにこおど
+Katakana (Japanese) ユニコード ?
+Kannada ಯೂನಿಕೋಡ್
+Khmer យូនីគោដ
+Lao
+Latin Unicode Unicode
+Latin (IPA <#English_Pronunciation>) ˈjunɪˌkoːd ?
+Latin (Am. Dict. <#American_Dictionary>) Ūnĭcōde̽ ?
+Limbu
+Linear B
+Malayalam യൂനികോഡ്
+Mongolian
+Myanmar
+Ogham ᚔᚒᚅᚔᚉᚑᚇ / /
+Old Italic
+Oriya ୟୂନିକୋଡ
+Osmanya
+Runic (Anglo-Saxon) ᛡᚢᚾᛁᚳᚩᛞ
+Shavian
+Sinhala යණනිකෞද්
+Syriac ܝܘܢܝܩܘܕ
+Tagbanwa
+Tagalog
+Tai Le
+Tamil யூனிகோட்
+Telugu యూనికోడ్
+Thaana
+Thai ยูนืโคด
+Tibetan (Dzongkha) ཨུ་ནི་ཀོཌྲ།
+Ugaritic
+Yi
+
+
+ Notes:
+
+There are different ways to transcribe the word “Unicode”, depending on
+the language and script. In some cases there is only one language that
+customarily uses a given script; in others there are many languages. The
+goal here is at a minimum to collect at least one transcription for each
+script in a language customarily written in that script, with more
+languages if possible. If the transcription is the same for multiple
+languages in a script, then a single representative language is used.
+
+Still missing are transcriptions for the items above in RED (in at least
+one language). I would appreciate any other transcriptions, or
+corrections for the ones listed here. Send to mark3@macchiato.com
+, using the directions below:
+
+ * *Supplying Missing Items*
+ o Most Latin-script languages will follow the spelling, and
+ change the pronunciation. For any that would not, it would
+ be good to have the alternate spelling.
+ o For non-Latin scripts the goal is to match the English
+ pronunciation — /*not*/ spelling. Above is the IPA <#IPA>
+ (in phonemic transcription) that should be matched as
+ closely as possible (without sounding affected in the target
+ language)
+ o Text would be best in either the UTF-8 text, or the code
+ points in hex HTML. E.g. either of the following:
+ + "Юникод"
+ + "Юникод"
+ + Note: for / supplementary characters/
+ ,
+ there should be one hex number per code point, not two
+ surrogates
+ :
+ # 𐀀 /*not*/ &xDC00;
+ o If you have a good font, I'd also appreciate a GIF. It
+ should be *96 x 24* bits, with the text centered, in black
+ on white (plus grays if smoothed).
+ * *Other Comments*
+ o Because some browsers won't handle the text, both text and
+ GIF image are supplied. If you can’t read the text columns,
+ see Display Problems
+ .
+ o The Chinese versions (inc. Bopomofo) are translations, not
+ transcriptions, since "transcription in Chinese is pretty
+ lame" [J. Becker].
+ o There are other "translations" of Unicode that may be in
+ use, such as the Vietnamese "Thống Nhất Mã".
+ o For sample pages in different languages on the Unicode site,
+ see What is Unicode?
+
+ o Americans are not generally used to IPA, and find a variety
+ of different systems in their dictionaries. This one leaves
+ the base letters as they are, and uses diacritics for
+ pronunciation.
+ * *Etymology of /Unicode/*
+ o Coined by J. Becker. Not related to previous usages, such as:
+ + A telegraphic code in which one word or set of letters
+ represents a sentence or phrase; a telegram or message
+ in this. (late 19th century, OED)
+ o According to my references, the prefix "uni" is directly
+ from Latin while the word "code" is through French.
+ o The original Indo-European apparently would have been
+ *oino-kau-do ("one strike give"): *kau apparently being
+ related to such English words as: hew, haggle, hoe, hag,
+ hay, hack, caudad, caudal, caudate, caudex, coda, codex,
+ codicil, coward, incus, and Kovač (personal name: "smith").
+ + I will leave the exact derivations to the exegetes,
+ but I like the association with "haggle" myself.
+ * *Contributions*
+ o This draws on contributions or comments from:
+ + Dixon Au
+ + Joe Becker
+ + Maurice Bauhahn
+ + Abel Cheung
+ + Peter Constable
+ + Michael Everson
+ + Christopher John Fynn
+ + Michael Kaplan
+ + George Kiraz
+ + Abdul Malik
+ + Siva Nataraja
+ + Roozbeh Pournader
+ + Jonathan Rosenne
+ + Jungshik Shin
+
+------------------------------------------------------------------------
+
+
+Terms of Use . Last updated:
+MED - 04/20/2003 15:30:33.
+
+
+
+
+
+UTF-8 encoded sample plain-text file
+‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾
+
+Markus Kuhn [ˈmaʳkʊs kuːn] — 2002-07-25
+
+
+The ASCII compatible UTF-8 encoding used in this plain-text file
+is defined in Unicode, ISO 10646-1, and RFC 2279.
+
+
+Using Unicode/UTF-8, you can write in emails and source code things such as
+
+Mathematics and sciences:
+
+ ∮ E⋅da = Q, n → ∞, ∑ f(i) = ∏ g(i), ⎧⎡⎛┌─────┐⎞⎤⎫
+ ⎪⎢⎜│a²+b³ ⎟⎥⎪
+ ∀x∈ℝ: ⌈x⌉ = −⌊−x⌋, α ∧ ¬β = ¬(¬α ∨ β), ⎪⎢⎜│───── ⎟⎥⎪
+ ⎪⎢⎜⎷ c₈ ⎟⎥⎪
+ ℕ ⊆ ℕ₀ ⊂ ℤ ⊂ ℚ ⊂ ℝ ⊂ ℂ, ⎨⎢⎜ ⎟⎥⎬
+ ⎪⎢⎜ ∞ ⎟⎥⎪
+ ⊥ < a ≠ b ≡ c ≤ d ≪ ⊤ ⇒ (⟦A⟧ ⇔ ⟪B⟫), ⎪⎢⎜ ⎲ ⎟⎥⎪
+ ⎪⎢⎜ ⎳aⁱ-bⁱ⎟⎥⎪
+ 2H₂ + O₂ ⇌ 2H₂O, R = 4.7 kΩ, ⌀ 200 mm ⎩⎣⎝i=1 ⎠⎦⎭
+
+Linguistics and dictionaries:
+
+ ði ıntəˈnæʃənəl fəˈnɛtık əsoʊsiˈeıʃn
+ Y [ˈʏpsilɔn], Yen [jɛn], Yoga [ˈjoːgɑ]
+
+APL:
+
+ ((V⍳V)=⍳⍴V)/V←,V ⌷←⍳→⍴∆∇⊃‾⍎⍕⌈
+
+Nicer typography in plain text files:
+
+ ╔══════════════════════════════════════════╗
+ ║ ║
+ ║ • ‘single’ and “double” quotes ║
+ ║ ║
+ ║ • Curly apostrophes: “We’ve been here” ║
+ ║ ║
+ ║ • Latin-1 apostrophe and accents: '´` ║
+ ║ ║
+ ║ • ‚deutsche‘ „Anführungszeichen“ ║
+ ║ ║
+ ║ • †, ‡, ‰, •, 3–4, —, −5/+5, ™, … ║
+ ║ ║
+ ║ • ASCII safety test: 1lI|, 0OD, 8B ║
+ ║ ╭─────────╮ ║
+ ║ • the euro symbol: │ 14.95 € │ ║
+ ║ ╰─────────╯ ║
+ ╚══════════════════════════════════════════╝
+
+Combining characters:
+
+ STARGΛ̊TE SG-1, a = v̇ = r̈, a⃑ ⊥ b⃑
+
+Greek (in Polytonic):
+
+ The Greek anthem:
+
+ Σὲ γνωρίζω ἀπὸ τὴν κόψη
+ τοῦ σπαθιοῦ τὴν τρομερή,
+ σὲ γνωρίζω ἀπὸ τὴν ὄψη
+ ποὺ μὲ βία μετράει τὴ γῆ.
+
+ ᾿Απ᾿ τὰ κόκκαλα βγαλμένη
+ τῶν ῾Ελλήνων τὰ ἱερά
+ καὶ σὰν πρῶτα ἀνδρειωμένη
+ χαῖρε, ὦ χαῖρε, ᾿Ελευθεριά!
+
+ From a speech of Demosthenes in the 4th century BC:
+
+ Οὐχὶ ταὐτὰ παρίσταταί μοι γιγνώσκειν, ὦ ἄνδρες ᾿Αθηναῖοι,
+ ὅταν τ᾿ εἰς τὰ πράγματα ἀποβλέψω καὶ ὅταν πρὸς τοὺς
+ λόγους οὓς ἀκούω· τοὺς μὲν γὰρ λόγους περὶ τοῦ
+ τιμωρήσασθαι Φίλιππον ὁρῶ γιγνομένους, τὰ δὲ πράγματ᾿
+ εἰς τοῦτο προήκοντα, ὥσθ᾿ ὅπως μὴ πεισόμεθ᾿ αὐτοὶ
+ πρότερον κακῶς σκέψασθαι δέον. οὐδέν οὖν ἄλλο μοι δοκοῦσιν
+ οἱ τὰ τοιαῦτα λέγοντες ἢ τὴν ὑπόθεσιν, περὶ ἧς βουλεύεσθαι,
+ οὐχὶ τὴν οὖσαν παριστάντες ὑμῖν ἁμαρτάνειν. ἐγὼ δέ, ὅτι μέν
+ ποτ᾿ ἐξῆν τῇ πόλει καὶ τὰ αὑτῆς ἔχειν ἀσφαλῶς καὶ Φίλιππον
+ τιμωρήσασθαι, καὶ μάλ᾿ ἀκριβῶς οἶδα· ἐπ᾿ ἐμοῦ γάρ, οὐ πάλαι
+ γέγονεν ταῦτ᾿ ἀμφότερα· νῦν μέντοι πέπεισμαι τοῦθ᾿ ἱκανὸν
+ προλαβεῖν ἡμῖν εἶναι τὴν πρώτην, ὅπως τοὺς συμμάχους
+ σώσομεν. ἐὰν γὰρ τοῦτο βεβαίως ὑπάρξῃ, τότε καὶ περὶ τοῦ
+ τίνα τιμωρήσεταί τις καὶ ὃν τρόπον ἐξέσται σκοπεῖν· πρὶν δὲ
+ τὴν ἀρχὴν ὀρθῶς ὑποθέσθαι, μάταιον ἡγοῦμαι περὶ τῆς
+ τελευτῆς ὁντινοῦν ποιεῖσθαι λόγον.
+
+ Δημοσθένους, Γ´ ᾿Ολυνθιακὸς
+
+Georgian:
+
+ From a Unicode conference invitation:
+
+ გთხოვთ ახლავე გაიაროთ რეგისტრაცია Unicode-ის მეათე საერთაშორისო
+ კონფერენციაზე დასასწრებად, რომელიც გაიმართება 10-12 მარტს,
+ ქ. მაინცში, გერმანიაში. კონფერენცია შეჰკრებს ერთად მსოფლიოს
+ ექსპერტებს ისეთ დარგებში როგორიცაა ინტერნეტი და Unicode-ი,
+ ინტერნაციონალიზაცია და ლოკალიზაცია, Unicode-ის გამოყენება
+ ოპერაციულ სისტემებსა, და გამოყენებით პროგრამებში, შრიფტებში,
+ ტექსტების დამუშავებასა და მრავალენოვან კომპიუტერულ სისტემებში.
+
+Russian:
+
+ From a Unicode conference invitation:
+
+ Зарегистрируйтесь сейчас на Десятую Международную Конференцию по
+ Unicode, которая состоится 10-12 марта 1997 года в Майнце в Германии.
+ Конференция соберет широкий круг экспертов по вопросам глобального
+ Интернета и Unicode, локализации и интернационализации, воплощению и
+ применению Unicode в различных операционных системах и программных
+ приложениях, шрифтах, верстке и многоязычных компьютерных системах.
+
+Thai (UCS Level 2):
+
+ Excerpt from a poetry on The Romance of The Three Kingdoms (a Chinese
+ classic 'San Gua'):
+
+ [----------------------------|------------------------]
+ ๏ แผ่นดินฮั่นเสื่อมโทรมแสนสังเวช พระปกเกศกองบู๊กู้ขึ้นใหม่
+ สิบสองกษัตริย์ก่อนหน้าแลถัดไป สององค์ไซร้โง่เขลาเบาปัญญา
+ ทรงนับถือขันทีเป็นที่พึ่ง บ้านเมืองจึงวิปริตเป็นนักหนา
+ โฮจิ๋นเรียกทัพทั่วหัวเมืองมา หมายจะฆ่ามดชั่วตัวสำคัญ
+ เหมือนขับไสไล่เสือจากเคหา รับหมาป่าเข้ามาเลยอาสัญ
+ ฝ่ายอ้องอุ้นยุแยกให้แตกกัน ใช้สาวนั้นเป็นชนวนชื่นชวนใจ
+ พลันลิฉุยกุยกีกลับก่อเหตุ ช่างอาเพศจริงหนาฟ้าร้องไห้
+ ต้องรบราฆ่าฟันจนบรรลัย ฤๅหาใครค้ำชูกู้บรรลังก์ ฯ
+
+ (The above is a two-column text. If combining characters are handled
+ correctly, the lines of the second column should be aligned with the
+ | character above.)
+
+Ethiopian:
+
+ Proverbs in the Amharic language:
+
+ ሰማይ አይታረስ ንጉሥ አይከሰስ።
+ ብላ ካለኝ እንደአባቴ በቆመጠኝ።
+ ጌጥ ያለቤቱ ቁምጥና ነው።
+ ደሀ በሕልሙ ቅቤ ባይጠጣ ንጣት በገደለው።
+ የአፍ ወለምታ በቅቤ አይታሽም።
+ አይጥ በበላ ዳዋ ተመታ።
+ ሲተረጉሙ ይደረግሙ።
+ ቀስ በቀስ፥ ዕንቁላል በእግሩ ይሄዳል።
+ ድር ቢያብር አንበሳ ያስር።
+ ሰው እንደቤቱ እንጅ እንደ ጉረቤቱ አይተዳደርም።
+ እግዜር የከፈተውን ጉሮሮ ሳይዘጋው አይድርም።
+ የጎረቤት ሌባ፥ ቢያዩት ይስቅ ባያዩት ያጠልቅ።
+ ሥራ ከመፍታት ልጄን ላፋታት።
+ ዓባይ ማደሪያ የለው፥ ግንድ ይዞ ይዞራል።
+ የእስላም አገሩ መካ የአሞራ አገሩ ዋርካ።
+ ተንጋሎ ቢተፉ ተመልሶ ባፉ።
+ ወዳጅህ ማር ቢሆን ጨርስህ አትላሰው።
+ እግርህን በፍራሽህ ልክ ዘርጋ።
+
+Runes:
+
+ ᚻᛖ ᚳᚹᚫᚦ ᚦᚫᛏ ᚻᛖ ᛒᚢᛞᛖ ᚩᚾ ᚦᚫᛗ ᛚᚪᚾᛞᛖ ᚾᚩᚱᚦᚹᛖᚪᚱᛞᚢᛗ ᚹᛁᚦ ᚦᚪ ᚹᛖᛥᚫ
+
+ (Old English, which transcribed into Latin reads 'He cwaeth that he
+ bude thaem lande northweardum with tha Westsae.' and means 'He said
+ that he lived in the northern land near the Western Sea.')
+
+Braille:
+
+ ⡌⠁⠧⠑ ⠼⠁⠒ ⡍⠜⠇⠑⠹⠰⠎ ⡣⠕⠌
+
+ ⡍⠜⠇⠑⠹ ⠺⠁⠎ ⠙⠑⠁⠙⠒ ⠞⠕ ⠃⠑⠛⠔ ⠺⠊⠹⠲ ⡹⠻⠑ ⠊⠎ ⠝⠕ ⠙⠳⠃⠞
+ ⠱⠁⠞⠑⠧⠻ ⠁⠃⠳⠞ ⠹⠁⠞⠲ ⡹⠑ ⠗⠑⠛⠊⠌⠻ ⠕⠋ ⠙⠊⠎ ⠃⠥⠗⠊⠁⠇ ⠺⠁⠎
+ ⠎⠊⠛⠝⠫ ⠃⠹ ⠹⠑ ⠊⠇⠻⠛⠹⠍⠁⠝⠂ ⠹⠑ ⠊⠇⠻⠅⠂ ⠹⠑ ⠥⠝⠙⠻⠞⠁⠅⠻⠂
+ ⠁⠝⠙ ⠹⠑ ⠡⠊⠑⠋ ⠍⠳⠗⠝⠻⠲ ⡎⠊⠗⠕⠕⠛⠑ ⠎⠊⠛⠝⠫ ⠊⠞⠲ ⡁⠝⠙
+ ⡎⠊⠗⠕⠕⠛⠑⠰⠎ ⠝⠁⠍⠑ ⠺⠁⠎ ⠛⠕⠕⠙ ⠥⠏⠕⠝ ⠰⡡⠁⠝⠛⠑⠂ ⠋⠕⠗ ⠁⠝⠹⠹⠔⠛ ⠙⠑
+ ⠡⠕⠎⠑ ⠞⠕ ⠏⠥⠞ ⠙⠊⠎ ⠙⠁⠝⠙ ⠞⠕⠲
+
+ ⡕⠇⠙ ⡍⠜⠇⠑⠹ ⠺⠁⠎ ⠁⠎ ⠙⠑⠁⠙ ⠁⠎ ⠁ ⠙⠕⠕⠗⠤⠝⠁⠊⠇⠲
+
+ ⡍⠔⠙⠖ ⡊ ⠙⠕⠝⠰⠞ ⠍⠑⠁⠝ ⠞⠕ ⠎⠁⠹ ⠹⠁⠞ ⡊ ⠅⠝⠪⠂ ⠕⠋ ⠍⠹
+ ⠪⠝ ⠅⠝⠪⠇⠫⠛⠑⠂ ⠱⠁⠞ ⠹⠻⠑ ⠊⠎ ⠏⠜⠞⠊⠊⠥⠇⠜⠇⠹ ⠙⠑⠁⠙ ⠁⠃⠳⠞
+ ⠁ ⠙⠕⠕⠗⠤⠝⠁⠊⠇⠲ ⡊ ⠍⠊⠣⠞ ⠙⠁⠧⠑ ⠃⠑⠲ ⠔⠊⠇⠔⠫⠂ ⠍⠹⠎⠑⠇⠋⠂ ⠞⠕
+ ⠗⠑⠛⠜⠙ ⠁ ⠊⠕⠋⠋⠔⠤⠝⠁⠊⠇ ⠁⠎ ⠹⠑ ⠙⠑⠁⠙⠑⠌ ⠏⠊⠑⠊⠑ ⠕⠋ ⠊⠗⠕⠝⠍⠕⠝⠛⠻⠹
+ ⠔ ⠹⠑ ⠞⠗⠁⠙⠑⠲ ⡃⠥⠞ ⠹⠑ ⠺⠊⠎⠙⠕⠍ ⠕⠋ ⠳⠗ ⠁⠝⠊⠑⠌⠕⠗⠎
+ ⠊⠎ ⠔ ⠹⠑ ⠎⠊⠍⠊⠇⠑⠆ ⠁⠝⠙ ⠍⠹ ⠥⠝⠙⠁⠇⠇⠪⠫ ⠙⠁⠝⠙⠎
+ ⠩⠁⠇⠇ ⠝⠕⠞ ⠙⠊⠌⠥⠗⠃ ⠊⠞⠂ ⠕⠗ ⠹⠑ ⡊⠳⠝⠞⠗⠹⠰⠎ ⠙⠕⠝⠑ ⠋⠕⠗⠲ ⡹⠳
+ ⠺⠊⠇⠇ ⠹⠻⠑⠋⠕⠗⠑ ⠏⠻⠍⠊⠞ ⠍⠑ ⠞⠕ ⠗⠑⠏⠑⠁⠞⠂ ⠑⠍⠏⠙⠁⠞⠊⠊⠁⠇⠇⠹⠂ ⠹⠁⠞
+ ⡍⠜⠇⠑⠹ ⠺⠁⠎ ⠁⠎ ⠙⠑⠁⠙ ⠁⠎ ⠁ ⠙⠕⠕⠗⠤⠝⠁⠊⠇⠲
+
+ (The first couple of paragraphs of "A Christmas Carol" by Dickens)
+
+Compact font selection example text:
+
+ ABCDEFGHIJKLMNOPQRSTUVWXYZ /0123456789
+ abcdefghijklmnopqrstuvwxyz £©µÀÆÖÞßéöÿ
+ –—‘“”„†•…‰™œŠŸž€ ΑΒΓΔΩαβγδω АБВГДабвгд
+ ∀∂∈ℝ∧∪≡∞ ↑↗↨↻⇣ ┐┼╔╘░►☺♀ fi�⑀₂ἠḂӥẄɐː⍎אԱა
+
+Greetings in various languages:
+
+ Hello world, Καλημέρα κόσμε, コンニチハ
+
+Box drawing alignment tests: █
+ ▉
+ ╔══╦══╗ ┌──┬──┐ ╭──┬──╮ ╭──┬──╮ ┏━━┳━━┓ ┎┒┏┑ ╷ ╻ ┏┯┓ ┌┰┐ ▊ ╱╲╱╲╳╳╳
+ ║┌─╨─┐║ │╔═╧═╗│ │╒═╪═╕│ │╓─╁─╖│ ┃┌─╂─┐┃ ┗╃╄┙ ╶┼╴╺╋╸┠┼┨ ┝╋┥ ▋ ╲╱╲╱╳╳╳
+ ║│╲ ╱│║ │║ ║│ ││ │ ││ │║ ┃ ║│ ┃│ ╿ │┃ ┍╅╆┓ ╵ ╹ ┗┷┛ └┸┘ ▌ ╱╲╱╲╳╳╳
+ ╠╡ ╳ ╞╣ ├╢ ╟┤ ├┼─┼─┼┤ ├╫─╂─╫┤ ┣┿╾┼╼┿┫ ┕┛┖┚ ┌┄┄┐ ╎ ┏┅┅┓ ┋ ▍ ╲╱╲╱╳╳╳
+ ║│╱ ╲│║ │║ ║│ ││ │ ││ │║ ┃ ║│ ┃│ ╽ │┃ ░░▒▒▓▓██ ┊ ┆ ╎ ╏ ┇ ┋ ▎
+ ║└─╥─┘║ │╚═╤═╝│ │╘═╪═╛│ │╙─╀─╜│ ┃└─╂─┘┃ ░░▒▒▓▓██ ┊ ┆ ╎ ╏ ┇ ┋ ▏
+ ╚══╩══╝ └──┴──┘ ╰──┴──╯ ╰──┴──╯ ┗━━┻━━┛ ▗▄▖▛▀▜ └╌╌┘ ╎ ┗╍╍┛ ┋ ▁▂▃▄▅▆▇█
+ ▝▀▘▙▄▟
+Sentences that contain all letters commonly used in a language
+--------------------------------------------------------------
+
+Markus Kuhn -- 2001-09-02
+
+This file is UTF-8 encoded.
+
+
+Danish (da)
+---------
+
+ Quizdeltagerne spiste jordbær med fløde, mens cirkusklovnen
+ Wolther spillede på xylofon.
+ (= Quiz contestants were eating strawbery with cream while Wolther
+ the circus clown played on xylophone.)
+
+German (de)
+-----------
+
+ Falsches Üben von Xylophonmusik quält jeden größeren Zwerg
+ (= Wrongful practicing of xylophone music tortures every larger dwarf)
+
+ Zwölf Boxkämpfer jagten Eva quer über den Sylter Deich
+ (= Twelve boxing fighters hunted Eva across the dike of Sylt)
+
+ Heizölrückstoßabdämpfung
+ (= fuel oil recoil absorber)
+ (jqvwxy missing, but all non-ASCII letters in one word)
+
+English (en)
+------------
+
+ The quick brown fox jumps over the lazy dog
+
+Spanish (es)
+------------
+
+ El pingüino Wenceslao hizo kilómetros bajo exhaustiva lluvia y
+ frío, añoraba a su querido cachorro.
+ (Contains every letter and every accent, but not every combination
+ of vowel + acute.)
+
+French (fr)
+-----------
+
+ Portez ce vieux whisky au juge blond qui fume sur son île intérieure, à
+ côté de l'alcôve ovoïde, où les bûches se consument dans l'âtre, ce
+ qui lui permet de penser à la cænogenèse de l'être dont il est question
+ dans la cause ambiguë entendue à Moÿ, dans un capharnaüm qui,
+ pense-t-il, diminue çà et là la qualité de son œuvre.
+
+ l'île exiguë
+ Où l'obèse jury mûr
+ Fête l'haï volapük,
+ Âne ex aéquo au whist,
+ Ôtez ce vœu déçu.
+
+ Le cœur déçu mais l'âme plutôt naïve, Louÿs rêva de crapaüter en
+ canoë au delà des îles, près du mälström où brûlent les novæ.
+
+Irish Gaelic (ga)
+-----------------
+
+ D'fhuascail Íosa, Úrmhac na hÓighe Beannaithe, pór Éava agus Ádhaimh
+
+Hungarian (hu)
+--------------
+
+ Árvíztűrő tükörfúrógép
+ (= flood-proof mirror-drilling machine, only all non-ASCII letters)
+
+Icelandic (is)
+--------------
+
+ Kæmi ný öxi hér ykist þjófum nú bæði víl og ádrepa
+
+ Sævör grét áðan því úlpan var ónýt
+ (some ASCII letters missing)
+
+Japanese (jp)
+-------------
+
+ Hiragana: (Iroha)
+
+ いろはにほへとちりぬるを
+ わかよたれそつねならむ
+ うゐのおくやまけふこえて
+ あさきゆめみしゑひもせす
+
+ Katakana:
+
+ イロハニホヘト チリヌルヲ ワカヨタレソ ツネナラム
+ ウヰノオクヤマ ケフコエテ アサキユメミシ ヱヒモセスン
+
+Hebrew (iw)
+-----------
+
+ ? דג סקרן שט בים מאוכזב ולפתע מצא לו חברה איך הקליטה
+
+Polish (pl)
+-----------
+
+ Pchnąć w tę łódź jeża lub ośm skrzyń fig
+ (= To push a hedgehog or eight bins of figs in this boat)
+
+Russian (ru)
+------------
+
+ В чащах юга жил бы цитрус? Да, но фальшивый экземпляр!
+ (= Would a citrus live in the bushes of south? Yes, but only a fake one!)
+
+Thai (th)
+---------
+
+ [--------------------------|------------------------]
+ ๏ เป็นมนุษย์สุดประเสริฐเลิศคุณค่า กว่าบรรดาฝูงสัตว์เดรัจฉาน
+ จงฝ่าฟันพัฒนาวิชาการ อย่าล้างผลาญฤๅเข่นฆ่าบีฑาใคร
+ ไม่ถือโทษโกรธแช่งซัดฮึดฮัดด่า หัดอภัยเหมือนกีฬาอัชฌาสัย
+ ปฏิบัติประพฤติกฎกำหนดใจ พูดจาให้จ๊ะๆ จ๋าๆ น่าฟังเอย ฯ
+
+ [The copyright for the Thai example is owned by The Computer
+ Association of Thailand under the Royal Patronage of His Majesty the
+ King.]
+
+Please let me know if you find others! Special thanks to the people
+from all over the world who contributed these sentences.
+? *Unicode Transcriptions* Notes <#Notes>
+
+Glyphs | Samples
+ | Charts
+ | UTF
+ | Forms
+ |
+Home .
+
+
+Name Text Image
+Arabic (Arabic) يونِكود ?
+Arabic (Persian) یونیکُد / ?/
+Armenian Յունիկօդ
+Bengali য়ূনিকোড
+Bopomofo ㄊㄨㄥ˅ ㄧˋ ㄇㄚ˅
+ㄨㄢˋ ㄍㄨㄛˊ ㄇㄚ˅
+Braille
+Buhid
+Canadian Aboriginal ᔫᗂᑰᑦ
+Cherokee ᏳᏂᎪᏛ
+Cypriot
+Cyrillic (Russian) Юникод ?
+Deseret (English) ???????
+Devanagari (Hindi) यूनिकोड ?
+Ethiopic ዩኒኮድ
+Georgian უნიკოდი ?
+Gothic
+Greek Γιούνικοντ
+Gujarati યૂનિકોડ
+Gurmukhi ਯੂਨਿਕੋਡ
+Han (Chinese) 统一码 ?
+統一碼 ?
+万国码 ?
+萬國碼 ?
+Hangul 유니코드
+Hanunoo
+Hebrew יוניקוד
+Hebrew (pointed) יוּנִיקוׁד
+Hebrew (Yiddish) יוניקאָד ?
+Hiragana (Japanese) ゆにこおど
+Katakana (Japanese) ユニコード ?
+Kannada ಯೂನಿಕೋಡ್
+Khmer យូនីគោដ
+Lao
+Latin Unicode Unicode
+Latin (IPA <#English_Pronunciation>) ˈjunɪˌkoːd ?
+Latin (Am. Dict. <#American_Dictionary>) Ūnĭcōde̽ ?
+Limbu
+Linear B
+Malayalam യൂനികോഡ്
+Mongolian
+Myanmar
+Ogham ᚔᚒᚅᚔᚉᚑᚇ / /
+Old Italic
+Oriya ୟୂନିକୋଡ
+Osmanya
+Runic (Anglo-Saxon) ᛡᚢᚾᛁᚳᚩᛞ
+Shavian
+Sinhala යණනිකෞද්
+Syriac ܝܘܢܝܩܘܕ
+Tagbanwa
+Tagalog
+Tai Le
+Tamil யூனிகோட்
+Telugu యూనికోడ్
+Thaana
+Thai ยูนืโคด
+Tibetan (Dzongkha) ཨུ་ནི་ཀོཌྲ།
+Ugaritic
+Yi
+
+
+ Notes:
+
+There are different ways to transcribe the word “Unicode”, depending on
+the language and script. In some cases there is only one language that
+customarily uses a given script; in others there are many languages. The
+goal here is at a minimum to collect at least one transcription for each
+script in a language customarily written in that script, with more
+languages if possible. If the transcription is the same for multiple
+languages in a script, then a single representative language is used.
+
+Still missing are transcriptions for the items above in RED (in at least
+one language). I would appreciate any other transcriptions, or
+corrections for the ones listed here. Send to mark3@macchiato.com
+, using the directions below:
+
+ * *Supplying Missing Items*
+ o Most Latin-script languages will follow the spelling, and
+ change the pronunciation. For any that would not, it would
+ be good to have the alternate spelling.
+ o For non-Latin scripts the goal is to match the English
+ pronunciation — /*not*/ spelling. Above is the IPA <#IPA>
+ (in phonemic transcription) that should be matched as
+ closely as possible (without sounding affected in the target
+ language)
+ o Text would be best in either the UTF-8 text, or the code
+ points in hex HTML. E.g. either of the following:
+ + "Юникод"
+ + "Юникод"
+ + Note: for / supplementary characters/
+ ,
+ there should be one hex number per code point, not two
+ surrogates
+ :
+ # 𐀀 /*not*/ &xDC00;
+ o If you have a good font, I'd also appreciate a GIF. It
+ should be *96 x 24* bits, with the text centered, in black
+ on white (plus grays if smoothed).
+ * *Other Comments*
+ o Because some browsers won't handle the text, both text and
+ GIF image are supplied. If you can’t read the text columns,
+ see Display Problems
+ .
+ o The Chinese versions (inc. Bopomofo) are translations, not
+ transcriptions, since "transcription in Chinese is pretty
+ lame" [J. Becker].
+ o There are other "translations" of Unicode that may be in
+ use, such as the Vietnamese "Thống Nhất Mã".
+ o For sample pages in different languages on the Unicode site,
+ see What is Unicode?
+
+ o Americans are not generally used to IPA, and find a variety
+ of different systems in their dictionaries. This one leaves
+ the base letters as they are, and uses diacritics for
+ pronunciation.
+ * *Etymology of /Unicode/*
+ o Coined by J. Becker. Not related to previous usages, such as:
+ + A telegraphic code in which one word or set of letters
+ represents a sentence or phrase; a telegram or message
+ in this. (late 19th century, OED)
+ o According to my references, the prefix "uni" is directly
+ from Latin while the word "code" is through French.
+ o The original Indo-European apparently would have been
+ *oino-kau-do ("one strike give"): *kau apparently being
+ related to such English words as: hew, haggle, hoe, hag,
+ hay, hack, caudad, caudal, caudate, caudex, coda, codex,
+ codicil, coward, incus, and Kovač (personal name: "smith").
+ + I will leave the exact derivations to the exegetes,
+ but I like the association with "haggle" myself.
+ * *Contributions*
+ o This draws on contributions or comments from:
+ + Dixon Au
+ + Joe Becker
+ + Maurice Bauhahn
+ + Abel Cheung
+ + Peter Constable
+ + Michael Everson
+ + Christopher John Fynn
+ + Michael Kaplan
+ + George Kiraz
+ + Abdul Malik
+ + Siva Nataraja
+ + Roozbeh Pournader
+ + Jonathan Rosenne
+ + Jungshik Shin
+
+------------------------------------------------------------------------
+
+
+Terms of Use . Last updated:
+MED - 04/20/2003 15:30:33.
+
+
+
+
+
+UTF-8 encoded sample plain-text file
+‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾
+
+Markus Kuhn [ˈmaʳkʊs kuːn] — 2002-07-25
+
+
+The ASCII compatible UTF-8 encoding used in this plain-text file
+is defined in Unicode, ISO 10646-1, and RFC 2279.
+
+
+Using Unicode/UTF-8, you can write in emails and source code things such as
+
+Mathematics and sciences:
+
+ ∮ E⋅da = Q, n → ∞, ∑ f(i) = ∏ g(i), ⎧⎡⎛┌─────┐⎞⎤⎫
+ ⎪⎢⎜│a²+b³ ⎟⎥⎪
+ ∀x∈ℝ: ⌈x⌉ = −⌊−x⌋, α ∧ ¬β = ¬(¬α ∨ β), ⎪⎢⎜│───── ⎟⎥⎪
+ ⎪⎢⎜⎷ c₈ ⎟⎥⎪
+ ℕ ⊆ ℕ₀ ⊂ ℤ ⊂ ℚ ⊂ ℝ ⊂ ℂ, ⎨⎢⎜ ⎟⎥⎬
+ ⎪⎢⎜ ∞ ⎟⎥⎪
+ ⊥ < a ≠ b ≡ c ≤ d ≪ ⊤ ⇒ (⟦A⟧ ⇔ ⟪B⟫), ⎪⎢⎜ ⎲ ⎟⎥⎪
+ ⎪⎢⎜ ⎳aⁱ-bⁱ⎟⎥⎪
+ 2H₂ + O₂ ⇌ 2H₂O, R = 4.7 kΩ, ⌀ 200 mm ⎩⎣⎝i=1 ⎠⎦⎭
+
+Linguistics and dictionaries:
+
+ ði ıntəˈnæʃənəl fəˈnɛtık əsoʊsiˈeıʃn
+ Y [ˈʏpsilɔn], Yen [jɛn], Yoga [ˈjoːgɑ]
+
+APL:
+
+ ((V⍳V)=⍳⍴V)/V←,V ⌷←⍳→⍴∆∇⊃‾⍎⍕⌈
+
+Nicer typography in plain text files:
+
+ ╔══════════════════════════════════════════╗
+ ║ ║
+ ║ • ‘single’ and “double” quotes ║
+ ║ ║
+ ║ • Curly apostrophes: “We’ve been here” ║
+ ║ ║
+ ║ • Latin-1 apostrophe and accents: '´` ║
+ ║ ║
+ ║ • ‚deutsche‘ „Anführungszeichen“ ║
+ ║ ║
+ ║ • †, ‡, ‰, •, 3–4, —, −5/+5, ™, … ║
+ ║ ║
+ ║ • ASCII safety test: 1lI|, 0OD, 8B ║
+ ║ ╭─────────╮ ║
+ ║ • the euro symbol: │ 14.95 € │ ║
+ ║ ╰─────────╯ ║
+ ╚══════════════════════════════════════════╝
+
+Combining characters:
+
+ STARGΛ̊TE SG-1, a = v̇ = r̈, a⃑ ⊥ b⃑
+
+Greek (in Polytonic):
+
+ The Greek anthem:
+
+ Σὲ γνωρίζω ἀπὸ τὴν κόψη
+ τοῦ σπαθιοῦ τὴν τρομερή,
+ σὲ γνωρίζω ἀπὸ τὴν ὄψη
+ ποὺ μὲ βία μετράει τὴ γῆ.
+
+ ᾿Απ᾿ τὰ κόκκαλα βγαλμένη
+ τῶν ῾Ελλήνων τὰ ἱερά
+ καὶ σὰν πρῶτα ἀνδρειωμένη
+ χαῖρε, ὦ χαῖρε, ᾿Ελευθεριά!
+
+ From a speech of Demosthenes in the 4th century BC:
+
+ Οὐχὶ ταὐτὰ παρίσταταί μοι γιγνώσκειν, ὦ ἄνδρες ᾿Αθηναῖοι,
+ ὅταν τ᾿ εἰς τὰ πράγματα ἀποβλέψω καὶ ὅταν πρὸς τοὺς
+ λόγους οὓς ἀκούω· τοὺς μὲν γὰρ λόγους περὶ τοῦ
+ τιμωρήσασθαι Φίλιππον ὁρῶ γιγνομένους, τὰ δὲ πράγματ᾿
+ εἰς τοῦτο προήκοντα, ὥσθ᾿ ὅπως μὴ πεισόμεθ᾿ αὐτοὶ
+ πρότερον κακῶς σκέψασθαι δέον. οὐδέν οὖν ἄλλο μοι δοκοῦσιν
+ οἱ τὰ τοιαῦτα λέγοντες ἢ τὴν ὑπόθεσιν, περὶ ἧς βουλεύεσθαι,
+ οὐχὶ τὴν οὖσαν παριστάντες ὑμῖν ἁμαρτάνειν. ἐγὼ δέ, ὅτι μέν
+ ποτ᾿ ἐξῆν τῇ πόλει καὶ τὰ αὑτῆς ἔχειν ἀσφαλῶς καὶ Φίλιππον
+ τιμωρήσασθαι, καὶ μάλ᾿ ἀκριβῶς οἶδα· ἐπ᾿ ἐμοῦ γάρ, οὐ πάλαι
+ γέγονεν ταῦτ᾿ ἀμφότερα· νῦν μέντοι πέπεισμαι τοῦθ᾿ ἱκανὸν
+ προλαβεῖν ἡμῖν εἶναι τὴν πρώτην, ὅπως τοὺς συμμάχους
+ σώσομεν. ἐὰν γὰρ τοῦτο βεβαίως ὑπάρξῃ, τότε καὶ περὶ τοῦ
+ τίνα τιμωρήσεταί τις καὶ ὃν τρόπον ἐξέσται σκοπεῖν· πρὶν δὲ
+ τὴν ἀρχὴν ὀρθῶς ὑποθέσθαι, μάταιον ἡγοῦμαι περὶ τῆς
+ τελευτῆς ὁντινοῦν ποιεῖσθαι λόγον.
+
+ Δημοσθένους, Γ´ ᾿Ολυνθιακὸς
+
+Georgian:
+
+ From a Unicode conference invitation:
+
+ გთხოვთ ახლავე გაიაროთ რეგისტრაცია Unicode-ის მეათე საერთაშორისო
+ კონფერენციაზე დასასწრებად, რომელიც გაიმართება 10-12 მარტს,
+ ქ. მაინცში, გერმანიაში. კონფერენცია შეჰკრებს ერთად მსოფლიოს
+ ექსპერტებს ისეთ დარგებში როგორიცაა ინტერნეტი და Unicode-ი,
+ ინტერნაციონალიზაცია და ლოკალიზაცია, Unicode-ის გამოყენება
+ ოპერაციულ სისტემებსა, და გამოყენებით პროგრამებში, შრიფტებში,
+ ტექსტების დამუშავებასა და მრავალენოვან კომპიუტერულ სისტემებში.
+
+Russian:
+
+ From a Unicode conference invitation:
+
+ Зарегистрируйтесь сейчас на Десятую Международную Конференцию по
+ Unicode, которая состоится 10-12 марта 1997 года в Майнце в Германии.
+ Конференция соберет широкий круг экспертов по вопросам глобального
+ Интернета и Unicode, локализации и интернационализации, воплощению и
+ применению Unicode в различных операционных системах и программных
+ приложениях, шрифтах, верстке и многоязычных компьютерных системах.
+
+Thai (UCS Level 2):
+
+ Excerpt from a poetry on The Romance of The Three Kingdoms (a Chinese
+ classic 'San Gua'):
+
+ [----------------------------|------------------------]
+ ๏ แผ่นดินฮั่นเสื่อมโทรมแสนสังเวช พระปกเกศกองบู๊กู้ขึ้นใหม่
+ สิบสองกษัตริย์ก่อนหน้าแลถัดไป สององค์ไซร้โง่เขลาเบาปัญญา
+ ทรงนับถือขันทีเป็นที่พึ่ง บ้านเมืองจึงวิปริตเป็นนักหนา
+ โฮจิ๋นเรียกทัพทั่วหัวเมืองมา หมายจะฆ่ามดชั่วตัวสำคัญ
+ เหมือนขับไสไล่เสือจากเคหา รับหมาป่าเข้ามาเลยอาสัญ
+ ฝ่ายอ้องอุ้นยุแยกให้แตกกัน ใช้สาวนั้นเป็นชนวนชื่นชวนใจ
+ พลันลิฉุยกุยกีกลับก่อเหตุ ช่างอาเพศจริงหนาฟ้าร้องไห้
+ ต้องรบราฆ่าฟันจนบรรลัย ฤๅหาใครค้ำชูกู้บรรลังก์ ฯ
+
+ (The above is a two-column text. If combining characters are handled
+ correctly, the lines of the second column should be aligned with the
+ | character above.)
+
+Ethiopian:
+
+ Proverbs in the Amharic language:
+
+ ሰማይ አይታረስ ንጉሥ አይከሰስ።
+ ብላ ካለኝ እንደአባቴ በቆመጠኝ።
+ ጌጥ ያለቤቱ ቁምጥና ነው።
+ ደሀ በሕልሙ ቅቤ ባይጠጣ ንጣት በገደለው።
+ የአፍ ወለምታ በቅቤ አይታሽም።
+ አይጥ በበላ ዳዋ ተመታ።
+ ሲተረጉሙ ይደረግሙ።
+ ቀስ በቀስ፥ ዕንቁላል በእግሩ ይሄዳል።
+ ድር ቢያብር አንበሳ ያስር።
+ ሰው እንደቤቱ እንጅ እንደ ጉረቤቱ አይተዳደርም።
+ እግዜር የከፈተውን ጉሮሮ ሳይዘጋው አይድርም።
+ የጎረቤት ሌባ፥ ቢያዩት ይስቅ ባያዩት ያጠልቅ።
+ ሥራ ከመፍታት ልጄን ላፋታት።
+ ዓባይ ማደሪያ የለው፥ ግንድ ይዞ ይዞራል።
+ የእስላም አገሩ መካ የአሞራ አገሩ ዋርካ።
+ ተንጋሎ ቢተፉ ተመልሶ ባፉ።
+ ወዳጅህ ማር ቢሆን ጨርስህ አትላሰው።
+ እግርህን በፍራሽህ ልክ ዘርጋ።
+
+Runes:
+
+ ᚻᛖ ᚳᚹᚫᚦ ᚦᚫᛏ ᚻᛖ ᛒᚢᛞᛖ ᚩᚾ ᚦᚫᛗ ᛚᚪᚾᛞᛖ ᚾᚩᚱᚦᚹᛖᚪᚱᛞᚢᛗ ᚹᛁᚦ ᚦᚪ ᚹᛖᛥᚫ
+
+ (Old English, which transcribed into Latin reads 'He cwaeth that he
+ bude thaem lande northweardum with tha Westsae.' and means 'He said
+ that he lived in the northern land near the Western Sea.')
+
+Braille:
+
+ ⡌⠁⠧⠑ ⠼⠁⠒ ⡍⠜⠇⠑⠹⠰⠎ ⡣⠕⠌
+
+ ⡍⠜⠇⠑⠹ ⠺⠁⠎ ⠙⠑⠁⠙⠒ ⠞⠕ ⠃⠑⠛⠔ ⠺⠊⠹⠲ ⡹⠻⠑ ⠊⠎ ⠝⠕ ⠙⠳⠃⠞
+ ⠱⠁⠞⠑⠧⠻ ⠁⠃⠳⠞ ⠹⠁⠞⠲ ⡹⠑ ⠗⠑⠛⠊⠌⠻ ⠕⠋ ⠙⠊⠎ ⠃⠥⠗⠊⠁⠇ ⠺⠁⠎
+ ⠎⠊⠛⠝⠫ ⠃⠹ ⠹⠑ ⠊⠇⠻⠛⠹⠍⠁⠝⠂ ⠹⠑ ⠊⠇⠻⠅⠂ ⠹⠑ ⠥⠝⠙⠻⠞⠁⠅⠻⠂
+ ⠁⠝⠙ ⠹⠑ ⠡⠊⠑⠋ ⠍⠳⠗⠝⠻⠲ ⡎⠊⠗⠕⠕⠛⠑ ⠎⠊⠛⠝⠫ ⠊⠞⠲ ⡁⠝⠙
+ ⡎⠊⠗⠕⠕⠛⠑⠰⠎ ⠝⠁⠍⠑ ⠺⠁⠎ ⠛⠕⠕⠙ ⠥⠏⠕⠝ ⠰⡡⠁⠝⠛⠑⠂ ⠋⠕⠗ ⠁⠝⠹⠹⠔⠛ ⠙⠑
+ ⠡⠕⠎⠑ ⠞⠕ ⠏⠥⠞ ⠙⠊⠎ ⠙⠁⠝⠙ ⠞⠕⠲
+
+ ⡕⠇⠙ ⡍⠜⠇⠑⠹ ⠺⠁⠎ ⠁⠎ ⠙⠑⠁⠙ ⠁⠎ ⠁ ⠙⠕⠕⠗⠤⠝⠁⠊⠇⠲
+
+ ⡍⠔⠙⠖ ⡊ ⠙⠕⠝⠰⠞ ⠍⠑⠁⠝ ⠞⠕ ⠎⠁⠹ ⠹⠁⠞ ⡊ ⠅⠝⠪⠂ ⠕⠋ ⠍⠹
+ ⠪⠝ ⠅⠝⠪⠇⠫⠛⠑⠂ ⠱⠁⠞ ⠹⠻⠑ ⠊⠎ ⠏⠜⠞⠊⠊⠥⠇⠜⠇⠹ ⠙⠑⠁⠙ ⠁⠃⠳⠞
+ ⠁ ⠙⠕⠕⠗⠤⠝⠁⠊⠇⠲ ⡊ ⠍⠊⠣⠞ ⠙⠁⠧⠑ ⠃⠑⠲ ⠔⠊⠇⠔⠫⠂ ⠍⠹⠎⠑⠇⠋⠂ ⠞⠕
+ ⠗⠑⠛⠜⠙ ⠁ ⠊⠕⠋⠋⠔⠤⠝⠁⠊⠇ ⠁⠎ ⠹⠑ ⠙⠑⠁⠙⠑⠌ ⠏⠊⠑⠊⠑ ⠕⠋ ⠊⠗⠕⠝⠍⠕⠝⠛⠻⠹
+ ⠔ ⠹⠑ ⠞⠗⠁⠙⠑⠲ ⡃⠥⠞ ⠹⠑ ⠺⠊⠎⠙⠕⠍ ⠕⠋ ⠳⠗ ⠁⠝⠊⠑⠌⠕⠗⠎
+ ⠊⠎ ⠔ ⠹⠑ ⠎⠊⠍⠊⠇⠑⠆ ⠁⠝⠙ ⠍⠹ ⠥⠝⠙⠁⠇⠇⠪⠫ ⠙⠁⠝⠙⠎
+ ⠩⠁⠇⠇ ⠝⠕⠞ ⠙⠊⠌⠥⠗⠃ ⠊⠞⠂ ⠕⠗ ⠹⠑ ⡊⠳⠝⠞⠗⠹⠰⠎ ⠙⠕⠝⠑ ⠋⠕⠗⠲ ⡹⠳
+ ⠺⠊⠇⠇ ⠹⠻⠑⠋⠕⠗⠑ ⠏⠻⠍⠊⠞ ⠍⠑ ⠞⠕ ⠗⠑⠏⠑⠁⠞⠂ ⠑⠍⠏⠙⠁⠞⠊⠊⠁⠇⠇⠹⠂ ⠹⠁⠞
+ ⡍⠜⠇⠑⠹ ⠺⠁⠎ ⠁⠎ ⠙⠑⠁⠙ ⠁⠎ ⠁ ⠙⠕⠕⠗⠤⠝⠁⠊⠇⠲
+
+ (The first couple of paragraphs of "A Christmas Carol" by Dickens)
+
+Compact font selection example text:
+
+ ABCDEFGHIJKLMNOPQRSTUVWXYZ /0123456789
+ abcdefghijklmnopqrstuvwxyz £©µÀÆÖÞßéöÿ
+ –—‘“”„†•…‰™œŠŸž€ ΑΒΓΔΩαβγδω АБВГДабвгд
+ ∀∂∈ℝ∧∪≡∞ ↑↗↨↻⇣ ┐┼╔╘░►☺♀ fi�⑀₂ἠḂӥẄɐː⍎אԱა
+
+Greetings in various languages:
+
+ Hello world, Καλημέρα κόσμε, コンニチハ
+
+Box drawing alignment tests: █
+ ▉
+ ╔══╦══╗ ┌──┬──┐ ╭──┬──╮ ╭──┬──╮ ┏━━┳━━┓ ┎┒┏┑ ╷ ╻ ┏┯┓ ┌┰┐ ▊ ╱╲╱╲╳╳╳
+ ║┌─╨─┐║ │╔═╧═╗│ │╒═╪═╕│ │╓─╁─╖│ ┃┌─╂─┐┃ ┗╃╄┙ ╶┼╴╺╋╸┠┼┨ ┝╋┥ ▋ ╲╱╲╱╳╳╳
+ ║│╲ ╱│║ │║ ║│ ││ │ ││ │║ ┃ ║│ ┃│ ╿ │┃ ┍╅╆┓ ╵ ╹ ┗┷┛ └┸┘ ▌ ╱╲╱╲╳╳╳
+ ╠╡ ╳ ╞╣ ├╢ ╟┤ ├┼─┼─┼┤ ├╫─╂─╫┤ ┣┿╾┼╼┿┫ ┕┛┖┚ ┌┄┄┐ ╎ ┏┅┅┓ ┋ ▍ ╲╱╲╱╳╳╳
+ ║│╱ ╲│║ │║ ║│ ││ │ ││ │║ ┃ ║│ ┃│ ╽ │┃ ░░▒▒▓▓██ ┊ ┆ ╎ ╏ ┇ ┋ ▎
+ ║└─╥─┘║ │╚═╤═╝│ │╘═╪═╛│ │╙─╀─╜│ ┃└─╂─┘┃ ░░▒▒▓▓██ ┊ ┆ ╎ ╏ ┇ ┋ ▏
+ ╚══╩══╝ └──┴──┘ ╰──┴──╯ ╰──┴──╯ ┗━━┻━━┛ ▗▄▖▛▀▜ └╌╌┘ ╎ ┗╍╍┛ ┋ ▁▂▃▄▅▆▇█
+ ▝▀▘▙▄▟
+Sentences that contain all letters commonly used in a language
+--------------------------------------------------------------
+
+Markus Kuhn -- 2001-09-02
+
+This file is UTF-8 encoded.
+
+
+Danish (da)
+---------
+
+ Quizdeltagerne spiste jordbær med fløde, mens cirkusklovnen
+ Wolther spillede på xylofon.
+ (= Quiz contestants were eating strawbery with cream while Wolther
+ the circus clown played on xylophone.)
+
+German (de)
+-----------
+
+ Falsches Üben von Xylophonmusik quält jeden größeren Zwerg
+ (= Wrongful practicing of xylophone music tortures every larger dwarf)
+
+ Zwölf Boxkämpfer jagten Eva quer über den Sylter Deich
+ (= Twelve boxing fighters hunted Eva across the dike of Sylt)
+
+ Heizölrückstoßabdämpfung
+ (= fuel oil recoil absorber)
+ (jqvwxy missing, but all non-ASCII letters in one word)
+
+English (en)
+------------
+
+ The quick brown fox jumps over the lazy dog
+
+Spanish (es)
+------------
+
+ El pingüino Wenceslao hizo kilómetros bajo exhaustiva lluvia y
+ frío, añoraba a su querido cachorro.
+ (Contains every letter and every accent, but not every combination
+ of vowel + acute.)
+
+French (fr)
+-----------
+
+ Portez ce vieux whisky au juge blond qui fume sur son île intérieure, à
+ côté de l'alcôve ovoïde, où les bûches se consument dans l'âtre, ce
+ qui lui permet de penser à la cænogenèse de l'être dont il est question
+ dans la cause ambiguë entendue à Moÿ, dans un capharnaüm qui,
+ pense-t-il, diminue çà et là la qualité de son œuvre.
+
+ l'île exiguë
+ Où l'obèse jury mûr
+ Fête l'haï volapük,
+ Âne ex aéquo au whist,
+ Ôtez ce vœu déçu.
+
+ Le cœur déçu mais l'âme plutôt naïve, Louÿs rêva de crapaüter en
+ canoë au delà des îles, près du mälström où brûlent les novæ.
+
+Irish Gaelic (ga)
+-----------------
+
+ D'fhuascail Íosa, Úrmhac na hÓighe Beannaithe, pór Éava agus Ádhaimh
+
+Hungarian (hu)
+--------------
+
+ Árvíztűrő tükörfúrógép
+ (= flood-proof mirror-drilling machine, only all non-ASCII letters)
+
+Icelandic (is)
+--------------
+
+ Kæmi ný öxi hér ykist þjófum nú bæði víl og ádrepa
+
+ Sævör grét áðan því úlpan var ónýt
+ (some ASCII letters missing)
+
+Japanese (jp)
+-------------
+
+ Hiragana: (Iroha)
+
+ いろはにほへとちりぬるを
+ わかよたれそつねならむ
+ うゐのおくやまけふこえて
+ あさきゆめみしゑひもせす
+
+ Katakana:
+
+ イロハニホヘト チリヌルヲ ワカヨタレソ ツネナラム
+ ウヰノオクヤマ ケフコエテ アサキユメミシ ヱヒモセスン
+
+Hebrew (iw)
+-----------
+
+ ? דג סקרן שט בים מאוכזב ולפתע מצא לו חברה איך הקליטה
+
+Polish (pl)
+-----------
+
+ Pchnąć w tę łódź jeża lub ośm skrzyń fig
+ (= To push a hedgehog or eight bins of figs in this boat)
+
+Russian (ru)
+------------
+
+ В чащах юга жил бы цитрус? Да, но фальшивый экземпляр!
+ (= Would a citrus live in the bushes of south? Yes, but only a fake one!)
+
+Thai (th)
+---------
+
+ [--------------------------|------------------------]
+ ๏ เป็นมนุษย์สุดประเสริฐเลิศคุณค่า กว่าบรรดาฝูงสัตว์เดรัจฉาน
+ จงฝ่าฟันพัฒนาวิชาการ อย่าล้างผลาญฤๅเข่นฆ่าบีฑาใคร
+ ไม่ถือโทษโกรธแช่งซัดฮึดฮัดด่า หัดอภัยเหมือนกีฬาอัชฌาสัย
+ ปฏิบัติประพฤติกฎกำหนดใจ พูดจาให้จ๊ะๆ จ๋าๆ น่าฟังเอย ฯ
+
+ [The copyright for the Thai example is owned by The Computer
+ Association of Thailand under the Royal Patronage of His Majesty the
+ King.]
+
+Please let me know if you find others! Special thanks to the people
+from all over the world who contributed these sentences.
+? *Unicode Transcriptions* Notes <#Notes>
+
+Glyphs | Samples
+ | Charts
+ | UTF
+ | Forms
+ |
+Home .
+
+
+Name Text Image
+Arabic (Arabic) يونِكود ?
+Arabic (Persian) یونیکُد / ?/
+Armenian Յունիկօդ
+Bengali য়ূনিকোড
+Bopomofo ㄊㄨㄥ˅ ㄧˋ ㄇㄚ˅
+ㄨㄢˋ ㄍㄨㄛˊ ㄇㄚ˅
+Braille
+Buhid
+Canadian Aboriginal ᔫᗂᑰᑦ
+Cherokee ᏳᏂᎪᏛ
+Cypriot
+Cyrillic (Russian) Юникод ?
+Deseret (English) ???????
+Devanagari (Hindi) यूनिकोड ?
+Ethiopic ዩኒኮድ
+Georgian უნიკოდი ?
+Gothic
+Greek Γιούνικοντ
+Gujarati યૂનિકોડ
+Gurmukhi ਯੂਨਿਕੋਡ
+Han (Chinese) 统一码 ?
+統一碼 ?
+万国码 ?
+萬國碼 ?
+Hangul 유니코드
+Hanunoo
+Hebrew יוניקוד
+Hebrew (pointed) יוּנִיקוׁד
+Hebrew (Yiddish) יוניקאָד ?
+Hiragana (Japanese) ゆにこおど
+Katakana (Japanese) ユニコード ?
+Kannada ಯೂನಿಕೋಡ್
+Khmer យូនីគោដ
+Lao
+Latin Unicode Unicode
+Latin (IPA <#English_Pronunciation>) ˈjunɪˌkoːd ?
+Latin (Am. Dict. <#American_Dictionary>) Ūnĭcōde̽ ?
+Limbu
+Linear B
+Malayalam യൂനികോഡ്
+Mongolian
+Myanmar
+Ogham ᚔᚒᚅᚔᚉᚑᚇ / /
+Old Italic
+Oriya ୟୂନିକୋଡ
+Osmanya
+Runic (Anglo-Saxon) ᛡᚢᚾᛁᚳᚩᛞ
+Shavian
+Sinhala යණනිකෞද්
+Syriac ܝܘܢܝܩܘܕ
+Tagbanwa
+Tagalog
+Tai Le
+Tamil யூனிகோட்
+Telugu యూనికోడ్
+Thaana
+Thai ยูนืโคด
+Tibetan (Dzongkha) ཨུ་ནི་ཀོཌྲ།
+Ugaritic
+Yi
+
+
+ Notes:
+
+There are different ways to transcribe the word “Unicode”, depending on
+the language and script. In some cases there is only one language that
+customarily uses a given script; in others there are many languages. The
+goal here is at a minimum to collect at least one transcription for each
+script in a language customarily written in that script, with more
+languages if possible. If the transcription is the same for multiple
+languages in a script, then a single representative language is used.
+
+Still missing are transcriptions for the items above in RED (in at least
+one language). I would appreciate any other transcriptions, or
+corrections for the ones listed here. Send to mark3@macchiato.com
+, using the directions below:
+
+ * *Supplying Missing Items*
+ o Most Latin-script languages will follow the spelling, and
+ change the pronunciation. For any that would not, it would
+ be good to have the alternate spelling.
+ o For non-Latin scripts the goal is to match the English
+ pronunciation — /*not*/ spelling. Above is the IPA <#IPA>
+ (in phonemic transcription) that should be matched as
+ closely as possible (without sounding affected in the target
+ language)
+ o Text would be best in either the UTF-8 text, or the code
+ points in hex HTML. E.g. either of the following:
+ + "Юникод"
+ + "Юникод"
+ + Note: for / supplementary characters/
+ ,
+ there should be one hex number per code point, not two
+ surrogates
+ :
+ # 𐀀 /*not*/ &xDC00;
+ o If you have a good font, I'd also appreciate a GIF. It
+ should be *96 x 24* bits, with the text centered, in black
+ on white (plus grays if smoothed).
+ * *Other Comments*
+ o Because some browsers won't handle the text, both text and
+ GIF image are supplied. If you can’t read the text columns,
+ see Display Problems
+ .
+ o The Chinese versions (inc. Bopomofo) are translations, not
+ transcriptions, since "transcription in Chinese is pretty
+ lame" [J. Becker].
+ o There are other "translations" of Unicode that may be in
+ use, such as the Vietnamese "Thống Nhất Mã".
+ o For sample pages in different languages on the Unicode site,
+ see What is Unicode?
+
+ o Americans are not generally used to IPA, and find a variety
+ of different systems in their dictionaries. This one leaves
+ the base letters as they are, and uses diacritics for
+ pronunciation.
+ * *Etymology of /Unicode/*
+ o Coined by J. Becker. Not related to previous usages, such as:
+ + A telegraphic code in which one word or set of letters
+ represents a sentence or phrase; a telegram or message
+ in this. (late 19th century, OED)
+ o According to my references, the prefix "uni" is directly
+ from Latin while the word "code" is through French.
+ o The original Indo-European apparently would have been
+ *oino-kau-do ("one strike give"): *kau apparently being
+ related to such English words as: hew, haggle, hoe, hag,
+ hay, hack, caudad, caudal, caudate, caudex, coda, codex,
+ codicil, coward, incus, and Kovač (personal name: "smith").
+ + I will leave the exact derivations to the exegetes,
+ but I like the association with "haggle" myself.
+ * *Contributions*
+ o This draws on contributions or comments from:
+ + Dixon Au
+ + Joe Becker
+ + Maurice Bauhahn
+ + Abel Cheung
+ + Peter Constable
+ + Michael Everson
+ + Christopher John Fynn
+ + Michael Kaplan
+ + George Kiraz
+ + Abdul Malik
+ + Siva Nataraja
+ + Roozbeh Pournader
+ + Jonathan Rosenne
+ + Jungshik Shin
+
+------------------------------------------------------------------------
+
+
+Terms of Use . Last updated:
+MED - 04/20/2003 15:30:33.
+
+
+
+
+
+UTF-8 encoded sample plain-text file
+‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾
+
+Markus Kuhn [ˈmaʳkʊs kuːn] — 2002-07-25
+
+
+The ASCII compatible UTF-8 encoding used in this plain-text file
+is defined in Unicode, ISO 10646-1, and RFC 2279.
+
+
+Using Unicode/UTF-8, you can write in emails and source code things such as
+
+Mathematics and sciences:
+
+ ∮ E⋅da = Q, n → ∞, ∑ f(i) = ∏ g(i), ⎧⎡⎛┌─────┐⎞⎤⎫
+ ⎪⎢⎜│a²+b³ ⎟⎥⎪
+ ∀x∈ℝ: ⌈x⌉ = −⌊−x⌋, α ∧ ¬β = ¬(¬α ∨ β), ⎪⎢⎜│───── ⎟⎥⎪
+ ⎪⎢⎜⎷ c₈ ⎟⎥⎪
+ ℕ ⊆ ℕ₀ ⊂ ℤ ⊂ ℚ ⊂ ℝ ⊂ ℂ, ⎨⎢⎜ ⎟⎥⎬
+ ⎪⎢⎜ ∞ ⎟⎥⎪
+ ⊥ < a ≠ b ≡ c ≤ d ≪ ⊤ ⇒ (⟦A⟧ ⇔ ⟪B⟫), ⎪⎢⎜ ⎲ ⎟⎥⎪
+ ⎪⎢⎜ ⎳aⁱ-bⁱ⎟⎥⎪
+ 2H₂ + O₂ ⇌ 2H₂O, R = 4.7 kΩ, ⌀ 200 mm ⎩⎣⎝i=1 ⎠⎦⎭
+
+Linguistics and dictionaries:
+
+ ði ıntəˈnæʃənəl fəˈnɛtık əsoʊsiˈeıʃn
+ Y [ˈʏpsilɔn], Yen [jɛn], Yoga [ˈjoːgɑ]
+
+APL:
+
+ ((V⍳V)=⍳⍴V)/V←,V ⌷←⍳→⍴∆∇⊃‾⍎⍕⌈
+
+Nicer typography in plain text files:
+
+ ╔══════════════════════════════════════════╗
+ ║ ║
+ ║ • ‘single’ and “double” quotes ║
+ ║ ║
+ ║ • Curly apostrophes: “We’ve been here” ║
+ ║ ║
+ ║ • Latin-1 apostrophe and accents: '´` ║
+ ║ ║
+ ║ • ‚deutsche‘ „Anführungszeichen“ ║
+ ║ ║
+ ║ • †, ‡, ‰, •, 3–4, —, −5/+5, ™, … ║
+ ║ ║
+ ║ • ASCII safety test: 1lI|, 0OD, 8B ║
+ ║ ╭─────────╮ ║
+ ║ • the euro symbol: │ 14.95 € │ ║
+ ║ ╰─────────╯ ║
+ ╚══════════════════════════════════════════╝
+
+Combining characters:
+
+ STARGΛ̊TE SG-1, a = v̇ = r̈, a⃑ ⊥ b⃑
+
+Greek (in Polytonic):
+
+ The Greek anthem:
+
+ Σὲ γνωρίζω ἀπὸ τὴν κόψη
+ τοῦ σπαθιοῦ τὴν τρομερή,
+ σὲ γνωρίζω ἀπὸ τὴν ὄψη
+ ποὺ μὲ βία μετράει τὴ γῆ.
+
+ ᾿Απ᾿ τὰ κόκκαλα βγαλμένη
+ τῶν ῾Ελλήνων τὰ ἱερά
+ καὶ σὰν πρῶτα ἀνδρειωμένη
+ χαῖρε, ὦ χαῖρε, ᾿Ελευθεριά!
+
+ From a speech of Demosthenes in the 4th century BC:
+
+ Οὐχὶ ταὐτὰ παρίσταταί μοι γιγνώσκειν, ὦ ἄνδρες ᾿Αθηναῖοι,
+ ὅταν τ᾿ εἰς τὰ πράγματα ἀποβλέψω καὶ ὅταν πρὸς τοὺς
+ λόγους οὓς ἀκούω· τοὺς μὲν γὰρ λόγους περὶ τοῦ
+ τιμωρήσασθαι Φίλιππον ὁρῶ γιγνομένους, τὰ δὲ πράγματ᾿
+ εἰς τοῦτο προήκοντα, ὥσθ᾿ ὅπως μὴ πεισόμεθ᾿ αὐτοὶ
+ πρότερον κακῶς σκέψασθαι δέον. οὐδέν οὖν ἄλλο μοι δοκοῦσιν
+ οἱ τὰ τοιαῦτα λέγοντες ἢ τὴν ὑπόθεσιν, περὶ ἧς βουλεύεσθαι,
+ οὐχὶ τὴν οὖσαν παριστάντες ὑμῖν ἁμαρτάνειν. ἐγὼ δέ, ὅτι μέν
+ ποτ᾿ ἐξῆν τῇ πόλει καὶ τὰ αὑτῆς ἔχειν ἀσφαλῶς καὶ Φίλιππον
+ τιμωρήσασθαι, καὶ μάλ᾿ ἀκριβῶς οἶδα· ἐπ᾿ ἐμοῦ γάρ, οὐ πάλαι
+ γέγονεν ταῦτ᾿ ἀμφότερα· νῦν μέντοι πέπεισμαι τοῦθ᾿ ἱκανὸν
+ προλαβεῖν ἡμῖν εἶναι τὴν πρώτην, ὅπως τοὺς συμμάχους
+ σώσομεν. ἐὰν γὰρ τοῦτο βεβαίως ὑπάρξῃ, τότε καὶ περὶ τοῦ
+ τίνα τιμωρήσεταί τις καὶ ὃν τρόπον ἐξέσται σκοπεῖν· πρὶν δὲ
+ τὴν ἀρχὴν ὀρθῶς ὑποθέσθαι, μάταιον ἡγοῦμαι περὶ τῆς
+ τελευτῆς ὁντινοῦν ποιεῖσθαι λόγον.
+
+ Δημοσθένους, Γ´ ᾿Ολυνθιακὸς
+
+Georgian:
+
+ From a Unicode conference invitation:
+
+ გთხოვთ ახლავე გაიაროთ რეგისტრაცია Unicode-ის მეათე საერთაშორისო
+ კონფერენციაზე დასასწრებად, რომელიც გაიმართება 10-12 მარტს,
+ ქ. მაინცში, გერმანიაში. კონფერენცია შეჰკრებს ერთად მსოფლიოს
+ ექსპერტებს ისეთ დარგებში როგორიცაა ინტერნეტი და Unicode-ი,
+ ინტერნაციონალიზაცია და ლოკალიზაცია, Unicode-ის გამოყენება
+ ოპერაციულ სისტემებსა, და გამოყენებით პროგრამებში, შრიფტებში,
+ ტექსტების დამუშავებასა და მრავალენოვან კომპიუტერულ სისტემებში.
+
+Russian:
+
+ From a Unicode conference invitation:
+
+ Зарегистрируйтесь сейчас на Десятую Международную Конференцию по
+ Unicode, которая состоится 10-12 марта 1997 года в Майнце в Германии.
+ Конференция соберет широкий круг экспертов по вопросам глобального
+ Интернета и Unicode, локализации и интернационализации, воплощению и
+ применению Unicode в различных операционных системах и программных
+ приложениях, шрифтах, верстке и многоязычных компьютерных системах.
+
+Thai (UCS Level 2):
+
+ Excerpt from a poetry on The Romance of The Three Kingdoms (a Chinese
+ classic 'San Gua'):
+
+ [----------------------------|------------------------]
+ ๏ แผ่นดินฮั่นเสื่อมโทรมแสนสังเวช พระปกเกศกองบู๊กู้ขึ้นใหม่
+ สิบสองกษัตริย์ก่อนหน้าแลถัดไป สององค์ไซร้โง่เขลาเบาปัญญา
+ ทรงนับถือขันทีเป็นที่พึ่ง บ้านเมืองจึงวิปริตเป็นนักหนา
+ โฮจิ๋นเรียกทัพทั่วหัวเมืองมา หมายจะฆ่ามดชั่วตัวสำคัญ
+ เหมือนขับไสไล่เสือจากเคหา รับหมาป่าเข้ามาเลยอาสัญ
+ ฝ่ายอ้องอุ้นยุแยกให้แตกกัน ใช้สาวนั้นเป็นชนวนชื่นชวนใจ
+ พลันลิฉุยกุยกีกลับก่อเหตุ ช่างอาเพศจริงหนาฟ้าร้องไห้
+ ต้องรบราฆ่าฟันจนบรรลัย ฤๅหาใครค้ำชูกู้บรรลังก์ ฯ
+
+ (The above is a two-column text. If combining characters are handled
+ correctly, the lines of the second column should be aligned with the
+ | character above.)
+
+Ethiopian:
+
+ Proverbs in the Amharic language:
+
+ ሰማይ አይታረስ ንጉሥ አይከሰስ።
+ ብላ ካለኝ እንደአባቴ በቆመጠኝ።
+ ጌጥ ያለቤቱ ቁምጥና ነው።
+ ደሀ በሕልሙ ቅቤ ባይጠጣ ንጣት በገደለው።
+ የአፍ ወለምታ በቅቤ አይታሽም።
+ አይጥ በበላ ዳዋ ተመታ።
+ ሲተረጉሙ ይደረግሙ።
+ ቀስ በቀስ፥ ዕንቁላል በእግሩ ይሄዳል።
+ ድር ቢያብር አንበሳ ያስር።
+ ሰው እንደቤቱ እንጅ እንደ ጉረቤቱ አይተዳደርም።
+ እግዜር የከፈተውን ጉሮሮ ሳይዘጋው አይድርም።
+ የጎረቤት ሌባ፥ ቢያዩት ይስቅ ባያዩት ያጠልቅ።
+ ሥራ ከመፍታት ልጄን ላፋታት።
+ ዓባይ ማደሪያ የለው፥ ግንድ ይዞ ይዞራል።
+ የእስላም አገሩ መካ የአሞራ አገሩ ዋርካ።
+ ተንጋሎ ቢተፉ ተመልሶ ባፉ።
+ ወዳጅህ ማር ቢሆን ጨርስህ አትላሰው።
+ እግርህን በፍራሽህ ልክ ዘርጋ።
+
+Runes:
+
+ ᚻᛖ ᚳᚹᚫᚦ ᚦᚫᛏ ᚻᛖ ᛒᚢᛞᛖ ᚩᚾ ᚦᚫᛗ ᛚᚪᚾᛞᛖ ᚾᚩᚱᚦᚹᛖᚪᚱᛞᚢᛗ ᚹᛁᚦ ᚦᚪ ᚹᛖᛥᚫ
+
+ (Old English, which transcribed into Latin reads 'He cwaeth that he
+ bude thaem lande northweardum with tha Westsae.' and means 'He said
+ that he lived in the northern land near the Western Sea.')
+
+Braille:
+
+ ⡌⠁⠧⠑ ⠼⠁⠒ ⡍⠜⠇⠑⠹⠰⠎ ⡣⠕⠌
+
+ ⡍⠜⠇⠑⠹ ⠺⠁⠎ ⠙⠑⠁⠙⠒ ⠞⠕ ⠃⠑⠛⠔ ⠺⠊⠹⠲ ⡹⠻⠑ ⠊⠎ ⠝⠕ ⠙⠳⠃⠞
+ ⠱⠁⠞⠑⠧⠻ ⠁⠃⠳⠞ ⠹⠁⠞⠲ ⡹⠑ ⠗⠑⠛⠊⠌⠻ ⠕⠋ ⠙⠊⠎ ⠃⠥⠗⠊⠁⠇ ⠺⠁⠎
+ ⠎⠊⠛⠝⠫ ⠃⠹ ⠹⠑ ⠊⠇⠻⠛⠹⠍⠁⠝⠂ ⠹⠑ ⠊⠇⠻⠅⠂ ⠹⠑ ⠥⠝⠙⠻⠞⠁⠅⠻⠂
+ ⠁⠝⠙ ⠹⠑ ⠡⠊⠑⠋ ⠍⠳⠗⠝⠻⠲ ⡎⠊⠗⠕⠕⠛⠑ ⠎⠊⠛⠝⠫ ⠊⠞⠲ ⡁⠝⠙
+ ⡎⠊⠗⠕⠕⠛⠑⠰⠎ ⠝⠁⠍⠑ ⠺⠁⠎ ⠛⠕⠕⠙ ⠥⠏⠕⠝ ⠰⡡⠁⠝⠛⠑⠂ ⠋⠕⠗ ⠁⠝⠹⠹⠔⠛ ⠙⠑
+ ⠡⠕⠎⠑ ⠞⠕ ⠏⠥⠞ ⠙⠊⠎ ⠙⠁⠝⠙ ⠞⠕⠲
+
+ ⡕⠇⠙ ⡍⠜⠇⠑⠹ ⠺⠁⠎ ⠁⠎ ⠙⠑⠁⠙ ⠁⠎ ⠁ ⠙⠕⠕⠗⠤⠝⠁⠊⠇⠲
+
+ ⡍⠔⠙⠖ ⡊ ⠙⠕⠝⠰⠞ ⠍⠑⠁⠝ ⠞⠕ ⠎⠁⠹ ⠹⠁⠞ ⡊ ⠅⠝⠪⠂ ⠕⠋ ⠍⠹
+ ⠪⠝ ⠅⠝⠪⠇⠫⠛⠑⠂ ⠱⠁⠞ ⠹⠻⠑ ⠊⠎ ⠏⠜⠞⠊⠊⠥⠇⠜⠇⠹ ⠙⠑⠁⠙ ⠁⠃⠳⠞
+ ⠁ ⠙⠕⠕⠗⠤⠝⠁⠊⠇⠲ ⡊ ⠍⠊⠣⠞ ⠙⠁⠧⠑ ⠃⠑⠲ ⠔⠊⠇⠔⠫⠂ ⠍⠹⠎⠑⠇⠋⠂ ⠞⠕
+ ⠗⠑⠛⠜⠙ ⠁ ⠊⠕⠋⠋⠔⠤⠝⠁⠊⠇ ⠁⠎ ⠹⠑ ⠙⠑⠁⠙⠑⠌ ⠏⠊⠑⠊⠑ ⠕⠋ ⠊⠗⠕⠝⠍⠕⠝⠛⠻⠹
+ ⠔ ⠹⠑ ⠞⠗⠁⠙⠑⠲ ⡃⠥⠞ ⠹⠑ ⠺⠊⠎⠙⠕⠍ ⠕⠋ ⠳⠗ ⠁⠝⠊⠑⠌⠕⠗⠎
+ ⠊⠎ ⠔ ⠹⠑ ⠎⠊⠍⠊⠇⠑⠆ ⠁⠝⠙ ⠍⠹ ⠥⠝⠙⠁⠇⠇⠪⠫ ⠙⠁⠝⠙⠎
+ ⠩⠁⠇⠇ ⠝⠕⠞ ⠙⠊⠌⠥⠗⠃ ⠊⠞⠂ ⠕⠗ ⠹⠑ ⡊⠳⠝⠞⠗⠹⠰⠎ ⠙⠕⠝⠑ ⠋⠕⠗⠲ ⡹⠳
+ ⠺⠊⠇⠇ ⠹⠻⠑⠋⠕⠗⠑ ⠏⠻⠍⠊⠞ ⠍⠑ ⠞⠕ ⠗⠑⠏⠑⠁⠞⠂ ⠑⠍⠏⠙⠁⠞⠊⠊⠁⠇⠇⠹⠂ ⠹⠁⠞
+ ⡍⠜⠇⠑⠹ ⠺⠁⠎ ⠁⠎ ⠙⠑⠁⠙ ⠁⠎ ⠁ ⠙⠕⠕⠗⠤⠝⠁⠊⠇⠲
+
+ (The first couple of paragraphs of "A Christmas Carol" by Dickens)
+
+Compact font selection example text:
+
+ ABCDEFGHIJKLMNOPQRSTUVWXYZ /0123456789
+ abcdefghijklmnopqrstuvwxyz £©µÀÆÖÞßéöÿ
+ –—‘“”„†•…‰™œŠŸž€ ΑΒΓΔΩαβγδω АБВГДабвгд
+ ∀∂∈ℝ∧∪≡∞ ↑↗↨↻⇣ ┐┼╔╘░►☺♀ fi�⑀₂ἠḂӥẄɐː⍎אԱა
+
+Greetings in various languages:
+
+ Hello world, Καλημέρα κόσμε, コンニチハ
+
+Box drawing alignment tests: █
+ ▉
+ ╔══╦══╗ ┌──┬──┐ ╭──┬──╮ ╭──┬──╮ ┏━━┳━━┓ ┎┒┏┑ ╷ ╻ ┏┯┓ ┌┰┐ ▊ ╱╲╱╲╳╳╳
+ ║┌─╨─┐║ │╔═╧═╗│ │╒═╪═╕│ │╓─╁─╖│ ┃┌─╂─┐┃ ┗╃╄┙ ╶┼╴╺╋╸┠┼┨ ┝╋┥ ▋ ╲╱╲╱╳╳╳
+ ║│╲ ╱│║ │║ ║│ ││ │ ││ │║ ┃ ║│ ┃│ ╿ │┃ ┍╅╆┓ ╵ ╹ ┗┷┛ └┸┘ ▌ ╱╲╱╲╳╳╳
+ ╠╡ ╳ ╞╣ ├╢ ╟┤ ├┼─┼─┼┤ ├╫─╂─╫┤ ┣┿╾┼╼┿┫ ┕┛┖┚ ┌┄┄┐ ╎ ┏┅┅┓ ┋ ▍ ╲╱╲╱╳╳╳
+ ║│╱ ╲│║ │║ ║│ ││ │ ││ │║ ┃ ║│ ┃│ ╽ │┃ ░░▒▒▓▓██ ┊ ┆ ╎ ╏ ┇ ┋ ▎
+ ║└─╥─┘║ │╚═╤═╝│ │╘═╪═╛│ │╙─╀─╜│ ┃└─╂─┘┃ ░░▒▒▓▓██ ┊ ┆ ╎ ╏ ┇ ┋ ▏
+ ╚══╩══╝ └──┴──┘ ╰──┴──╯ ╰──┴──╯ ┗━━┻━━┛ ▗▄▖▛▀▜ └╌╌┘ ╎ ┗╍╍┛ ┋ ▁▂▃▄▅▆▇█
+ ▝▀▘▙▄▟
+Sentences that contain all letters commonly used in a language
+--------------------------------------------------------------
+
+Markus Kuhn -- 2001-09-02
+
+This file is UTF-8 encoded.
+
+
+Danish (da)
+---------
+
+ Quizdeltagerne spiste jordbær med fløde, mens cirkusklovnen
+ Wolther spillede på xylofon.
+ (= Quiz contestants were eating strawbery with cream while Wolther
+ the circus clown played on xylophone.)
+
+German (de)
+-----------
+
+ Falsches Üben von Xylophonmusik quält jeden größeren Zwerg
+ (= Wrongful practicing of xylophone music tortures every larger dwarf)
+
+ Zwölf Boxkämpfer jagten Eva quer über den Sylter Deich
+ (= Twelve boxing fighters hunted Eva across the dike of Sylt)
+
+ Heizölrückstoßabdämpfung
+ (= fuel oil recoil absorber)
+ (jqvwxy missing, but all non-ASCII letters in one word)
+
+English (en)
+------------
+
+ The quick brown fox jumps over the lazy dog
+
+Spanish (es)
+------------
+
+ El pingüino Wenceslao hizo kilómetros bajo exhaustiva lluvia y
+ frío, añoraba a su querido cachorro.
+ (Contains every letter and every accent, but not every combination
+ of vowel + acute.)
+
+French (fr)
+-----------
+
+ Portez ce vieux whisky au juge blond qui fume sur son île intérieure, à
+ côté de l'alcôve ovoïde, où les bûches se consument dans l'âtre, ce
+ qui lui permet de penser à la cænogenèse de l'être dont il est question
+ dans la cause ambiguë entendue à Moÿ, dans un capharnaüm qui,
+ pense-t-il, diminue çà et là la qualité de son œuvre.
+
+ l'île exiguë
+ Où l'obèse jury mûr
+ Fête l'haï volapük,
+ Âne ex aéquo au whist,
+ Ôtez ce vœu déçu.
+
+ Le cœur déçu mais l'âme plutôt naïve, Louÿs rêva de crapaüter en
+ canoë au delà des îles, près du mälström où brûlent les novæ.
+
+Irish Gaelic (ga)
+-----------------
+
+ D'fhuascail Íosa, Úrmhac na hÓighe Beannaithe, pór Éava agus Ádhaimh
+
+Hungarian (hu)
+--------------
+
+ Árvíztűrő tükörfúrógép
+ (= flood-proof mirror-drilling machine, only all non-ASCII letters)
+
+Icelandic (is)
+--------------
+
+ Kæmi ný öxi hér ykist þjófum nú bæði víl og ádrepa
+
+ Sævör grét áðan því úlpan var ónýt
+ (some ASCII letters missing)
+
+Japanese (jp)
+-------------
+
+ Hiragana: (Iroha)
+
+ いろはにほへとちりぬるを
+ わかよたれそつねならむ
+ うゐのおくやまけふこえて
+ あさきゆめみしゑひもせす
+
+ Katakana:
+
+ イロハニホヘト チリヌルヲ ワカヨタレソ ツネナラム
+ ウヰノオクヤマ ケフコエテ アサキユメミシ ヱヒモセスン
+
+Hebrew (iw)
+-----------
+
+ ? דג סקרן שט בים מאוכזב ולפתע מצא לו חברה איך הקליטה
+
+Polish (pl)
+-----------
+
+ Pchnąć w tę łódź jeża lub ośm skrzyń fig
+ (= To push a hedgehog or eight bins of figs in this boat)
+
+Russian (ru)
+------------
+
+ В чащах юга жил бы цитрус? Да, но фальшивый экземпляр!
+ (= Would a citrus live in the bushes of south? Yes, but only a fake one!)
+
+Thai (th)
+---------
+
+ [--------------------------|------------------------]
+ ๏ เป็นมนุษย์สุดประเสริฐเลิศคุณค่า กว่าบรรดาฝูงสัตว์เดรัจฉาน
+ จงฝ่าฟันพัฒนาวิชาการ อย่าล้างผลาญฤๅเข่นฆ่าบีฑาใคร
+ ไม่ถือโทษโกรธแช่งซัดฮึดฮัดด่า หัดอภัยเหมือนกีฬาอัชฌาสัย
+ ปฏิบัติประพฤติกฎกำหนดใจ พูดจาให้จ๊ะๆ จ๋าๆ น่าฟังเอย ฯ
+
+ [The copyright for the Thai example is owned by The Computer
+ Association of Thailand under the Royal Patronage of His Majesty the
+ King.]
+
+Please let me know if you find others! Special thanks to the people
+from all over the world who contributed these sentences.
+? *Unicode Transcriptions* Notes <#Notes>
+
+Glyphs | Samples
+ | Charts
+ | UTF
+ | Forms
+ |
+Home .
+
+
+Name Text Image
+Arabic (Arabic) يونِكود ?
+Arabic (Persian) یونیکُد / ?/
+Armenian Յունիկօդ
+Bengali য়ূনিকোড
+Bopomofo ㄊㄨㄥ˅ ㄧˋ ㄇㄚ˅
+ㄨㄢˋ ㄍㄨㄛˊ ㄇㄚ˅
+Braille
+Buhid
+Canadian Aboriginal ᔫᗂᑰᑦ
+Cherokee ᏳᏂᎪᏛ
+Cypriot
+Cyrillic (Russian) Юникод ?
+Deseret (English) ???????
+Devanagari (Hindi) यूनिकोड ?
+Ethiopic ዩኒኮድ
+Georgian უნიკოდი ?
+Gothic
+Greek Γιούνικοντ
+Gujarati યૂનિકોડ
+Gurmukhi ਯੂਨਿਕੋਡ
+Han (Chinese) 统一码 ?
+統一碼 ?
+万国码 ?
+萬國碼 ?
+Hangul 유니코드
+Hanunoo
+Hebrew יוניקוד
+Hebrew (pointed) יוּנִיקוׁד
+Hebrew (Yiddish) יוניקאָד ?
+Hiragana (Japanese) ゆにこおど
+Katakana (Japanese) ユニコード ?
+Kannada ಯೂನಿಕೋಡ್
+Khmer យូនីគោដ
+Lao
+Latin Unicode Unicode
+Latin (IPA <#English_Pronunciation>) ˈjunɪˌkoːd ?
+Latin (Am. Dict. <#American_Dictionary>) Ūnĭcōde̽ ?
+Limbu
+Linear B
+Malayalam യൂനികോഡ്
+Mongolian
+Myanmar
+Ogham ᚔᚒᚅᚔᚉᚑᚇ / /
+Old Italic
+Oriya ୟୂନିକୋଡ
+Osmanya
+Runic (Anglo-Saxon) ᛡᚢᚾᛁᚳᚩᛞ
+Shavian
+Sinhala යණනිකෞද්
+Syriac ܝܘܢܝܩܘܕ
+Tagbanwa
+Tagalog
+Tai Le
+Tamil யூனிகோட்
+Telugu యూనికోడ్
+Thaana
+Thai ยูนืโคด
+Tibetan (Dzongkha) ཨུ་ནི་ཀོཌྲ།
+Ugaritic
+Yi
+
+
+ Notes:
+
+There are different ways to transcribe the word “Unicode”, depending on
+the language and script. In some cases there is only one language that
+customarily uses a given script; in others there are many languages. The
+goal here is at a minimum to collect at least one transcription for each
+script in a language customarily written in that script, with more
+languages if possible. If the transcription is the same for multiple
+languages in a script, then a single representative language is used.
+
+Still missing are transcriptions for the items above in RED (in at least
+one language). I would appreciate any other transcriptions, or
+corrections for the ones listed here. Send to mark3@macchiato.com
+, using the directions below:
+
+ * *Supplying Missing Items*
+ o Most Latin-script languages will follow the spelling, and
+ change the pronunciation. For any that would not, it would
+ be good to have the alternate spelling.
+ o For non-Latin scripts the goal is to match the English
+ pronunciation — /*not*/ spelling. Above is the IPA <#IPA>
+ (in phonemic transcription) that should be matched as
+ closely as possible (without sounding affected in the target
+ language)
+ o Text would be best in either the UTF-8 text, or the code
+ points in hex HTML. E.g. either of the following:
+ + "Юникод"
+ + "Юникод"
+ + Note: for / supplementary characters/
+ ,
+ there should be one hex number per code point, not two
+ surrogates
+ :
+ # 𐀀 /*not*/ &xDC00;
+ o If you have a good font, I'd also appreciate a GIF. It
+ should be *96 x 24* bits, with the text centered, in black
+ on white (plus grays if smoothed).
+ * *Other Comments*
+ o Because some browsers won't handle the text, both text and
+ GIF image are supplied. If you can’t read the text columns,
+ see Display Problems
+ .
+ o The Chinese versions (inc. Bopomofo) are translations, not
+ transcriptions, since "transcription in Chinese is pretty
+ lame" [J. Becker].
+ o There are other "translations" of Unicode that may be in
+ use, such as the Vietnamese "Thống Nhất Mã".
+ o For sample pages in different languages on the Unicode site,
+ see What is Unicode?
+
+ o Americans are not generally used to IPA, and find a variety
+ of different systems in their dictionaries. This one leaves
+ the base letters as they are, and uses diacritics for
+ pronunciation.
+ * *Etymology of /Unicode/*
+ o Coined by J. Becker. Not related to previous usages, such as:
+ + A telegraphic code in which one word or set of letters
+ represents a sentence or phrase; a telegram or message
+ in this. (late 19th century, OED)
+ o According to my references, the prefix "uni" is directly
+ from Latin while the word "code" is through French.
+ o The original Indo-European apparently would have been
+ *oino-kau-do ("one strike give"): *kau apparently being
+ related to such English words as: hew, haggle, hoe, hag,
+ hay, hack, caudad, caudal, caudate, caudex, coda, codex,
+ codicil, coward, incus, and Kovač (personal name: "smith").
+ + I will leave the exact derivations to the exegetes,
+ but I like the association with "haggle" myself.
+ * *Contributions*
+ o This draws on contributions or comments from:
+ + Dixon Au
+ + Joe Becker
+ + Maurice Bauhahn
+ + Abel Cheung
+ + Peter Constable
+ + Michael Everson
+ + Christopher John Fynn
+ + Michael Kaplan
+ + George Kiraz
+ + Abdul Malik
+ + Siva Nataraja
+ + Roozbeh Pournader
+ + Jonathan Rosenne
+ + Jungshik Shin
+
+------------------------------------------------------------------------
+
+
+Terms of Use . Last updated:
+MED - 04/20/2003 15:30:33.
+
+
+
+
+
+UTF-8 encoded sample plain-text file
+‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾
+
+Markus Kuhn [ˈmaʳkʊs kuːn] — 2002-07-25
+
+
+The ASCII compatible UTF-8 encoding used in this plain-text file
+is defined in Unicode, ISO 10646-1, and RFC 2279.
+
+
+Using Unicode/UTF-8, you can write in emails and source code things such as
+
+Mathematics and sciences:
+
+ ∮ E⋅da = Q, n → ∞, ∑ f(i) = ∏ g(i), ⎧⎡⎛┌─────┐⎞⎤⎫
+ ⎪⎢⎜│a²+b³ ⎟⎥⎪
+ ∀x∈ℝ: ⌈x⌉ = −⌊−x⌋, α ∧ ¬β = ¬(¬α ∨ β), ⎪⎢⎜│───── ⎟⎥⎪
+ ⎪⎢⎜⎷ c₈ ⎟⎥⎪
+ ℕ ⊆ ℕ₀ ⊂ ℤ ⊂ ℚ ⊂ ℝ ⊂ ℂ, ⎨⎢⎜ ⎟⎥⎬
+ ⎪⎢⎜ ∞ ⎟⎥⎪
+ ⊥ < a ≠ b ≡ c ≤ d ≪ ⊤ ⇒ (⟦A⟧ ⇔ ⟪B⟫), ⎪⎢⎜ ⎲ ⎟⎥⎪
+ ⎪⎢⎜ ⎳aⁱ-bⁱ⎟⎥⎪
+ 2H₂ + O₂ ⇌ 2H₂O, R = 4.7 kΩ, ⌀ 200 mm ⎩⎣⎝i=1 ⎠⎦⎭
+
+Linguistics and dictionaries:
+
+ ði ıntəˈnæʃənəl fəˈnɛtık əsoʊsiˈeıʃn
+ Y [ˈʏpsilɔn], Yen [jɛn], Yoga [ˈjoːgɑ]
+
+APL:
+
+ ((V⍳V)=⍳⍴V)/V←,V ⌷←⍳→⍴∆∇⊃‾⍎⍕⌈
+
+Nicer typography in plain text files:
+
+ ╔══════════════════════════════════════════╗
+ ║ ║
+ ║ • ‘single’ and “double” quotes ║
+ ║ ║
+ ║ • Curly apostrophes: “We’ve been here” ║
+ ║ ║
+ ║ • Latin-1 apostrophe and accents: '´` ║
+ ║ ║
+ ║ • ‚deutsche‘ „Anführungszeichen“ ║
+ ║ ║
+ ║ • †, ‡, ‰, •, 3–4, —, −5/+5, ™, … ║
+ ║ ║
+ ║ • ASCII safety test: 1lI|, 0OD, 8B ║
+ ║ ╭─────────╮ ║
+ ║ • the euro symbol: │ 14.95 € │ ║
+ ║ ╰─────────╯ ║
+ ╚══════════════════════════════════════════╝
+
+Combining characters:
+
+ STARGΛ̊TE SG-1, a = v̇ = r̈, a⃑ ⊥ b⃑
+
+Greek (in Polytonic):
+
+ The Greek anthem:
+
+ Σὲ γνωρίζω ἀπὸ τὴν κόψη
+ τοῦ σπαθιοῦ τὴν τρομερή,
+ σὲ γνωρίζω ἀπὸ τὴν ὄψη
+ ποὺ μὲ βία μετράει τὴ γῆ.
+
+ ᾿Απ᾿ τὰ κόκκαλα βγαλμένη
+ τῶν ῾Ελλήνων τὰ ἱερά
+ καὶ σὰν πρῶτα ἀνδρειωμένη
+ χαῖρε, ὦ χαῖρε, ᾿Ελευθεριά!
+
+ From a speech of Demosthenes in the 4th century BC:
+
+ Οὐχὶ ταὐτὰ παρίσταταί μοι γιγνώσκειν, ὦ ἄνδρες ᾿Αθηναῖοι,
+ ὅταν τ᾿ εἰς τὰ πράγματα ἀποβλέψω καὶ ὅταν πρὸς τοὺς
+ λόγους οὓς ἀκούω· τοὺς μὲν γὰρ λόγους περὶ τοῦ
+ τιμωρήσασθαι Φίλιππον ὁρῶ γιγνομένους, τὰ δὲ πράγματ᾿
+ εἰς τοῦτο προήκοντα, ὥσθ᾿ ὅπως μὴ πεισόμεθ᾿ αὐτοὶ
+ πρότερον κακῶς σκέψασθαι δέον. οὐδέν οὖν ἄλλο μοι δοκοῦσιν
+ οἱ τὰ τοιαῦτα λέγοντες ἢ τὴν ὑπόθεσιν, περὶ ἧς βουλεύεσθαι,
+ οὐχὶ τὴν οὖσαν παριστάντες ὑμῖν ἁμαρτάνειν. ἐγὼ δέ, ὅτι μέν
+ ποτ᾿ ἐξῆν τῇ πόλει καὶ τὰ αὑτῆς ἔχειν ἀσφαλῶς καὶ Φίλιππον
+ τιμωρήσασθαι, καὶ μάλ᾿ ἀκριβῶς οἶδα· ἐπ᾿ ἐμοῦ γάρ, οὐ πάλαι
+ γέγονεν ταῦτ᾿ ἀμφότερα· νῦν μέντοι πέπεισμαι τοῦθ᾿ ἱκανὸν
+ προλαβεῖν ἡμῖν εἶναι τὴν πρώτην, ὅπως τοὺς συμμάχους
+ σώσομεν. ἐὰν γὰρ τοῦτο βεβαίως ὑπάρξῃ, τότε καὶ περὶ τοῦ
+ τίνα τιμωρήσεταί τις καὶ ὃν τρόπον ἐξέσται σκοπεῖν· πρὶν δὲ
+ τὴν ἀρχὴν ὀρθῶς ὑποθέσθαι, μάταιον ἡγοῦμαι περὶ τῆς
+ τελευτῆς ὁντινοῦν ποιεῖσθαι λόγον.
+
+ Δημοσθένους, Γ´ ᾿Ολυνθιακὸς
+
+Georgian:
+
+ From a Unicode conference invitation:
+
+ გთხოვთ ახლავე გაიაროთ რეგისტრაცია Unicode-ის მეათე საერთაშორისო
+ კონფერენციაზე დასასწრებად, რომელიც გაიმართება 10-12 მარტს,
+ ქ. მაინცში, გერმანიაში. კონფერენცია შეჰკრებს ერთად მსოფლიოს
+ ექსპერტებს ისეთ დარგებში როგორიცაა ინტერნეტი და Unicode-ი,
+ ინტერნაციონალიზაცია და ლოკალიზაცია, Unicode-ის გამოყენება
+ ოპერაციულ სისტემებსა, და გამოყენებით პროგრამებში, შრიფტებში,
+ ტექსტების დამუშავებასა და მრავალენოვან კომპიუტერულ სისტემებში.
+
+Russian:
+
+ From a Unicode conference invitation:
+
+ Зарегистрируйтесь сейчас на Десятую Международную Конференцию по
+ Unicode, которая состоится 10-12 марта 1997 года в Майнце в Германии.
+ Конференция соберет широкий круг экспертов по вопросам глобального
+ Интернета и Unicode, локализации и интернационализации, воплощению и
+ применению Unicode в различных операционных системах и программных
+ приложениях, шрифтах, верстке и многоязычных компьютерных системах.
+
+Thai (UCS Level 2):
+
+ Excerpt from a poetry on The Romance of The Three Kingdoms (a Chinese
+ classic 'San Gua'):
+
+ [----------------------------|------------------------]
+ ๏ แผ่นดินฮั่นเสื่อมโทรมแสนสังเวช พระปกเกศกองบู๊กู้ขึ้นใหม่
+ สิบสองกษัตริย์ก่อนหน้าแลถัดไป สององค์ไซร้โง่เขลาเบาปัญญา
+ ทรงนับถือขันทีเป็นที่พึ่ง บ้านเมืองจึงวิปริตเป็นนักหนา
+ โฮจิ๋นเรียกทัพทั่วหัวเมืองมา หมายจะฆ่ามดชั่วตัวสำคัญ
+ เหมือนขับไสไล่เสือจากเคหา รับหมาป่าเข้ามาเลยอาสัญ
+ ฝ่ายอ้องอุ้นยุแยกให้แตกกัน ใช้สาวนั้นเป็นชนวนชื่นชวนใจ
+ พลันลิฉุยกุยกีกลับก่อเหตุ ช่างอาเพศจริงหนาฟ้าร้องไห้
+ ต้องรบราฆ่าฟันจนบรรลัย ฤๅหาใครค้ำชูกู้บรรลังก์ ฯ
+
+ (The above is a two-column text. If combining characters are handled
+ correctly, the lines of the second column should be aligned with the
+ | character above.)
+
+Ethiopian:
+
+ Proverbs in the Amharic language:
+
+ ሰማይ አይታረስ ንጉሥ አይከሰስ።
+ ብላ ካለኝ እንደአባቴ በቆመጠኝ።
+ ጌጥ ያለቤቱ ቁምጥና ነው።
+ ደሀ በሕልሙ ቅቤ ባይጠጣ ንጣት በገደለው።
+ የአፍ ወለምታ በቅቤ አይታሽም።
+ አይጥ በበላ ዳዋ ተመታ።
+ ሲተረጉሙ ይደረግሙ።
+ ቀስ በቀስ፥ ዕንቁላል በእግሩ ይሄዳል።
+ ድር ቢያብር አንበሳ ያስር።
+ ሰው እንደቤቱ እንጅ እንደ ጉረቤቱ አይተዳደርም።
+ እግዜር የከፈተውን ጉሮሮ ሳይዘጋው አይድርም።
+ የጎረቤት ሌባ፥ ቢያዩት ይስቅ ባያዩት ያጠልቅ።
+ ሥራ ከመፍታት ልጄን ላፋታት።
+ ዓባይ ማደሪያ የለው፥ ግንድ ይዞ ይዞራል።
+ የእስላም አገሩ መካ የአሞራ አገሩ ዋርካ።
+ ተንጋሎ ቢተፉ ተመልሶ ባፉ።
+ ወዳጅህ ማር ቢሆን ጨርስህ አትላሰው።
+ እግርህን በፍራሽህ ልክ ዘርጋ።
+
+Runes:
+
+ ᚻᛖ ᚳᚹᚫᚦ ᚦᚫᛏ ᚻᛖ ᛒᚢᛞᛖ ᚩᚾ ᚦᚫᛗ ᛚᚪᚾᛞᛖ ᚾᚩᚱᚦᚹᛖᚪᚱᛞᚢᛗ ᚹᛁᚦ ᚦᚪ ᚹᛖᛥᚫ
+
+ (Old English, which transcribed into Latin reads 'He cwaeth that he
+ bude thaem lande northweardum with tha Westsae.' and means 'He said
+ that he lived in the northern land near the Western Sea.')
+
+Braille:
+
+ ⡌⠁⠧⠑ ⠼⠁⠒ ⡍⠜⠇⠑⠹⠰⠎ ⡣⠕⠌
+
+ ⡍⠜⠇⠑⠹ ⠺⠁⠎ ⠙⠑⠁⠙⠒ ⠞⠕ ⠃⠑⠛⠔ ⠺⠊⠹⠲ ⡹⠻⠑ ⠊⠎ ⠝⠕ ⠙⠳⠃⠞
+ ⠱⠁⠞⠑⠧⠻ ⠁⠃⠳⠞ ⠹⠁⠞⠲ ⡹⠑ ⠗⠑⠛⠊⠌⠻ ⠕⠋ ⠙⠊⠎ ⠃⠥⠗⠊⠁⠇ ⠺⠁⠎
+ ⠎⠊⠛⠝⠫ ⠃⠹ ⠹⠑ ⠊⠇⠻⠛⠹⠍⠁⠝⠂ ⠹⠑ ⠊⠇⠻⠅⠂ ⠹⠑ ⠥⠝⠙⠻⠞⠁⠅⠻⠂
+ ⠁⠝⠙ ⠹⠑ ⠡⠊⠑⠋ ⠍⠳⠗⠝⠻⠲ ⡎⠊⠗⠕⠕⠛⠑ ⠎⠊⠛⠝⠫ ⠊⠞⠲ ⡁⠝⠙
+ ⡎⠊⠗⠕⠕⠛⠑⠰⠎ ⠝⠁⠍⠑ ⠺⠁⠎ ⠛⠕⠕⠙ ⠥⠏⠕⠝ ⠰⡡⠁⠝⠛⠑⠂ ⠋⠕⠗ ⠁⠝⠹⠹⠔⠛ ⠙⠑
+ ⠡⠕⠎⠑ ⠞⠕ ⠏⠥⠞ ⠙⠊⠎ ⠙⠁⠝⠙ ⠞⠕⠲
+
+ ⡕⠇⠙ ⡍⠜⠇⠑⠹ ⠺⠁⠎ ⠁⠎ ⠙⠑⠁⠙ ⠁⠎ ⠁ ⠙⠕⠕⠗⠤⠝⠁⠊⠇⠲
+
+ ⡍⠔⠙⠖ ⡊ ⠙⠕⠝⠰⠞ ⠍⠑⠁⠝ ⠞⠕ ⠎⠁⠹ ⠹⠁⠞ ⡊ ⠅⠝⠪⠂ ⠕⠋ ⠍⠹
+ ⠪⠝ ⠅⠝⠪⠇⠫⠛⠑⠂ ⠱⠁⠞ ⠹⠻⠑ ⠊⠎ ⠏⠜⠞⠊⠊⠥⠇⠜⠇⠹ ⠙⠑⠁⠙ ⠁⠃⠳⠞
+ ⠁ ⠙⠕⠕⠗⠤⠝⠁⠊⠇⠲ ⡊ ⠍⠊⠣⠞ ⠙⠁⠧⠑ ⠃⠑⠲ ⠔⠊⠇⠔⠫⠂ ⠍⠹⠎⠑⠇⠋⠂ ⠞⠕
+ ⠗⠑⠛⠜⠙ ⠁ ⠊⠕⠋⠋⠔⠤⠝⠁⠊⠇ ⠁⠎ ⠹⠑ ⠙⠑⠁⠙⠑⠌ ⠏⠊⠑⠊⠑ ⠕⠋ ⠊⠗⠕⠝⠍⠕⠝⠛⠻⠹
+ ⠔ ⠹⠑ ⠞⠗⠁⠙⠑⠲ ⡃⠥⠞ ⠹⠑ ⠺⠊⠎⠙⠕⠍ ⠕⠋ ⠳⠗ ⠁⠝⠊⠑⠌⠕⠗⠎
+ ⠊⠎ ⠔ ⠹⠑ ⠎⠊⠍⠊⠇⠑⠆ ⠁⠝⠙ ⠍⠹ ⠥⠝⠙⠁⠇⠇⠪⠫ ⠙⠁⠝⠙⠎
+ ⠩⠁⠇⠇ ⠝⠕⠞ ⠙⠊⠌⠥⠗⠃ ⠊⠞⠂ ⠕⠗ ⠹⠑ ⡊⠳⠝⠞⠗⠹⠰⠎ ⠙⠕⠝⠑ ⠋⠕⠗⠲ ⡹⠳
+ ⠺⠊⠇⠇ ⠹⠻⠑⠋⠕⠗⠑ ⠏⠻⠍⠊⠞ ⠍⠑ ⠞⠕ ⠗⠑⠏⠑⠁⠞⠂ ⠑⠍⠏⠙⠁⠞⠊⠊⠁⠇⠇⠹⠂ ⠹⠁⠞
+ ⡍⠜⠇⠑⠹ ⠺⠁⠎ ⠁⠎ ⠙⠑⠁⠙ ⠁⠎ ⠁ ⠙⠕⠕⠗⠤⠝⠁⠊⠇⠲
+
+ (The first couple of paragraphs of "A Christmas Carol" by Dickens)
+
+Compact font selection example text:
+
+ ABCDEFGHIJKLMNOPQRSTUVWXYZ /0123456789
+ abcdefghijklmnopqrstuvwxyz £©µÀÆÖÞßéöÿ
+ –—‘“”„†•…‰™œŠŸž€ ΑΒΓΔΩαβγδω АБВГДабвгд
+ ∀∂∈ℝ∧∪≡∞ ↑↗↨↻⇣ ┐┼╔╘░►☺♀ fi�⑀₂ἠḂӥẄɐː⍎אԱა
+
+Greetings in various languages:
+
+ Hello world, Καλημέρα κόσμε, コンニチハ
+
+Box drawing alignment tests: █
+ ▉
+ ╔══╦══╗ ┌──┬──┐ ╭──┬──╮ ╭──┬──╮ ┏━━┳━━┓ ┎┒┏┑ ╷ ╻ ┏┯┓ ┌┰┐ ▊ ╱╲╱╲╳╳╳
+ ║┌─╨─┐║ │╔═╧═╗│ │╒═╪═╕│ │╓─╁─╖│ ┃┌─╂─┐┃ ┗╃╄┙ ╶┼╴╺╋╸┠┼┨ ┝╋┥ ▋ ╲╱╲╱╳╳╳
+ ║│╲ ╱│║ │║ ║│ ││ │ ││ │║ ┃ ║│ ┃│ ╿ │┃ ┍╅╆┓ ╵ ╹ ┗┷┛ └┸┘ ▌ ╱╲╱╲╳╳╳
+ ╠╡ ╳ ╞╣ ├╢ ╟┤ ├┼─┼─┼┤ ├╫─╂─╫┤ ┣┿╾┼╼┿┫ ┕┛┖┚ ┌┄┄┐ ╎ ┏┅┅┓ ┋ ▍ ╲╱╲╱╳╳╳
+ ║│╱ ╲│║ │║ ║│ ││ │ ││ │║ ┃ ║│ ┃│ ╽ │┃ ░░▒▒▓▓██ ┊ ┆ ╎ ╏ ┇ ┋ ▎
+ ║└─╥─┘║ │╚═╤═╝│ │╘═╪═╛│ │╙─╀─╜│ ┃└─╂─┘┃ ░░▒▒▓▓██ ┊ ┆ ╎ ╏ ┇ ┋ ▏
+ ╚══╩══╝ └──┴──┘ ╰──┴──╯ ╰──┴──╯ ┗━━┻━━┛ ▗▄▖▛▀▜ └╌╌┘ ╎ ┗╍╍┛ ┋ ▁▂▃▄▅▆▇█
+ ▝▀▘▙▄▟
+Sentences that contain all letters commonly used in a language
+--------------------------------------------------------------
+
+Markus Kuhn -- 2001-09-02
+
+This file is UTF-8 encoded.
+
+
+Danish (da)
+---------
+
+ Quizdeltagerne spiste jordbær med fløde, mens cirkusklovnen
+ Wolther spillede på xylofon.
+ (= Quiz contestants were eating strawbery with cream while Wolther
+ the circus clown played on xylophone.)
+
+German (de)
+-----------
+
+ Falsches Üben von Xylophonmusik quält jeden größeren Zwerg
+ (= Wrongful practicing of xylophone music tortures every larger dwarf)
+
+ Zwölf Boxkämpfer jagten Eva quer über den Sylter Deich
+ (= Twelve boxing fighters hunted Eva across the dike of Sylt)
+
+ Heizölrückstoßabdämpfung
+ (= fuel oil recoil absorber)
+ (jqvwxy missing, but all non-ASCII letters in one word)
+
+English (en)
+------------
+
+ The quick brown fox jumps over the lazy dog
+
+Spanish (es)
+------------
+
+ El pingüino Wenceslao hizo kilómetros bajo exhaustiva lluvia y
+ frío, añoraba a su querido cachorro.
+ (Contains every letter and every accent, but not every combination
+ of vowel + acute.)
+
+French (fr)
+-----------
+
+ Portez ce vieux whisky au juge blond qui fume sur son île intérieure, à
+ côté de l'alcôve ovoïde, où les bûches se consument dans l'âtre, ce
+ qui lui permet de penser à la cænogenèse de l'être dont il est question
+ dans la cause ambiguë entendue à Moÿ, dans un capharnaüm qui,
+ pense-t-il, diminue çà et là la qualité de son œuvre.
+
+ l'île exiguë
+ Où l'obèse jury mûr
+ Fête l'haï volapük,
+ Âne ex aéquo au whist,
+ Ôtez ce vœu déçu.
+
+ Le cœur déçu mais l'âme plutôt naïve, Louÿs rêva de crapaüter en
+ canoë au delà des îles, près du mälström où brûlent les novæ.
+
+Irish Gaelic (ga)
+-----------------
+
+ D'fhuascail Íosa, Úrmhac na hÓighe Beannaithe, pór Éava agus Ádhaimh
+
+Hungarian (hu)
+--------------
+
+ Árvíztűrő tükörfúrógép
+ (= flood-proof mirror-drilling machine, only all non-ASCII letters)
+
+Icelandic (is)
+--------------
+
+ Kæmi ný öxi hér ykist þjófum nú bæði víl og ádrepa
+
+ Sævör grét áðan því úlpan var ónýt
+ (some ASCII letters missing)
+
+Japanese (jp)
+-------------
+
+ Hiragana: (Iroha)
+
+ いろはにほへとちりぬるを
+ わかよたれそつねならむ
+ うゐのおくやまけふこえて
+ あさきゆめみしゑひもせす
+
+ Katakana:
+
+ イロハニホヘト チリヌルヲ ワカヨタレソ ツネナラム
+ ウヰノオクヤマ ケフコエテ アサキユメミシ ヱヒモセスン
+
+Hebrew (iw)
+-----------
+
+ ? דג סקרן שט בים מאוכזב ולפתע מצא לו חברה איך הקליטה
+
+Polish (pl)
+-----------
+
+ Pchnąć w tę łódź jeża lub ośm skrzyń fig
+ (= To push a hedgehog or eight bins of figs in this boat)
+
+Russian (ru)
+------------
+
+ В чащах юга жил бы цитрус? Да, но фальшивый экземпляр!
+ (= Would a citrus live in the bushes of south? Yes, but only a fake one!)
+
+Thai (th)
+---------
+
+ [--------------------------|------------------------]
+ ๏ เป็นมนุษย์สุดประเสริฐเลิศคุณค่า กว่าบรรดาฝูงสัตว์เดรัจฉาน
+ จงฝ่าฟันพัฒนาวิชาการ อย่าล้างผลาญฤๅเข่นฆ่าบีฑาใคร
+ ไม่ถือโทษโกรธแช่งซัดฮึดฮัดด่า หัดอภัยเหมือนกีฬาอัชฌาสัย
+ ปฏิบัติประพฤติกฎกำหนดใจ พูดจาให้จ๊ะๆ จ๋าๆ น่าฟังเอย ฯ
+
+ [The copyright for the Thai example is owned by The Computer
+ Association of Thailand under the Royal Patronage of His Majesty the
+ King.]
+
+Please let me know if you find others! Special thanks to the people
+from all over the world who contributed these sentences.
+? *Unicode Transcriptions* Notes <#Notes>
+
+Glyphs | Samples
+ | Charts
+ | UTF
+ | Forms
+ |
+Home .
+
+
+Name Text Image
+Arabic (Arabic) يونِكود ?
+Arabic (Persian) یونیکُد / ?/
+Armenian Յունիկօդ
+Bengali য়ূনিকোড
+Bopomofo ㄊㄨㄥ˅ ㄧˋ ㄇㄚ˅
+ㄨㄢˋ ㄍㄨㄛˊ ㄇㄚ˅
+Braille
+Buhid
+Canadian Aboriginal ᔫᗂᑰᑦ
+Cherokee ᏳᏂᎪᏛ
+Cypriot
+Cyrillic (Russian) Юникод ?
+Deseret (English) ???????
+Devanagari (Hindi) यूनिकोड ?
+Ethiopic ዩኒኮድ
+Georgian უნიკოდი ?
+Gothic
+Greek Γιούνικοντ
+Gujarati યૂનિકોડ
+Gurmukhi ਯੂਨਿਕੋਡ
+Han (Chinese) 统一码 ?
+統一碼 ?
+万国码 ?
+萬國碼 ?
+Hangul 유니코드
+Hanunoo
+Hebrew יוניקוד
+Hebrew (pointed) יוּנִיקוׁד
+Hebrew (Yiddish) יוניקאָד ?
+Hiragana (Japanese) ゆにこおど
+Katakana (Japanese) ユニコード ?
+Kannada ಯೂನಿಕೋಡ್
+Khmer យូនីគោដ
+Lao
+Latin Unicode Unicode
+Latin (IPA <#English_Pronunciation>) ˈjunɪˌkoːd ?
+Latin (Am. Dict. <#American_Dictionary>) Ūnĭcōde̽ ?
+Limbu
+Linear B
+Malayalam യൂനികോഡ്
+Mongolian
+Myanmar
+Ogham ᚔᚒᚅᚔᚉᚑᚇ / /
+Old Italic
+Oriya ୟୂନିକୋଡ
+Osmanya
+Runic (Anglo-Saxon) ᛡᚢᚾᛁᚳᚩᛞ
+Shavian
+Sinhala යණනිකෞද්
+Syriac ܝܘܢܝܩܘܕ
+Tagbanwa
+Tagalog
+Tai Le
+Tamil யூனிகோட்
+Telugu యూనికోడ్
+Thaana
+Thai ยูนืโคด
+Tibetan (Dzongkha) ཨུ་ནི་ཀོཌྲ།
+Ugaritic
+Yi
+
+
+ Notes:
+
+There are different ways to transcribe the word “Unicode”, depending on
+the language and script. In some cases there is only one language that
+customarily uses a given script; in others there are many languages. The
+goal here is at a minimum to collect at least one transcription for each
+script in a language customarily written in that script, with more
+languages if possible. If the transcription is the same for multiple
+languages in a script, then a single representative language is used.
+
+Still missing are transcriptions for the items above in RED (in at least
+one language). I would appreciate any other transcriptions, or
+corrections for the ones listed here. Send to mark3@macchiato.com
+, using the directions below:
+
+ * *Supplying Missing Items*
+ o Most Latin-script languages will follow the spelling, and
+ change the pronunciation. For any that would not, it would
+ be good to have the alternate spelling.
+ o For non-Latin scripts the goal is to match the English
+ pronunciation — /*not*/ spelling. Above is the IPA <#IPA>
+ (in phonemic transcription) that should be matched as
+ closely as possible (without sounding affected in the target
+ language)
+ o Text would be best in either the UTF-8 text, or the code
+ points in hex HTML. E.g. either of the following:
+ + "Юникод"
+ + "Юникод"
+ + Note: for / supplementary characters/
+ ,
+ there should be one hex number per code point, not two
+ surrogates
+ :
+ # 𐀀 /*not*/ &xDC00;
+ o If you have a good font, I'd also appreciate a GIF. It
+ should be *96 x 24* bits, with the text centered, in black
+ on white (plus grays if smoothed).
+ * *Other Comments*
+ o Because some browsers won't handle the text, both text and
+ GIF image are supplied. If you can’t read the text columns,
+ see Display Problems
+ .
+ o The Chinese versions (inc. Bopomofo) are translations, not
+ transcriptions, since "transcription in Chinese is pretty
+ lame" [J. Becker].
+ o There are other "translations" of Unicode that may be in
+ use, such as the Vietnamese "Thống Nhất Mã".
+ o For sample pages in different languages on the Unicode site,
+ see What is Unicode?
+
+ o Americans are not generally used to IPA, and find a variety
+ of different systems in their dictionaries. This one leaves
+ the base letters as they are, and uses diacritics for
+ pronunciation.
+ * *Etymology of /Unicode/*
+ o Coined by J. Becker. Not related to previous usages, such as:
+ + A telegraphic code in which one word or set of letters
+ represents a sentence or phrase; a telegram or message
+ in this. (late 19th century, OED)
+ o According to my references, the prefix "uni" is directly
+ from Latin while the word "code" is through French.
+ o The original Indo-European apparently would have been
+ *oino-kau-do ("one strike give"): *kau apparently being
+ related to such English words as: hew, haggle, hoe, hag,
+ hay, hack, caudad, caudal, caudate, caudex, coda, codex,
+ codicil, coward, incus, and Kovač (personal name: "smith").
+ + I will leave the exact derivations to the exegetes,
+ but I like the association with "haggle" myself.
+ * *Contributions*
+ o This draws on contributions or comments from:
+ + Dixon Au
+ + Joe Becker
+ + Maurice Bauhahn
+ + Abel Cheung
+ + Peter Constable
+ + Michael Everson
+ + Christopher John Fynn
+ + Michael Kaplan
+ + George Kiraz
+ + Abdul Malik
+ + Siva Nataraja
+ + Roozbeh Pournader
+ + Jonathan Rosenne
+ + Jungshik Shin
+
+------------------------------------------------------------------------
+
+
+Terms of Use . Last updated:
+MED - 04/20/2003 15:30:33.
+
+
+
+
+
+UTF-8 encoded sample plain-text file
+‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾
+
+Markus Kuhn [ˈmaʳkʊs kuːn] — 2002-07-25
+
+
+The ASCII compatible UTF-8 encoding used in this plain-text file
+is defined in Unicode, ISO 10646-1, and RFC 2279.
+
+
+Using Unicode/UTF-8, you can write in emails and source code things such as
+
+Mathematics and sciences:
+
+ ∮ E⋅da = Q, n → ∞, ∑ f(i) = ∏ g(i), ⎧⎡⎛┌─────┐⎞⎤⎫
+ ⎪⎢⎜│a²+b³ ⎟⎥⎪
+ ∀x∈ℝ: ⌈x⌉ = −⌊−x⌋, α ∧ ¬β = ¬(¬α ∨ β), ⎪⎢⎜│───── ⎟⎥⎪
+ ⎪⎢⎜⎷ c₈ ⎟⎥⎪
+ ℕ ⊆ ℕ₀ ⊂ ℤ ⊂ ℚ ⊂ ℝ ⊂ ℂ, ⎨⎢⎜ ⎟⎥⎬
+ ⎪⎢⎜ ∞ ⎟⎥⎪
+ ⊥ < a ≠ b ≡ c ≤ d ≪ ⊤ ⇒ (⟦A⟧ ⇔ ⟪B⟫), ⎪⎢⎜ ⎲ ⎟⎥⎪
+ ⎪⎢⎜ ⎳aⁱ-bⁱ⎟⎥⎪
+ 2H₂ + O₂ ⇌ 2H₂O, R = 4.7 kΩ, ⌀ 200 mm ⎩⎣⎝i=1 ⎠⎦⎭
+
+Linguistics and dictionaries:
+
+ ði ıntəˈnæʃənəl fəˈnɛtık əsoʊsiˈeıʃn
+ Y [ˈʏpsilɔn], Yen [jɛn], Yoga [ˈjoːgɑ]
+
+APL:
+
+ ((V⍳V)=⍳⍴V)/V←,V ⌷←⍳→⍴∆∇⊃‾⍎⍕⌈
+
+Nicer typography in plain text files:
+
+ ╔══════════════════════════════════════════╗
+ ║ ║
+ ║ • ‘single’ and “double” quotes ║
+ ║ ║
+ ║ • Curly apostrophes: “We’ve been here” ║
+ ║ ║
+ ║ • Latin-1 apostrophe and accents: '´` ║
+ ║ ║
+ ║ • ‚deutsche‘ „Anführungszeichen“ ║
+ ║ ║
+ ║ • †, ‡, ‰, •, 3–4, —, −5/+5, ™, … ║
+ ║ ║
+ ║ • ASCII safety test: 1lI|, 0OD, 8B ║
+ ║ ╭─────────╮ ║
+ ║ • the euro symbol: │ 14.95 € │ ║
+ ║ ╰─────────╯ ║
+ ╚══════════════════════════════════════════╝
+
+Combining characters:
+
+ STARGΛ̊TE SG-1, a = v̇ = r̈, a⃑ ⊥ b⃑
+
+Greek (in Polytonic):
+
+ The Greek anthem:
+
+ Σὲ γνωρίζω ἀπὸ τὴν κόψη
+ τοῦ σπαθιοῦ τὴν τρομερή,
+ σὲ γνωρίζω ἀπὸ τὴν ὄψη
+ ποὺ μὲ βία μετράει τὴ γῆ.
+
+ ᾿Απ᾿ τὰ κόκκαλα βγαλμένη
+ τῶν ῾Ελλήνων τὰ ἱερά
+ καὶ σὰν πρῶτα ἀνδρειωμένη
+ χαῖρε, ὦ χαῖρε, ᾿Ελευθεριά!
+
+ From a speech of Demosthenes in the 4th century BC:
+
+ Οὐχὶ ταὐτὰ παρίσταταί μοι γιγνώσκειν, ὦ ἄνδρες ᾿Αθηναῖοι,
+ ὅταν τ᾿ εἰς τὰ πράγματα ἀποβλέψω καὶ ὅταν πρὸς τοὺς
+ λόγους οὓς ἀκούω· τοὺς μὲν γὰρ λόγους περὶ τοῦ
+ τιμωρήσασθαι Φίλιππον ὁρῶ γιγνομένους, τὰ δὲ πράγματ᾿
+ εἰς τοῦτο προήκοντα, ὥσθ᾿ ὅπως μὴ πεισόμεθ᾿ αὐτοὶ
+ πρότερον κακῶς σκέψασθαι δέον. οὐδέν οὖν ἄλλο μοι δοκοῦσιν
+ οἱ τὰ τοιαῦτα λέγοντες ἢ τὴν ὑπόθεσιν, περὶ ἧς βουλεύεσθαι,
+ οὐχὶ τὴν οὖσαν παριστάντες ὑμῖν ἁμαρτάνειν. ἐγὼ δέ, ὅτι μέν
+ ποτ᾿ ἐξῆν τῇ πόλει καὶ τὰ αὑτῆς ἔχειν ἀσφαλῶς καὶ Φίλιππον
+ τιμωρήσασθαι, καὶ μάλ᾿ ἀκριβῶς οἶδα· ἐπ᾿ ἐμοῦ γάρ, οὐ πάλαι
+ γέγονεν ταῦτ᾿ ἀμφότερα· νῦν μέντοι πέπεισμαι τοῦθ᾿ ἱκανὸν
+ προλαβεῖν ἡμῖν εἶναι τὴν πρώτην, ὅπως τοὺς συμμάχους
+ σώσομεν. ἐὰν γὰρ τοῦτο βεβαίως ὑπάρξῃ, τότε καὶ περὶ τοῦ
+ τίνα τιμωρήσεταί τις καὶ ὃν τρόπον ἐξέσται σκοπεῖν· πρὶν δὲ
+ τὴν ἀρχὴν ὀρθῶς ὑποθέσθαι, μάταιον ἡγοῦμαι περὶ τῆς
+ τελευτῆς ὁντινοῦν ποιεῖσθαι λόγον.
+
+ Δημοσθένους, Γ´ ᾿Ολυνθιακὸς
+
+Georgian:
+
+ From a Unicode conference invitation:
+
+ გთხოვთ ახლავე გაიაროთ რეგისტრაცია Unicode-ის მეათე საერთაშორისო
+ კონფერენციაზე დასასწრებად, რომელიც გაიმართება 10-12 მარტს,
+ ქ. მაინცში, გერმანიაში. კონფერენცია შეჰკრებს ერთად მსოფლიოს
+ ექსპერტებს ისეთ დარგებში როგორიცაა ინტერნეტი და Unicode-ი,
+ ინტერნაციონალიზაცია და ლოკალიზაცია, Unicode-ის გამოყენება
+ ოპერაციულ სისტემებსა, და გამოყენებით პროგრამებში, შრიფტებში,
+ ტექსტების დამუშავებასა და მრავალენოვან კომპიუტერულ სისტემებში.
+
+Russian:
+
+ From a Unicode conference invitation:
+
+ Зарегистрируйтесь сейчас на Десятую Международную Конференцию по
+ Unicode, которая состоится 10-12 марта 1997 года в Майнце в Германии.
+ Конференция соберет широкий круг экспертов по вопросам глобального
+ Интернета и Unicode, локализации и интернационализации, воплощению и
+ применению Unicode в различных операционных системах и программных
+ приложениях, шрифтах, верстке и многоязычных компьютерных системах.
+
+Thai (UCS Level 2):
+
+ Excerpt from a poetry on The Romance of The Three Kingdoms (a Chinese
+ classic 'San Gua'):
+
+ [----------------------------|------------------------]
+ ๏ แผ่นดินฮั่นเสื่อมโทรมแสนสังเวช พระปกเกศกองบู๊กู้ขึ้นใหม่
+ สิบสองกษัตริย์ก่อนหน้าแลถัดไป สององค์ไซร้โง่เขลาเบาปัญญา
+ ทรงนับถือขันทีเป็นที่พึ่ง บ้านเมืองจึงวิปริตเป็นนักหนา
+ โฮจิ๋นเรียกทัพทั่วหัวเมืองมา หมายจะฆ่ามดชั่วตัวสำคัญ
+ เหมือนขับไสไล่เสือจากเคหา รับหมาป่าเข้ามาเลยอาสัญ
+ ฝ่ายอ้องอุ้นยุแยกให้แตกกัน ใช้สาวนั้นเป็นชนวนชื่นชวนใจ
+ พลันลิฉุยกุยกีกลับก่อเหตุ ช่างอาเพศจริงหนาฟ้าร้องไห้
+ ต้องรบราฆ่าฟันจนบรรลัย ฤๅหาใครค้ำชูกู้บรรลังก์ ฯ
+
+ (The above is a two-column text. If combining characters are handled
+ correctly, the lines of the second column should be aligned with the
+ | character above.)
+
+Ethiopian:
+
+ Proverbs in the Amharic language:
+
+ ሰማይ አይታረስ ንጉሥ አይከሰስ።
+ ብላ ካለኝ እንደአባቴ በቆመጠኝ።
+ ጌጥ ያለቤቱ ቁምጥና ነው።
+ ደሀ በሕልሙ ቅቤ ባይጠጣ ንጣት በገደለው።
+ የአፍ ወለምታ በቅቤ አይታሽም።
+ አይጥ በበላ ዳዋ ተመታ።
+ ሲተረጉሙ ይደረግሙ።
+ ቀስ በቀስ፥ ዕንቁላል በእግሩ ይሄዳል።
+ ድር ቢያብር አንበሳ ያስር።
+ ሰው እንደቤቱ እንጅ እንደ ጉረቤቱ አይተዳደርም።
+ እግዜር የከፈተውን ጉሮሮ ሳይዘጋው አይድርም።
+ የጎረቤት ሌባ፥ ቢያዩት ይስቅ ባያዩት ያጠልቅ።
+ ሥራ ከመፍታት ልጄን ላፋታት።
+ ዓባይ ማደሪያ የለው፥ ግንድ ይዞ ይዞራል።
+ የእስላም አገሩ መካ የአሞራ አገሩ ዋርካ።
+ ተንጋሎ ቢተፉ ተመልሶ ባፉ።
+ ወዳጅህ ማር ቢሆን ጨርስህ አትላሰው።
+ እግርህን በፍራሽህ ልክ ዘርጋ።
+
+Runes:
+
+ ᚻᛖ ᚳᚹᚫᚦ ᚦᚫᛏ ᚻᛖ ᛒᚢᛞᛖ ᚩᚾ ᚦᚫᛗ ᛚᚪᚾᛞᛖ ᚾᚩᚱᚦᚹᛖᚪᚱᛞᚢᛗ ᚹᛁᚦ ᚦᚪ ᚹᛖᛥᚫ
+
+ (Old English, which transcribed into Latin reads 'He cwaeth that he
+ bude thaem lande northweardum with tha Westsae.' and means 'He said
+ that he lived in the northern land near the Western Sea.')
+
+Braille:
+
+ ⡌⠁⠧⠑ ⠼⠁⠒ ⡍⠜⠇⠑⠹⠰⠎ ⡣⠕⠌
+
+ ⡍⠜⠇⠑⠹ ⠺⠁⠎ ⠙⠑⠁⠙⠒ ⠞⠕ ⠃⠑⠛⠔ ⠺⠊⠹⠲ ⡹⠻⠑ ⠊⠎ ⠝⠕ ⠙⠳⠃⠞
+ ⠱⠁⠞⠑⠧⠻ ⠁⠃⠳⠞ ⠹⠁⠞⠲ ⡹⠑ ⠗⠑⠛⠊⠌⠻ ⠕⠋ ⠙⠊⠎ ⠃⠥⠗⠊⠁⠇ ⠺⠁⠎
+ ⠎⠊⠛⠝⠫ ⠃⠹ ⠹⠑ ⠊⠇⠻⠛⠹⠍⠁⠝⠂ ⠹⠑ ⠊⠇⠻⠅⠂ ⠹⠑ ⠥⠝⠙⠻⠞⠁⠅⠻⠂
+ ⠁⠝⠙ ⠹⠑ ⠡⠊⠑⠋ ⠍⠳⠗⠝⠻⠲ ⡎⠊⠗⠕⠕⠛⠑ ⠎⠊⠛⠝⠫ ⠊⠞⠲ ⡁⠝⠙
+ ⡎⠊⠗⠕⠕⠛⠑⠰⠎ ⠝⠁⠍⠑ ⠺⠁⠎ ⠛⠕⠕⠙ ⠥⠏⠕⠝ ⠰⡡⠁⠝⠛⠑⠂ ⠋⠕⠗ ⠁⠝⠹⠹⠔⠛ ⠙⠑
+ ⠡⠕⠎⠑ ⠞⠕ ⠏⠥⠞ ⠙⠊⠎ ⠙⠁⠝⠙ ⠞⠕⠲
+
+ ⡕⠇⠙ ⡍⠜⠇⠑⠹ ⠺⠁⠎ ⠁⠎ ⠙⠑⠁⠙ ⠁⠎ ⠁ ⠙⠕⠕⠗⠤⠝⠁⠊⠇⠲
+
+ ⡍⠔⠙⠖ ⡊ ⠙⠕⠝⠰⠞ ⠍⠑⠁⠝ ⠞⠕ ⠎⠁⠹ ⠹⠁⠞ ⡊ ⠅⠝⠪⠂ ⠕⠋ ⠍⠹
+ ⠪⠝ ⠅⠝⠪⠇⠫⠛⠑⠂ ⠱⠁⠞ ⠹⠻⠑ ⠊⠎ ⠏⠜⠞⠊⠊⠥⠇⠜⠇⠹ ⠙⠑⠁⠙ ⠁⠃⠳⠞
+ ⠁ ⠙⠕⠕⠗⠤⠝⠁⠊⠇⠲ ⡊ ⠍⠊⠣⠞ ⠙⠁⠧⠑ ⠃⠑⠲ ⠔⠊⠇⠔⠫⠂ ⠍⠹⠎⠑⠇⠋⠂ ⠞⠕
+ ⠗⠑⠛⠜⠙ ⠁ ⠊⠕⠋⠋⠔⠤⠝⠁⠊⠇ ⠁⠎ ⠹⠑ ⠙⠑⠁⠙⠑⠌ ⠏⠊⠑⠊⠑ ⠕⠋ ⠊⠗⠕⠝⠍⠕⠝⠛⠻⠹
+ ⠔ ⠹⠑ ⠞⠗⠁⠙⠑⠲ ⡃⠥⠞ ⠹⠑ ⠺⠊⠎⠙⠕⠍ ⠕⠋ ⠳⠗ ⠁⠝⠊⠑⠌⠕⠗⠎
+ ⠊⠎ ⠔ ⠹⠑ ⠎⠊⠍⠊⠇⠑⠆ ⠁⠝⠙ ⠍⠹ ⠥⠝⠙⠁⠇⠇⠪⠫ ⠙⠁⠝⠙⠎
+ ⠩⠁⠇⠇ ⠝⠕⠞ ⠙⠊⠌⠥⠗⠃ ⠊⠞⠂ ⠕⠗ ⠹⠑ ⡊⠳⠝⠞⠗⠹⠰⠎ ⠙⠕⠝⠑ ⠋⠕⠗⠲ ⡹⠳
+ ⠺⠊⠇⠇ ⠹⠻⠑⠋⠕⠗⠑ ⠏⠻⠍⠊⠞ ⠍⠑ ⠞⠕ ⠗⠑⠏⠑⠁⠞⠂ ⠑⠍⠏⠙⠁⠞⠊⠊⠁⠇⠇⠹⠂ ⠹⠁⠞
+ ⡍⠜⠇⠑⠹ ⠺⠁⠎ ⠁⠎ ⠙⠑⠁⠙ ⠁⠎ ⠁ ⠙⠕⠕⠗⠤⠝⠁⠊⠇⠲
+
+ (The first couple of paragraphs of "A Christmas Carol" by Dickens)
+
+Compact font selection example text:
+
+ ABCDEFGHIJKLMNOPQRSTUVWXYZ /0123456789
+ abcdefghijklmnopqrstuvwxyz £©µÀÆÖÞßéöÿ
+ –—‘“”„†•…‰™œŠŸž€ ΑΒΓΔΩαβγδω АБВГДабвгд
+ ∀∂∈ℝ∧∪≡∞ ↑↗↨↻⇣ ┐┼╔╘░►☺♀ fi�⑀₂ἠḂӥẄɐː⍎אԱა
+
+Greetings in various languages:
+
+ Hello world, Καλημέρα κόσμε, コンニチハ
+
+Box drawing alignment tests: █
+ ▉
+ ╔══╦══╗ ┌──┬──┐ ╭──┬──╮ ╭──┬──╮ ┏━━┳━━┓ ┎┒┏┑ ╷ ╻ ┏┯┓ ┌┰┐ ▊ ╱╲╱╲╳╳╳
+ ║┌─╨─┐║ │╔═╧═╗│ │╒═╪═╕│ │╓─╁─╖│ ┃┌─╂─┐┃ ┗╃╄┙ ╶┼╴╺╋╸┠┼┨ ┝╋┥ ▋ ╲╱╲╱╳╳╳
+ ║│╲ ╱│║ │║ ║│ ││ │ ││ │║ ┃ ║│ ┃│ ╿ │┃ ┍╅╆┓ ╵ ╹ ┗┷┛ └┸┘ ▌ ╱╲╱╲╳╳╳
+ ╠╡ ╳ ╞╣ ├╢ ╟┤ ├┼─┼─┼┤ ├╫─╂─╫┤ ┣┿╾┼╼┿┫ ┕┛┖┚ ┌┄┄┐ ╎ ┏┅┅┓ ┋ ▍ ╲╱╲╱╳╳╳
+ ║│╱ ╲│║ │║ ║│ ││ │ ││ │║ ┃ ║│ ┃│ ╽ │┃ ░░▒▒▓▓██ ┊ ┆ ╎ ╏ ┇ ┋ ▎
+ ║└─╥─┘║ │╚═╤═╝│ │╘═╪═╛│ │╙─╀─╜│ ┃└─╂─┘┃ ░░▒▒▓▓██ ┊ ┆ ╎ ╏ ┇ ┋ ▏
+ ╚══╩══╝ └──┴──┘ ╰──┴──╯ ╰──┴──╯ ┗━━┻━━┛ ▗▄▖▛▀▜ └╌╌┘ ╎ ┗╍╍┛ ┋ ▁▂▃▄▅▆▇█
+ ▝▀▘▙▄▟
+Sentences that contain all letters commonly used in a language
+--------------------------------------------------------------
+
+Markus Kuhn -- 2001-09-02
+
+This file is UTF-8 encoded.
+
+
+Danish (da)
+---------
+
+ Quizdeltagerne spiste jordbær med fløde, mens cirkusklovnen
+ Wolther spillede på xylofon.
+ (= Quiz contestants were eating strawbery with cream while Wolther
+ the circus clown played on xylophone.)
+
+German (de)
+-----------
+
+ Falsches Üben von Xylophonmusik quält jeden größeren Zwerg
+ (= Wrongful practicing of xylophone music tortures every larger dwarf)
+
+ Zwölf Boxkämpfer jagten Eva quer über den Sylter Deich
+ (= Twelve boxing fighters hunted Eva across the dike of Sylt)
+
+ Heizölrückstoßabdämpfung
+ (= fuel oil recoil absorber)
+ (jqvwxy missing, but all non-ASCII letters in one word)
+
+English (en)
+------------
+
+ The quick brown fox jumps over the lazy dog
+
+Spanish (es)
+------------
+
+ El pingüino Wenceslao hizo kilómetros bajo exhaustiva lluvia y
+ frío, añoraba a su querido cachorro.
+ (Contains every letter and every accent, but not every combination
+ of vowel + acute.)
+
+French (fr)
+-----------
+
+ Portez ce vieux whisky au juge blond qui fume sur son île intérieure, à
+ côté de l'alcôve ovoïde, où les bûches se consument dans l'âtre, ce
+ qui lui permet de penser à la cænogenèse de l'être dont il est question
+ dans la cause ambiguë entendue à Moÿ, dans un capharnaüm qui,
+ pense-t-il, diminue çà et là la qualité de son œuvre.
+
+ l'île exiguë
+ Où l'obèse jury mûr
+ Fête l'haï volapük,
+ Âne ex aéquo au whist,
+ Ôtez ce vœu déçu.
+
+ Le cœur déçu mais l'âme plutôt naïve, Louÿs rêva de crapaüter en
+ canoë au delà des îles, près du mälström où brûlent les novæ.
+
+Irish Gaelic (ga)
+-----------------
+
+ D'fhuascail Íosa, Úrmhac na hÓighe Beannaithe, pór Éava agus Ádhaimh
+
+Hungarian (hu)
+--------------
+
+ Árvíztűrő tükörfúrógép
+ (= flood-proof mirror-drilling machine, only all non-ASCII letters)
+
+Icelandic (is)
+--------------
+
+ Kæmi ný öxi hér ykist þjófum nú bæði víl og ádrepa
+
+ Sævör grét áðan því úlpan var ónýt
+ (some ASCII letters missing)
+
+Japanese (jp)
+-------------
+
+ Hiragana: (Iroha)
+
+ いろはにほへとちりぬるを
+ わかよたれそつねならむ
+ うゐのおくやまけふこえて
+ あさきゆめみしゑひもせす
+
+ Katakana:
+
+ イロハニホヘト チリヌルヲ ワカヨタレソ ツネナラム
+ ウヰノオクヤマ ケフコエテ アサキユメミシ ヱヒモセスン
+
+Hebrew (iw)
+-----------
+
+ ? דג סקרן שט בים מאוכזב ולפתע מצא לו חברה איך הקליטה
+
+Polish (pl)
+-----------
+
+ Pchnąć w tę łódź jeża lub ośm skrzyń fig
+ (= To push a hedgehog or eight bins of figs in this boat)
+
+Russian (ru)
+------------
+
+ В чащах юга жил бы цитрус? Да, но фальшивый экземпляр!
+ (= Would a citrus live in the bushes of south? Yes, but only a fake one!)
+
+Thai (th)
+---------
+
+ [--------------------------|------------------------]
+ ๏ เป็นมนุษย์สุดประเสริฐเลิศคุณค่า กว่าบรรดาฝูงสัตว์เดรัจฉาน
+ จงฝ่าฟันพัฒนาวิชาการ อย่าล้างผลาญฤๅเข่นฆ่าบีฑาใคร
+ ไม่ถือโทษโกรธแช่งซัดฮึดฮัดด่า หัดอภัยเหมือนกีฬาอัชฌาสัย
+ ปฏิบัติประพฤติกฎกำหนดใจ พูดจาให้จ๊ะๆ จ๋าๆ น่าฟังเอย ฯ
+
+ [The copyright for the Thai example is owned by The Computer
+ Association of Thailand under the Royal Patronage of His Majesty the
+ King.]
+
+Please let me know if you find others! Special thanks to the people
+from all over the world who contributed these sentences.
+? *Unicode Transcriptions* Notes <#Notes>
+
+Glyphs | Samples
+ | Charts
+ | UTF
+ | Forms
+ |
+Home .
+
+
+Name Text Image
+Arabic (Arabic) يونِكود ?
+Arabic (Persian) یونیکُد / ?/
+Armenian Յունիկօդ
+Bengali য়ূনিকোড
+Bopomofo ㄊㄨㄥ˅ ㄧˋ ㄇㄚ˅
+ㄨㄢˋ ㄍㄨㄛˊ ㄇㄚ˅
+Braille
+Buhid
+Canadian Aboriginal ᔫᗂᑰᑦ
+Cherokee ᏳᏂᎪᏛ
+Cypriot
+Cyrillic (Russian) Юникод ?
+Deseret (English) ???????
+Devanagari (Hindi) यूनिकोड ?
+Ethiopic ዩኒኮድ
+Georgian უნიკოდი ?
+Gothic
+Greek Γιούνικοντ
+Gujarati યૂનિકોડ
+Gurmukhi ਯੂਨਿਕੋਡ
+Han (Chinese) 统一码 ?
+統一碼 ?
+万国码 ?
+萬國碼 ?
+Hangul 유니코드
+Hanunoo
+Hebrew יוניקוד
+Hebrew (pointed) יוּנִיקוׁד
+Hebrew (Yiddish) יוניקאָד ?
+Hiragana (Japanese) ゆにこおど
+Katakana (Japanese) ユニコード ?
+Kannada ಯೂನಿಕೋಡ್
+Khmer យូនីគោដ
+Lao
+Latin Unicode Unicode
+Latin (IPA <#English_Pronunciation>) ˈjunɪˌkoːd ?
+Latin (Am. Dict. <#American_Dictionary>) Ūnĭcōde̽ ?
+Limbu
+Linear B
+Malayalam യൂനികോഡ്
+Mongolian
+Myanmar
+Ogham ᚔᚒᚅᚔᚉᚑᚇ / /
+Old Italic
+Oriya ୟୂନିକୋଡ
+Osmanya
+Runic (Anglo-Saxon) ᛡᚢᚾᛁᚳᚩᛞ
+Shavian
+Sinhala යණනිකෞද්
+Syriac ܝܘܢܝܩܘܕ
+Tagbanwa
+Tagalog
+Tai Le
+Tamil யூனிகோட்
+Telugu యూనికోడ్
+Thaana
+Thai ยูนืโคด
+Tibetan (Dzongkha) ཨུ་ནི་ཀོཌྲ།
+Ugaritic
+Yi
+
+
+ Notes:
+
+There are different ways to transcribe the word “Unicode”, depending on
+the language and script. In some cases there is only one language that
+customarily uses a given script; in others there are many languages. The
+goal here is at a minimum to collect at least one transcription for each
+script in a language customarily written in that script, with more
+languages if possible. If the transcription is the same for multiple
+languages in a script, then a single representative language is used.
+
+Still missing are transcriptions for the items above in RED (in at least
+one language). I would appreciate any other transcriptions, or
+corrections for the ones listed here. Send to mark3@macchiato.com
+, using the directions below:
+
+ * *Supplying Missing Items*
+ o Most Latin-script languages will follow the spelling, and
+ change the pronunciation. For any that would not, it would
+ be good to have the alternate spelling.
+ o For non-Latin scripts the goal is to match the English
+ pronunciation — /*not*/ spelling. Above is the IPA <#IPA>
+ (in phonemic transcription) that should be matched as
+ closely as possible (without sounding affected in the target
+ language)
+ o Text would be best in either the UTF-8 text, or the code
+ points in hex HTML. E.g. either of the following:
+ + "Юникод"
+ + "Юникод"
+ + Note: for / supplementary characters/
+ ,
+ there should be one hex number per code point, not two
+ surrogates
+ :
+ # 𐀀 /*not*/ &xDC00;
+ o If you have a good font, I'd also appreciate a GIF. It
+ should be *96 x 24* bits, with the text centered, in black
+ on white (plus grays if smoothed).
+ * *Other Comments*
+ o Because some browsers won't handle the text, both text and
+ GIF image are supplied. If you can’t read the text columns,
+ see Display Problems
+ .
+ o The Chinese versions (inc. Bopomofo) are translations, not
+ transcriptions, since "transcription in Chinese is pretty
+ lame" [J. Becker].
+ o There are other "translations" of Unicode that may be in
+ use, such as the Vietnamese "Thống Nhất Mã".
+ o For sample pages in different languages on the Unicode site,
+ see What is Unicode?
+
+ o Americans are not generally used to IPA, and find a variety
+ of different systems in their dictionaries. This one leaves
+ the base letters as they are, and uses diacritics for
+ pronunciation.
+ * *Etymology of /Unicode/*
+ o Coined by J. Becker. Not related to previous usages, such as:
+ + A telegraphic code in which one word or set of letters
+ represents a sentence or phrase; a telegram or message
+ in this. (late 19th century, OED)
+ o According to my references, the prefix "uni" is directly
+ from Latin while the word "code" is through French.
+ o The original Indo-European apparently would have been
+ *oino-kau-do ("one strike give"): *kau apparently being
+ related to such English words as: hew, haggle, hoe, hag,
+ hay, hack, caudad, caudal, caudate, caudex, coda, codex,
+ codicil, coward, incus, and Kovač (personal name: "smith").
+ + I will leave the exact derivations to the exegetes,
+ but I like the association with "haggle" myself.
+ * *Contributions*
+ o This draws on contributions or comments from:
+ + Dixon Au
+ + Joe Becker
+ + Maurice Bauhahn
+ + Abel Cheung
+ + Peter Constable
+ + Michael Everson
+ + Christopher John Fynn
+ + Michael Kaplan
+ + George Kiraz
+ + Abdul Malik
+ + Siva Nataraja
+ + Roozbeh Pournader
+ + Jonathan Rosenne
+ + Jungshik Shin
+
+------------------------------------------------------------------------
+
+
+Terms of Use . Last updated:
+MED - 04/20/2003 15:30:33.
+
+
+
+
+
+UTF-8 encoded sample plain-text file
+‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾
+
+Markus Kuhn [ˈmaʳkʊs kuːn] — 2002-07-25
+
+
+The ASCII compatible UTF-8 encoding used in this plain-text file
+is defined in Unicode, ISO 10646-1, and RFC 2279.
+
+
+Using Unicode/UTF-8, you can write in emails and source code things such as
+
+Mathematics and sciences:
+
+ ∮ E⋅da = Q, n → ∞, ∑ f(i) = ∏ g(i), ⎧⎡⎛┌─────┐⎞⎤⎫
+ ⎪⎢⎜│a²+b³ ⎟⎥⎪
+ ∀x∈ℝ: ⌈x⌉ = −⌊−x⌋, α ∧ ¬β = ¬(¬α ∨ β), ⎪⎢⎜│───── ⎟⎥⎪
+ ⎪⎢⎜⎷ c₈ ⎟⎥⎪
+ ℕ ⊆ ℕ₀ ⊂ ℤ ⊂ ℚ ⊂ ℝ ⊂ ℂ, ⎨⎢⎜ ⎟⎥⎬
+ ⎪⎢⎜ ∞ ⎟⎥⎪
+ ⊥ < a ≠ b ≡ c ≤ d ≪ ⊤ ⇒ (⟦A⟧ ⇔ ⟪B⟫), ⎪⎢⎜ ⎲ ⎟⎥⎪
+ ⎪⎢⎜ ⎳aⁱ-bⁱ⎟⎥⎪
+ 2H₂ + O₂ ⇌ 2H₂O, R = 4.7 kΩ, ⌀ 200 mm ⎩⎣⎝i=1 ⎠⎦⎭
+
+Linguistics and dictionaries:
+
+ ði ıntəˈnæʃənəl fəˈnɛtık əsoʊsiˈeıʃn
+ Y [ˈʏpsilɔn], Yen [jɛn], Yoga [ˈjoːgɑ]
+
+APL:
+
+ ((V⍳V)=⍳⍴V)/V←,V ⌷←⍳→⍴∆∇⊃‾⍎⍕⌈
+
+Nicer typography in plain text files:
+
+ ╔══════════════════════════════════════════╗
+ ║ ║
+ ║ • ‘single’ and “double” quotes ║
+ ║ ║
+ ║ • Curly apostrophes: “We’ve been here” ║
+ ║ ║
+ ║ • Latin-1 apostrophe and accents: '´` ║
+ ║ ║
+ ║ • ‚deutsche‘ „Anführungszeichen“ ║
+ ║ ║
+ ║ • †, ‡, ‰, •, 3–4, —, −5/+5, ™, … ║
+ ║ ║
+ ║ • ASCII safety test: 1lI|, 0OD, 8B ║
+ ║ ╭─────────╮ ║
+ ║ • the euro symbol: │ 14.95 € │ ║
+ ║ ╰─────────╯ ║
+ ╚══════════════════════════════════════════╝
+
+Combining characters:
+
+ STARGΛ̊TE SG-1, a = v̇ = r̈, a⃑ ⊥ b⃑
+
+Greek (in Polytonic):
+
+ The Greek anthem:
+
+ Σὲ γνωρίζω ἀπὸ τὴν κόψη
+ τοῦ σπαθιοῦ τὴν τρομερή,
+ σὲ γνωρίζω ἀπὸ τὴν ὄψη
+ ποὺ μὲ βία μετράει τὴ γῆ.
+
+ ᾿Απ᾿ τὰ κόκκαλα βγαλμένη
+ τῶν ῾Ελλήνων τὰ ἱερά
+ καὶ σὰν πρῶτα ἀνδρειωμένη
+ χαῖρε, ὦ χαῖρε, ᾿Ελευθεριά!
+
+ From a speech of Demosthenes in the 4th century BC:
+
+ Οὐχὶ ταὐτὰ παρίσταταί μοι γιγνώσκειν, ὦ ἄνδρες ᾿Αθηναῖοι,
+ ὅταν τ᾿ εἰς τὰ πράγματα ἀποβλέψω καὶ ὅταν πρὸς τοὺς
+ λόγους οὓς ἀκούω· τοὺς μὲν γὰρ λόγους περὶ τοῦ
+ τιμωρήσασθαι Φίλιππον ὁρῶ γιγνομένους, τὰ δὲ πράγματ᾿
+ εἰς τοῦτο προήκοντα, ὥσθ᾿ ὅπως μὴ πεισόμεθ᾿ αὐτοὶ
+ πρότερον κακῶς σκέψασθαι δέον. οὐδέν οὖν ἄλλο μοι δοκοῦσιν
+ οἱ τὰ τοιαῦτα λέγοντες ἢ τὴν ὑπόθεσιν, περὶ ἧς βουλεύεσθαι,
+ οὐχὶ τὴν οὖσαν παριστάντες ὑμῖν ἁμαρτάνειν. ἐγὼ δέ, ὅτι μέν
+ ποτ᾿ ἐξῆν τῇ πόλει καὶ τὰ αὑτῆς ἔχειν ἀσφαλῶς καὶ Φίλιππον
+ τιμωρήσασθαι, καὶ μάλ᾿ ἀκριβῶς οἶδα· ἐπ᾿ ἐμοῦ γάρ, οὐ πάλαι
+ γέγονεν ταῦτ᾿ ἀμφότερα· νῦν μέντοι πέπεισμαι τοῦθ᾿ ἱκανὸν
+ προλαβεῖν ἡμῖν εἶναι τὴν πρώτην, ὅπως τοὺς συμμάχους
+ σώσομεν. ἐὰν γὰρ τοῦτο βεβαίως ὑπάρξῃ, τότε καὶ περὶ τοῦ
+ τίνα τιμωρήσεταί τις καὶ ὃν τρόπον ἐξέσται σκοπεῖν· πρὶν δὲ
+ τὴν ἀρχὴν ὀρθῶς ὑποθέσθαι, μάταιον ἡγοῦμαι περὶ τῆς
+ τελευτῆς ὁντινοῦν ποιεῖσθαι λόγον.
+
+ Δημοσθένους, Γ´ ᾿Ολυνθιακὸς
+
+Georgian:
+
+ From a Unicode conference invitation:
+
+ გთხოვთ ახლავე გაიაროთ რეგისტრაცია Unicode-ის მეათე საერთაშორისო
+ კონფერენციაზე დასასწრებად, რომელიც გაიმართება 10-12 მარტს,
+ ქ. მაინცში, გერმანიაში. კონფერენცია შეჰკრებს ერთად მსოფლიოს
+ ექსპერტებს ისეთ დარგებში როგორიცაა ინტერნეტი და Unicode-ი,
+ ინტერნაციონალიზაცია და ლოკალიზაცია, Unicode-ის გამოყენება
+ ოპერაციულ სისტემებსა, და გამოყენებით პროგრამებში, შრიფტებში,
+ ტექსტების დამუშავებასა და მრავალენოვან კომპიუტერულ სისტემებში.
+
+Russian:
+
+ From a Unicode conference invitation:
+
+ Зарегистрируйтесь сейчас на Десятую Международную Конференцию по
+ Unicode, которая состоится 10-12 марта 1997 года в Майнце в Германии.
+ Конференция соберет широкий круг экспертов по вопросам глобального
+ Интернета и Unicode, локализации и интернационализации, воплощению и
+ применению Unicode в различных операционных системах и программных
+ приложениях, шрифтах, верстке и многоязычных компьютерных системах.
+
+Thai (UCS Level 2):
+
+ Excerpt from a poetry on The Romance of The Three Kingdoms (a Chinese
+ classic 'San Gua'):
+
+ [----------------------------|------------------------]
+ ๏ แผ่นดินฮั่นเสื่อมโทรมแสนสังเวช พระปกเกศกองบู๊กู้ขึ้นใหม่
+ สิบสองกษัตริย์ก่อนหน้าแลถัดไป สององค์ไซร้โง่เขลาเบาปัญญา
+ ทรงนับถือขันทีเป็นที่พึ่ง บ้านเมืองจึงวิปริตเป็นนักหนา
+ โฮจิ๋นเรียกทัพทั่วหัวเมืองมา หมายจะฆ่ามดชั่วตัวสำคัญ
+ เหมือนขับไสไล่เสือจากเคหา รับหมาป่าเข้ามาเลยอาสัญ
+ ฝ่ายอ้องอุ้นยุแยกให้แตกกัน ใช้สาวนั้นเป็นชนวนชื่นชวนใจ
+ พลันลิฉุยกุยกีกลับก่อเหตุ ช่างอาเพศจริงหนาฟ้าร้องไห้
+ ต้องรบราฆ่าฟันจนบรรลัย ฤๅหาใครค้ำชูกู้บรรลังก์ ฯ
+
+ (The above is a two-column text. If combining characters are handled
+ correctly, the lines of the second column should be aligned with the
+ | character above.)
+
+Ethiopian:
+
+ Proverbs in the Amharic language:
+
+ ሰማይ አይታረስ ንጉሥ አይከሰስ።
+ ብላ ካለኝ እንደአባቴ በቆመጠኝ።
+ ጌጥ ያለቤቱ ቁምጥና ነው።
+ ደሀ በሕልሙ ቅቤ ባይጠጣ ንጣት በገደለው።
+ የአፍ ወለምታ በቅቤ አይታሽም።
+ አይጥ በበላ ዳዋ ተመታ።
+ ሲተረጉሙ ይደረግሙ።
+ ቀስ በቀስ፥ ዕንቁላል በእግሩ ይሄዳል።
+ ድር ቢያብር አንበሳ ያስር።
+ ሰው እንደቤቱ እንጅ እንደ ጉረቤቱ አይተዳደርም።
+ እግዜር የከፈተውን ጉሮሮ ሳይዘጋው አይድርም።
+ የጎረቤት ሌባ፥ ቢያዩት ይስቅ ባያዩት ያጠልቅ።
+ ሥራ ከመፍታት ልጄን ላፋታት።
+ ዓባይ ማደሪያ የለው፥ ግንድ ይዞ ይዞራል።
+ የእስላም አገሩ መካ የአሞራ አገሩ ዋርካ።
+ ተንጋሎ ቢተፉ ተመልሶ ባፉ።
+ ወዳጅህ ማር ቢሆን ጨርስህ አትላሰው።
+ እግርህን በፍራሽህ ልክ ዘርጋ።
+
+Runes:
+
+ ᚻᛖ ᚳᚹᚫᚦ ᚦᚫᛏ ᚻᛖ ᛒᚢᛞᛖ ᚩᚾ ᚦᚫᛗ ᛚᚪᚾᛞᛖ ᚾᚩᚱᚦᚹᛖᚪᚱᛞᚢᛗ ᚹᛁᚦ ᚦᚪ ᚹᛖᛥᚫ
+
+ (Old English, which transcribed into Latin reads 'He cwaeth that he
+ bude thaem lande northweardum with tha Westsae.' and means 'He said
+ that he lived in the northern land near the Western Sea.')
+
+Braille:
+
+ ⡌⠁⠧⠑ ⠼⠁⠒ ⡍⠜⠇⠑⠹⠰⠎ ⡣⠕⠌
+
+ ⡍⠜⠇⠑⠹ ⠺⠁⠎ ⠙⠑⠁⠙⠒ ⠞⠕ ⠃⠑⠛⠔ ⠺⠊⠹⠲ ⡹⠻⠑ ⠊⠎ ⠝⠕ ⠙⠳⠃⠞
+ ⠱⠁⠞⠑⠧⠻ ⠁⠃⠳⠞ ⠹⠁⠞⠲ ⡹⠑ ⠗⠑⠛⠊⠌⠻ ⠕⠋ ⠙⠊⠎ ⠃⠥⠗⠊⠁⠇ ⠺⠁⠎
+ ⠎⠊⠛⠝⠫ ⠃⠹ ⠹⠑ ⠊⠇⠻⠛⠹⠍⠁⠝⠂ ⠹⠑ ⠊⠇⠻⠅⠂ ⠹⠑ ⠥⠝⠙⠻⠞⠁⠅⠻⠂
+ ⠁⠝⠙ ⠹⠑ ⠡⠊⠑⠋ ⠍⠳⠗⠝⠻⠲ ⡎⠊⠗⠕⠕⠛⠑ ⠎⠊⠛⠝⠫ ⠊⠞⠲ ⡁⠝⠙
+ ⡎⠊⠗⠕⠕⠛⠑⠰⠎ ⠝⠁⠍⠑ ⠺⠁⠎ ⠛⠕⠕⠙ ⠥⠏⠕⠝ ⠰⡡⠁⠝⠛⠑⠂ ⠋⠕⠗ ⠁⠝⠹⠹⠔⠛ ⠙⠑
+ ⠡⠕⠎⠑ ⠞⠕ ⠏⠥⠞ ⠙⠊⠎ ⠙⠁⠝⠙ ⠞⠕⠲
+
+ ⡕⠇⠙ ⡍⠜⠇⠑⠹ ⠺⠁⠎ ⠁⠎ ⠙⠑⠁⠙ ⠁⠎ ⠁ ⠙⠕⠕⠗⠤⠝⠁⠊⠇⠲
+
+ ⡍⠔⠙⠖ ⡊ ⠙⠕⠝⠰⠞ ⠍⠑⠁⠝ ⠞⠕ ⠎⠁⠹ ⠹⠁⠞ ⡊ ⠅⠝⠪⠂ ⠕⠋ ⠍⠹
+ ⠪⠝ ⠅⠝⠪⠇⠫⠛⠑⠂ ⠱⠁⠞ ⠹⠻⠑ ⠊⠎ ⠏⠜⠞⠊⠊⠥⠇⠜⠇⠹ ⠙⠑⠁⠙ ⠁⠃⠳⠞
+ ⠁ ⠙⠕⠕⠗⠤⠝⠁⠊⠇⠲ ⡊ ⠍⠊⠣⠞ ⠙⠁⠧⠑ ⠃⠑⠲ ⠔⠊⠇⠔⠫⠂ ⠍⠹⠎⠑⠇⠋⠂ ⠞⠕
+ ⠗⠑⠛⠜⠙ ⠁ ⠊⠕⠋⠋⠔⠤⠝⠁⠊⠇ ⠁⠎ ⠹⠑ ⠙⠑⠁⠙⠑⠌ ⠏⠊⠑⠊⠑ ⠕⠋ ⠊⠗⠕⠝⠍⠕⠝⠛⠻⠹
+ ⠔ ⠹⠑ ⠞⠗⠁⠙⠑⠲ ⡃⠥⠞ ⠹⠑ ⠺⠊⠎⠙⠕⠍ ⠕⠋ ⠳⠗ ⠁⠝⠊⠑⠌⠕⠗⠎
+ ⠊⠎ ⠔ ⠹⠑ ⠎⠊⠍⠊⠇⠑⠆ ⠁⠝⠙ ⠍⠹ ⠥⠝⠙⠁⠇⠇⠪⠫ ⠙⠁⠝⠙⠎
+ ⠩⠁⠇⠇ ⠝⠕⠞ ⠙⠊⠌⠥⠗⠃ ⠊⠞⠂ ⠕⠗ ⠹⠑ ⡊⠳⠝⠞⠗⠹⠰⠎ ⠙⠕⠝⠑ ⠋⠕⠗⠲ ⡹⠳
+ ⠺⠊⠇⠇ ⠹⠻⠑⠋⠕⠗⠑ ⠏⠻⠍⠊⠞ ⠍⠑ ⠞⠕ ⠗⠑⠏⠑⠁⠞⠂ ⠑⠍⠏⠙⠁⠞⠊⠊⠁⠇⠇⠹⠂ ⠹⠁⠞
+ ⡍⠜⠇⠑⠹ ⠺⠁⠎ ⠁⠎ ⠙⠑⠁⠙ ⠁⠎ ⠁ ⠙⠕⠕⠗⠤⠝⠁⠊⠇⠲
+
+ (The first couple of paragraphs of "A Christmas Carol" by Dickens)
+
+Compact font selection example text:
+
+ ABCDEFGHIJKLMNOPQRSTUVWXYZ /0123456789
+ abcdefghijklmnopqrstuvwxyz £©µÀÆÖÞßéöÿ
+ –—‘“”„†•…‰™œŠŸž€ ΑΒΓΔΩαβγδω АБВГДабвгд
+ ∀∂∈ℝ∧∪≡∞ ↑↗↨↻⇣ ┐┼╔╘░►☺♀ fi�⑀₂ἠḂӥẄɐː⍎אԱა
+
+Greetings in various languages:
+
+ Hello world, Καλημέρα κόσμε, コンニチハ
+
+Box drawing alignment tests: █
+ ▉
+ ╔══╦══╗ ┌──┬──┐ ╭──┬──╮ ╭──┬──╮ ┏━━┳━━┓ ┎┒┏┑ ╷ ╻ ┏┯┓ ┌┰┐ ▊ ╱╲╱╲╳╳╳
+ ║┌─╨─┐║ │╔═╧═╗│ │╒═╪═╕│ │╓─╁─╖│ ┃┌─╂─┐┃ ┗╃╄┙ ╶┼╴╺╋╸┠┼┨ ┝╋┥ ▋ ╲╱╲╱╳╳╳
+ ║│╲ ╱│║ │║ ║│ ││ │ ││ │║ ┃ ║│ ┃│ ╿ │┃ ┍╅╆┓ ╵ ╹ ┗┷┛ └┸┘ ▌ ╱╲╱╲╳╳╳
+ ╠╡ ╳ ╞╣ ├╢ ╟┤ ├┼─┼─┼┤ ├╫─╂─╫┤ ┣┿╾┼╼┿┫ ┕┛┖┚ ┌┄┄┐ ╎ ┏┅┅┓ ┋ ▍ ╲╱╲╱╳╳╳
+ ║│╱ ╲│║ │║ ║│ ││ │ ││ │║ ┃ ║│ ┃│ ╽ │┃ ░░▒▒▓▓██ ┊ ┆ ╎ ╏ ┇ ┋ ▎
+ ║└─╥─┘║ │╚═╤═╝│ │╘═╪═╛│ │╙─╀─╜│ ┃└─╂─┘┃ ░░▒▒▓▓██ ┊ ┆ ╎ ╏ ┇ ┋ ▏
+ ╚══╩══╝ └──┴──┘ ╰──┴──╯ ╰──┴──╯ ┗━━┻━━┛ ▗▄▖▛▀▜ └╌╌┘ ╎ ┗╍╍┛ ┋ ▁▂▃▄▅▆▇█
+ ▝▀▘▙▄▟
+Sentences that contain all letters commonly used in a language
+--------------------------------------------------------------
+
+Markus Kuhn -- 2001-09-02
+
+This file is UTF-8 encoded.
+
+
+Danish (da)
+---------
+
+ Quizdeltagerne spiste jordbær med fløde, mens cirkusklovnen
+ Wolther spillede på xylofon.
+ (= Quiz contestants were eating strawbery with cream while Wolther
+ the circus clown played on xylophone.)
+
+German (de)
+-----------
+
+ Falsches Üben von Xylophonmusik quält jeden größeren Zwerg
+ (= Wrongful practicing of xylophone music tortures every larger dwarf)
+
+ Zwölf Boxkämpfer jagten Eva quer über den Sylter Deich
+ (= Twelve boxing fighters hunted Eva across the dike of Sylt)
+
+ Heizölrückstoßabdämpfung
+ (= fuel oil recoil absorber)
+ (jqvwxy missing, but all non-ASCII letters in one word)
+
+English (en)
+------------
+
+ The quick brown fox jumps over the lazy dog
+
+Spanish (es)
+------------
+
+ El pingüino Wenceslao hizo kilómetros bajo exhaustiva lluvia y
+ frío, añoraba a su querido cachorro.
+ (Contains every letter and every accent, but not every combination
+ of vowel + acute.)
+
+French (fr)
+-----------
+
+ Portez ce vieux whisky au juge blond qui fume sur son île intérieure, à
+ côté de l'alcôve ovoïde, où les bûches se consument dans l'âtre, ce
+ qui lui permet de penser à la cænogenèse de l'être dont il est question
+ dans la cause ambiguë entendue à Moÿ, dans un capharnaüm qui,
+ pense-t-il, diminue çà et là la qualité de son œuvre.
+
+ l'île exiguë
+ Où l'obèse jury mûr
+ Fête l'haï volapük,
+ Âne ex aéquo au whist,
+ Ôtez ce vœu déçu.
+
+ Le cœur déçu mais l'âme plutôt naïve, Louÿs rêva de crapaüter en
+ canoë au delà des îles, près du mälström où brûlent les novæ.
+
+Irish Gaelic (ga)
+-----------------
+
+ D'fhuascail Íosa, Úrmhac na hÓighe Beannaithe, pór Éava agus Ádhaimh
+
+Hungarian (hu)
+--------------
+
+ Árvíztűrő tükörfúrógép
+ (= flood-proof mirror-drilling machine, only all non-ASCII letters)
+
+Icelandic (is)
+--------------
+
+ Kæmi ný öxi hér ykist þjófum nú bæði víl og ádrepa
+
+ Sævör grét áðan því úlpan var ónýt
+ (some ASCII letters missing)
+
+Japanese (jp)
+-------------
+
+ Hiragana: (Iroha)
+
+ いろはにほへとちりぬるを
+ わかよたれそつねならむ
+ うゐのおくやまけふこえて
+ あさきゆめみしゑひもせす
+
+ Katakana:
+
+ イロハニホヘト チリヌルヲ ワカヨタレソ ツネナラム
+ ウヰノオクヤマ ケフコエテ アサキユメミシ ヱヒモセスン
+
+Hebrew (iw)
+-----------
+
+ ? דג סקרן שט בים מאוכזב ולפתע מצא לו חברה איך הקליטה
+
+Polish (pl)
+-----------
+
+ Pchnąć w tę łódź jeża lub ośm skrzyń fig
+ (= To push a hedgehog or eight bins of figs in this boat)
+
+Russian (ru)
+------------
+
+ В чащах юга жил бы цитрус? Да, но фальшивый экземпляр!
+ (= Would a citrus live in the bushes of south? Yes, but only a fake one!)
+
+Thai (th)
+---------
+
+ [--------------------------|------------------------]
+ ๏ เป็นมนุษย์สุดประเสริฐเลิศคุณค่า กว่าบรรดาฝูงสัตว์เดรัจฉาน
+ จงฝ่าฟันพัฒนาวิชาการ อย่าล้างผลาญฤๅเข่นฆ่าบีฑาใคร
+ ไม่ถือโทษโกรธแช่งซัดฮึดฮัดด่า หัดอภัยเหมือนกีฬาอัชฌาสัย
+ ปฏิบัติประพฤติกฎกำหนดใจ พูดจาให้จ๊ะๆ จ๋าๆ น่าฟังเอย ฯ
+
+ [The copyright for the Thai example is owned by The Computer
+ Association of Thailand under the Royal Patronage of His Majesty the
+ King.]
+
+Please let me know if you find others! Special thanks to the people
+from all over the world who contributed these sentences.
+? *Unicode Transcriptions* Notes <#Notes>
+
+Glyphs | Samples
+ | Charts
+ | UTF
+ | Forms
+ |
+Home .
+
+
+Name Text Image
+Arabic (Arabic) يونِكود ?
+Arabic (Persian) یونیکُد / ?/
+Armenian Յունիկօդ
+Bengali য়ূনিকোড
+Bopomofo ㄊㄨㄥ˅ ㄧˋ ㄇㄚ˅
+ㄨㄢˋ ㄍㄨㄛˊ ㄇㄚ˅
+Braille
+Buhid
+Canadian Aboriginal ᔫᗂᑰᑦ
+Cherokee ᏳᏂᎪᏛ
+Cypriot
+Cyrillic (Russian) Юникод ?
+Deseret (English) ???????
+Devanagari (Hindi) यूनिकोड ?
+Ethiopic ዩኒኮድ
+Georgian უნიკოდი ?
+Gothic
+Greek Γιούνικοντ
+Gujarati યૂનિકોડ
+Gurmukhi ਯੂਨਿਕੋਡ
+Han (Chinese) 统一码 ?
+統一碼 ?
+万国码 ?
+萬國碼 ?
+Hangul 유니코드
+Hanunoo
+Hebrew יוניקוד
+Hebrew (pointed) יוּנִיקוׁד
+Hebrew (Yiddish) יוניקאָד ?
+Hiragana (Japanese) ゆにこおど
+Katakana (Japanese) ユニコード ?
+Kannada ಯೂನಿಕೋಡ್
+Khmer យូនីគោដ
+Lao
+Latin Unicode Unicode
+Latin (IPA <#English_Pronunciation>) ˈjunɪˌkoːd ?
+Latin (Am. Dict. <#American_Dictionary>) Ūnĭcōde̽ ?
+Limbu
+Linear B
+Malayalam യൂനികോഡ്
+Mongolian
+Myanmar
+Ogham ᚔᚒᚅᚔᚉᚑᚇ / /
+Old Italic
+Oriya ୟୂନିକୋଡ
+Osmanya
+Runic (Anglo-Saxon) ᛡᚢᚾᛁᚳᚩᛞ
+Shavian
+Sinhala යණනිකෞද්
+Syriac ܝܘܢܝܩܘܕ
+Tagbanwa
+Tagalog
+Tai Le
+Tamil யூனிகோட்
+Telugu యూనికోడ్
+Thaana
+Thai ยูนืโคด
+Tibetan (Dzongkha) ཨུ་ནི་ཀོཌྲ།
+Ugaritic
+Yi
+
+
+ Notes:
+
+There are different ways to transcribe the word “Unicode”, depending on
+the language and script. In some cases there is only one language that
+customarily uses a given script; in others there are many languages. The
+goal here is at a minimum to collect at least one transcription for each
+script in a language customarily written in that script, with more
+languages if possible. If the transcription is the same for multiple
+languages in a script, then a single representative language is used.
+
+Still missing are transcriptions for the items above in RED (in at least
+one language). I would appreciate any other transcriptions, or
+corrections for the ones listed here. Send to mark3@macchiato.com
+, using the directions below:
+
+ * *Supplying Missing Items*
+ o Most Latin-script languages will follow the spelling, and
+ change the pronunciation. For any that would not, it would
+ be good to have the alternate spelling.
+ o For non-Latin scripts the goal is to match the English
+ pronunciation — /*not*/ spelling. Above is the IPA <#IPA>
+ (in phonemic transcription) that should be matched as
+ closely as possible (without sounding affected in the target
+ language)
+ o Text would be best in either the UTF-8 text, or the code
+ points in hex HTML. E.g. either of the following:
+ + "Юникод"
+ + "Юникод"
+ + Note: for / supplementary characters/
+ ,
+ there should be one hex number per code point, not two
+ surrogates
+ :
+ # 𐀀 /*not*/ &xDC00;
+ o If you have a good font, I'd also appreciate a GIF. It
+ should be *96 x 24* bits, with the text centered, in black
+ on white (plus grays if smoothed).
+ * *Other Comments*
+ o Because some browsers won't handle the text, both text and
+ GIF image are supplied. If you can’t read the text columns,
+ see Display Problems
+ .
+ o The Chinese versions (inc. Bopomofo) are translations, not
+ transcriptions, since "transcription in Chinese is pretty
+ lame" [J. Becker].
+ o There are other "translations" of Unicode that may be in
+ use, such as the Vietnamese "Thống Nhất Mã".
+ o For sample pages in different languages on the Unicode site,
+ see What is Unicode?
+
+ o Americans are not generally used to IPA, and find a variety
+ of different systems in their dictionaries. This one leaves
+ the base letters as they are, and uses diacritics for
+ pronunciation.
+ * *Etymology of /Unicode/*
+ o Coined by J. Becker. Not related to previous usages, such as:
+ + A telegraphic code in which one word or set of letters
+ represents a sentence or phrase; a telegram or message
+ in this. (late 19th century, OED)
+ o According to my references, the prefix "uni" is directly
+ from Latin while the word "code" is through French.
+ o The original Indo-European apparently would have been
+ *oino-kau-do ("one strike give"): *kau apparently being
+ related to such English words as: hew, haggle, hoe, hag,
+ hay, hack, caudad, caudal, caudate, caudex, coda, codex,
+ codicil, coward, incus, and Kovač (personal name: "smith").
+ + I will leave the exact derivations to the exegetes,
+ but I like the association with "haggle" myself.
+ * *Contributions*
+ o This draws on contributions or comments from:
+ + Dixon Au
+ + Joe Becker
+ + Maurice Bauhahn
+ + Abel Cheung
+ + Peter Constable
+ + Michael Everson
+ + Christopher John Fynn
+ + Michael Kaplan
+ + George Kiraz
+ + Abdul Malik
+ + Siva Nataraja
+ + Roozbeh Pournader
+ + Jonathan Rosenne
+ + Jungshik Shin
+
+------------------------------------------------------------------------
+
+
+Terms of Use . Last updated:
+MED - 04/20/2003 15:30:33.
+
+
+
+
+
+UTF-8 encoded sample plain-text file
+‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾
+
+Markus Kuhn [ˈmaʳkʊs kuːn] — 2002-07-25
+
+
+The ASCII compatible UTF-8 encoding used in this plain-text file
+is defined in Unicode, ISO 10646-1, and RFC 2279.
+
+
+Using Unicode/UTF-8, you can write in emails and source code things such as
+
+Mathematics and sciences:
+
+ ∮ E⋅da = Q, n → ∞, ∑ f(i) = ∏ g(i), ⎧⎡⎛┌─────┐⎞⎤⎫
+ ⎪⎢⎜│a²+b³ ⎟⎥⎪
+ ∀x∈ℝ: ⌈x⌉ = −⌊−x⌋, α ∧ ¬β = ¬(¬α ∨ β), ⎪⎢⎜│───── ⎟⎥⎪
+ ⎪⎢⎜⎷ c₈ ⎟⎥⎪
+ ℕ ⊆ ℕ₀ ⊂ ℤ ⊂ ℚ ⊂ ℝ ⊂ ℂ, ⎨⎢⎜ ⎟⎥⎬
+ ⎪⎢⎜ ∞ ⎟⎥⎪
+ ⊥ < a ≠ b ≡ c ≤ d ≪ ⊤ ⇒ (⟦A⟧ ⇔ ⟪B⟫), ⎪⎢⎜ ⎲ ⎟⎥⎪
+ ⎪⎢⎜ ⎳aⁱ-bⁱ⎟⎥⎪
+ 2H₂ + O₂ ⇌ 2H₂O, R = 4.7 kΩ, ⌀ 200 mm ⎩⎣⎝i=1 ⎠⎦⎭
+
+Linguistics and dictionaries:
+
+ ði ıntəˈnæʃənəl fəˈnɛtık əsoʊsiˈeıʃn
+ Y [ˈʏpsilɔn], Yen [jɛn], Yoga [ˈjoːgɑ]
+
+APL:
+
+ ((V⍳V)=⍳⍴V)/V←,V ⌷←⍳→⍴∆∇⊃‾⍎⍕⌈
+
+Nicer typography in plain text files:
+
+ ╔══════════════════════════════════════════╗
+ ║ ║
+ ║ • ‘single’ and “double” quotes ║
+ ║ ║
+ ║ • Curly apostrophes: “We’ve been here” ║
+ ║ ║
+ ║ • Latin-1 apostrophe and accents: '´` ║
+ ║ ║
+ ║ • ‚deutsche‘ „Anführungszeichen“ ║
+ ║ ║
+ ║ • †, ‡, ‰, •, 3–4, —, −5/+5, ™, … ║
+ ║ ║
+ ║ • ASCII safety test: 1lI|, 0OD, 8B ║
+ ║ ╭─────────╮ ║
+ ║ • the euro symbol: │ 14.95 € │ ║
+ ║ ╰─────────╯ ║
+ ╚══════════════════════════════════════════╝
+
+Combining characters:
+
+ STARGΛ̊TE SG-1, a = v̇ = r̈, a⃑ ⊥ b⃑
+
+Greek (in Polytonic):
+
+ The Greek anthem:
+
+ Σὲ γνωρίζω ἀπὸ τὴν κόψη
+ τοῦ σπαθιοῦ τὴν τρομερή,
+ σὲ γνωρίζω ἀπὸ τὴν ὄψη
+ ποὺ μὲ βία μετράει τὴ γῆ.
+
+ ᾿Απ᾿ τὰ κόκκαλα βγαλμένη
+ τῶν ῾Ελλήνων τὰ ἱερά
+ καὶ σὰν πρῶτα ἀνδρειωμένη
+ χαῖρε, ὦ χαῖρε, ᾿Ελευθεριά!
+
+ From a speech of Demosthenes in the 4th century BC:
+
+ Οὐχὶ ταὐτὰ παρίσταταί μοι γιγνώσκειν, ὦ ἄνδρες ᾿Αθηναῖοι,
+ ὅταν τ᾿ εἰς τὰ πράγματα ἀποβλέψω καὶ ὅταν πρὸς τοὺς
+ λόγους οὓς ἀκούω· τοὺς μὲν γὰρ λόγους περὶ τοῦ
+ τιμωρήσασθαι Φίλιππον ὁρῶ γιγνομένους, τὰ δὲ πράγματ᾿
+ εἰς τοῦτο προήκοντα, ὥσθ᾿ ὅπως μὴ πεισόμεθ᾿ αὐτοὶ
+ πρότερον κακῶς σκέψασθαι δέον. οὐδέν οὖν ἄλλο μοι δοκοῦσιν
+ οἱ τὰ τοιαῦτα λέγοντες ἢ τὴν ὑπόθεσιν, περὶ ἧς βουλεύεσθαι,
+ οὐχὶ τὴν οὖσαν παριστάντες ὑμῖν ἁμαρτάνειν. ἐγὼ δέ, ὅτι μέν
+ ποτ᾿ ἐξῆν τῇ πόλει καὶ τὰ αὑτῆς ἔχειν ἀσφαλῶς καὶ Φίλιππον
+ τιμωρήσασθαι, καὶ μάλ᾿ ἀκριβῶς οἶδα· ἐπ᾿ ἐμοῦ γάρ, οὐ πάλαι
+ γέγονεν ταῦτ᾿ ἀμφότερα· νῦν μέντοι πέπεισμαι τοῦθ᾿ ἱκανὸν
+ προλαβεῖν ἡμῖν εἶναι τὴν πρώτην, ὅπως τοὺς συμμάχους
+ σώσομεν. ἐὰν γὰρ τοῦτο βεβαίως ὑπάρξῃ, τότε καὶ περὶ τοῦ
+ τίνα τιμωρήσεταί τις καὶ ὃν τρόπον ἐξέσται σκοπεῖν· πρὶν δὲ
+ τὴν ἀρχὴν ὀρθῶς ὑποθέσθαι, μάταιον ἡγοῦμαι περὶ τῆς
+ τελευτῆς ὁντινοῦν ποιεῖσθαι λόγον.
+
+ Δημοσθένους, Γ´ ᾿Ολυνθιακὸς
+
+Georgian:
+
+ From a Unicode conference invitation:
+
+ გთხოვთ ახლავე გაიაროთ რეგისტრაცია Unicode-ის მეათე საერთაშორისო
+ კონფერენციაზე დასასწრებად, რომელიც გაიმართება 10-12 მარტს,
+ ქ. მაინცში, გერმანიაში. კონფერენცია შეჰკრებს ერთად მსოფლიოს
+ ექსპერტებს ისეთ დარგებში როგორიცაა ინტერნეტი და Unicode-ი,
+ ინტერნაციონალიზაცია და ლოკალიზაცია, Unicode-ის გამოყენება
+ ოპერაციულ სისტემებსა, და გამოყენებით პროგრამებში, შრიფტებში,
+ ტექსტების დამუშავებასა და მრავალენოვან კომპიუტერულ სისტემებში.
+
+Russian:
+
+ From a Unicode conference invitation:
+
+ Зарегистрируйтесь сейчас на Десятую Международную Конференцию по
+ Unicode, которая состоится 10-12 марта 1997 года в Майнце в Германии.
+ Конференция соберет широкий круг экспертов по вопросам глобального
+ Интернета и Unicode, локализации и интернационализации, воплощению и
+ применению Unicode в различных операционных системах и программных
+ приложениях, шрифтах, верстке и многоязычных компьютерных системах.
+
+Thai (UCS Level 2):
+
+ Excerpt from a poetry on The Romance of The Three Kingdoms (a Chinese
+ classic 'San Gua'):
+
+ [----------------------------|------------------------]
+ ๏ แผ่นดินฮั่นเสื่อมโทรมแสนสังเวช พระปกเกศกองบู๊กู้ขึ้นใหม่
+ สิบสองกษัตริย์ก่อนหน้าแลถัดไป สององค์ไซร้โง่เขลาเบาปัญญา
+ ทรงนับถือขันทีเป็นที่พึ่ง บ้านเมืองจึงวิปริตเป็นนักหนา
+ โฮจิ๋นเรียกทัพทั่วหัวเมืองมา หมายจะฆ่ามดชั่วตัวสำคัญ
+ เหมือนขับไสไล่เสือจากเคหา รับหมาป่าเข้ามาเลยอาสัญ
+ ฝ่ายอ้องอุ้นยุแยกให้แตกกัน ใช้สาวนั้นเป็นชนวนชื่นชวนใจ
+ พลันลิฉุยกุยกีกลับก่อเหตุ ช่างอาเพศจริงหนาฟ้าร้องไห้
+ ต้องรบราฆ่าฟันจนบรรลัย ฤๅหาใครค้ำชูกู้บรรลังก์ ฯ
+
+ (The above is a two-column text. If combining characters are handled
+ correctly, the lines of the second column should be aligned with the
+ | character above.)
+
+Ethiopian:
+
+ Proverbs in the Amharic language:
+
+ ሰማይ አይታረስ ንጉሥ አይከሰስ።
+ ብላ ካለኝ እንደአባቴ በቆመጠኝ።
+ ጌጥ ያለቤቱ ቁምጥና ነው።
+ ደሀ በሕልሙ ቅቤ ባይጠጣ ንጣት በገደለው።
+ የአፍ ወለምታ በቅቤ አይታሽም።
+ አይጥ በበላ ዳዋ ተመታ።
+ ሲተረጉሙ ይደረግሙ።
+ ቀስ በቀስ፥ ዕንቁላል በእግሩ ይሄዳል።
+ ድር ቢያብር አንበሳ ያስር።
+ ሰው እንደቤቱ እንጅ እንደ ጉረቤቱ አይተዳደርም።
+ እግዜር የከፈተውን ጉሮሮ ሳይዘጋው አይድርም።
+ የጎረቤት ሌባ፥ ቢያዩት ይስቅ ባያዩት ያጠልቅ።
+ ሥራ ከመፍታት ልጄን ላፋታት።
+ ዓባይ ማደሪያ የለው፥ ግንድ ይዞ ይዞራል።
+ የእስላም አገሩ መካ የአሞራ አገሩ ዋርካ።
+ ተንጋሎ ቢተፉ ተመልሶ ባፉ።
+ ወዳጅህ ማር ቢሆን ጨርስህ አትላሰው።
+ እግርህን በፍራሽህ ልክ ዘርጋ።
+
+Runes:
+
+ ᚻᛖ ᚳᚹᚫᚦ ᚦᚫᛏ ᚻᛖ ᛒᚢᛞᛖ ᚩᚾ ᚦᚫᛗ ᛚᚪᚾᛞᛖ ᚾᚩᚱᚦᚹᛖᚪᚱᛞᚢᛗ ᚹᛁᚦ ᚦᚪ ᚹᛖᛥᚫ
+
+ (Old English, which transcribed into Latin reads 'He cwaeth that he
+ bude thaem lande northweardum with tha Westsae.' and means 'He said
+ that he lived in the northern land near the Western Sea.')
+
+Braille:
+
+ ⡌⠁⠧⠑ ⠼⠁⠒ ⡍⠜⠇⠑⠹⠰⠎ ⡣⠕⠌
+
+ ⡍⠜⠇⠑⠹ ⠺⠁⠎ ⠙⠑⠁⠙⠒ ⠞⠕ ⠃⠑⠛⠔ ⠺⠊⠹⠲ ⡹⠻⠑ ⠊⠎ ⠝⠕ ⠙⠳⠃⠞
+ ⠱⠁⠞⠑⠧⠻ ⠁⠃⠳⠞ ⠹⠁⠞⠲ ⡹⠑ ⠗⠑⠛⠊⠌⠻ ⠕⠋ ⠙⠊⠎ ⠃⠥⠗⠊⠁⠇ ⠺⠁⠎
+ ⠎⠊⠛⠝⠫ ⠃⠹ ⠹⠑ ⠊⠇⠻⠛⠹⠍⠁⠝⠂ ⠹⠑ ⠊⠇⠻⠅⠂ ⠹⠑ ⠥⠝⠙⠻⠞⠁⠅⠻⠂
+ ⠁⠝⠙ ⠹⠑ ⠡⠊⠑⠋ ⠍⠳⠗⠝⠻⠲ ⡎⠊⠗⠕⠕⠛⠑ ⠎⠊⠛⠝⠫ ⠊⠞⠲ ⡁⠝⠙
+ ⡎⠊⠗⠕⠕⠛⠑⠰⠎ ⠝⠁⠍⠑ ⠺⠁⠎ ⠛⠕⠕⠙ ⠥⠏⠕⠝ ⠰⡡⠁⠝⠛⠑⠂ ⠋⠕⠗ ⠁⠝⠹⠹⠔⠛ ⠙⠑
+ ⠡⠕⠎⠑ ⠞⠕ ⠏⠥⠞ ⠙⠊⠎ ⠙⠁⠝⠙ ⠞⠕⠲
+
+ ⡕⠇⠙ ⡍⠜⠇⠑⠹ ⠺⠁⠎ ⠁⠎ ⠙⠑⠁⠙ ⠁⠎ ⠁ ⠙⠕⠕⠗⠤⠝⠁⠊⠇⠲
+
+ ⡍⠔⠙⠖ ⡊ ⠙⠕⠝⠰⠞ ⠍⠑⠁⠝ ⠞⠕ ⠎⠁⠹ ⠹⠁⠞ ⡊ ⠅⠝⠪⠂ ⠕⠋ ⠍⠹
+ ⠪⠝ ⠅⠝⠪⠇⠫⠛⠑⠂ ⠱⠁⠞ ⠹⠻⠑ ⠊⠎ ⠏⠜⠞⠊⠊⠥⠇⠜⠇⠹ ⠙⠑⠁⠙ ⠁⠃⠳⠞
+ ⠁ ⠙⠕⠕⠗⠤⠝⠁⠊⠇⠲ ⡊ ⠍⠊⠣⠞ ⠙⠁⠧⠑ ⠃⠑⠲ ⠔⠊⠇⠔⠫⠂ ⠍⠹⠎⠑⠇⠋⠂ ⠞⠕
+ ⠗⠑⠛⠜⠙ ⠁ ⠊⠕⠋⠋⠔⠤⠝⠁⠊⠇ ⠁⠎ ⠹⠑ ⠙⠑⠁⠙⠑⠌ ⠏⠊⠑⠊⠑ ⠕⠋ ⠊⠗⠕⠝⠍⠕⠝⠛⠻⠹
+ ⠔ ⠹⠑ ⠞⠗⠁⠙⠑⠲ ⡃⠥⠞ ⠹⠑ ⠺⠊⠎⠙⠕⠍ ⠕⠋ ⠳⠗ ⠁⠝⠊⠑⠌⠕⠗⠎
+ ⠊⠎ ⠔ ⠹⠑ ⠎⠊⠍⠊⠇⠑⠆ ⠁⠝⠙ ⠍⠹ ⠥⠝⠙⠁⠇⠇⠪⠫ ⠙⠁⠝⠙⠎
+ ⠩⠁⠇⠇ ⠝⠕⠞ ⠙⠊⠌⠥⠗⠃ ⠊⠞⠂ ⠕⠗ ⠹⠑ ⡊⠳⠝⠞⠗⠹⠰⠎ ⠙⠕⠝⠑ ⠋⠕⠗⠲ ⡹⠳
+ ⠺⠊⠇⠇ ⠹⠻⠑⠋⠕⠗⠑ ⠏⠻⠍⠊⠞ ⠍⠑ ⠞⠕ ⠗⠑⠏⠑⠁⠞⠂ ⠑⠍⠏⠙⠁⠞⠊⠊⠁⠇⠇⠹⠂ ⠹⠁⠞
+ ⡍⠜⠇⠑⠹ ⠺⠁⠎ ⠁⠎ ⠙⠑⠁⠙ ⠁⠎ ⠁ ⠙⠕⠕⠗⠤⠝⠁⠊⠇⠲
+
+ (The first couple of paragraphs of "A Christmas Carol" by Dickens)
+
+Compact font selection example text:
+
+ ABCDEFGHIJKLMNOPQRSTUVWXYZ /0123456789
+ abcdefghijklmnopqrstuvwxyz £©µÀÆÖÞßéöÿ
+ –—‘“”„†•…‰™œŠŸž€ ΑΒΓΔΩαβγδω АБВГДабвгд
+ ∀∂∈ℝ∧∪≡∞ ↑↗↨↻⇣ ┐┼╔╘░►☺♀ fi�⑀₂ἠḂӥẄɐː⍎אԱა
+
+Greetings in various languages:
+
+ Hello world, Καλημέρα κόσμε, コンニチハ
+
+Box drawing alignment tests: █
+ ▉
+ ╔══╦══╗ ┌──┬──┐ ╭──┬──╮ ╭──┬──╮ ┏━━┳━━┓ ┎┒┏┑ ╷ ╻ ┏┯┓ ┌┰┐ ▊ ╱╲╱╲╳╳╳
+ ║┌─╨─┐║ │╔═╧═╗│ │╒═╪═╕│ │╓─╁─╖│ ┃┌─╂─┐┃ ┗╃╄┙ ╶┼╴╺╋╸┠┼┨ ┝╋┥ ▋ ╲╱╲╱╳╳╳
+ ║│╲ ╱│║ │║ ║│ ││ │ ││ │║ ┃ ║│ ┃│ ╿ │┃ ┍╅╆┓ ╵ ╹ ┗┷┛ └┸┘ ▌ ╱╲╱╲╳╳╳
+ ╠╡ ╳ ╞╣ ├╢ ╟┤ ├┼─┼─┼┤ ├╫─╂─╫┤ ┣┿╾┼╼┿┫ ┕┛┖┚ ┌┄┄┐ ╎ ┏┅┅┓ ┋ ▍ ╲╱╲╱╳╳╳
+ ║│╱ ╲│║ │║ ║│ ││ │ ││ │║ ┃ ║│ ┃│ ╽ │┃ ░░▒▒▓▓██ ┊ ┆ ╎ ╏ ┇ ┋ ▎
+ ║└─╥─┘║ │╚═╤═╝│ │╘═╪═╛│ │╙─╀─╜│ ┃└─╂─┘┃ ░░▒▒▓▓██ ┊ ┆ ╎ ╏ ┇ ┋ ▏
+ ╚══╩══╝ └──┴──┘ ╰──┴──╯ ╰──┴──╯ ┗━━┻━━┛ ▗▄▖▛▀▜ └╌╌┘ ╎ ┗╍╍┛ ┋ ▁▂▃▄▅▆▇█
+ ▝▀▘▙▄▟
+Sentences that contain all letters commonly used in a language
+--------------------------------------------------------------
+
+Markus Kuhn -- 2001-09-02
+
+This file is UTF-8 encoded.
+
+
+Danish (da)
+---------
+
+ Quizdeltagerne spiste jordbær med fløde, mens cirkusklovnen
+ Wolther spillede på xylofon.
+ (= Quiz contestants were eating strawbery with cream while Wolther
+ the circus clown played on xylophone.)
+
+German (de)
+-----------
+
+ Falsches Üben von Xylophonmusik quält jeden größeren Zwerg
+ (= Wrongful practicing of xylophone music tortures every larger dwarf)
+
+ Zwölf Boxkämpfer jagten Eva quer über den Sylter Deich
+ (= Twelve boxing fighters hunted Eva across the dike of Sylt)
+
+ Heizölrückstoßabdämpfung
+ (= fuel oil recoil absorber)
+ (jqvwxy missing, but all non-ASCII letters in one word)
+
+English (en)
+------------
+
+ The quick brown fox jumps over the lazy dog
+
+Spanish (es)
+------------
+
+ El pingüino Wenceslao hizo kilómetros bajo exhaustiva lluvia y
+ frío, añoraba a su querido cachorro.
+ (Contains every letter and every accent, but not every combination
+ of vowel + acute.)
+
+French (fr)
+-----------
+
+ Portez ce vieux whisky au juge blond qui fume sur son île intérieure, à
+ côté de l'alcôve ovoïde, où les bûches se consument dans l'âtre, ce
+ qui lui permet de penser à la cænogenèse de l'être dont il est question
+ dans la cause ambiguë entendue à Moÿ, dans un capharnaüm qui,
+ pense-t-il, diminue çà et là la qualité de son œuvre.
+
+ l'île exiguë
+ Où l'obèse jury mûr
+ Fête l'haï volapük,
+ Âne ex aéquo au whist,
+ Ôtez ce vœu déçu.
+
+ Le cœur déçu mais l'âme plutôt naïve, Louÿs rêva de crapaüter en
+ canoë au delà des îles, près du mälström où brûlent les novæ.
+
+Irish Gaelic (ga)
+-----------------
+
+ D'fhuascail Íosa, Úrmhac na hÓighe Beannaithe, pór Éava agus Ádhaimh
+
+Hungarian (hu)
+--------------
+
+ Árvíztűrő tükörfúrógép
+ (= flood-proof mirror-drilling machine, only all non-ASCII letters)
+
+Icelandic (is)
+--------------
+
+ Kæmi ný öxi hér ykist þjófum nú bæði víl og ádrepa
+
+ Sævör grét áðan því úlpan var ónýt
+ (some ASCII letters missing)
+
+Japanese (jp)
+-------------
+
+ Hiragana: (Iroha)
+
+ いろはにほへとちりぬるを
+ わかよたれそつねならむ
+ うゐのおくやまけふこえて
+ あさきゆめみしゑひもせす
+
+ Katakana:
+
+ イロハニホヘト チリヌルヲ ワカヨタレソ ツネナラム
+ ウヰノオクヤマ ケフコエテ アサキユメミシ ヱヒモセスン
+
+Hebrew (iw)
+-----------
+
+ ? דג סקרן שט בים מאוכזב ולפתע מצא לו חברה איך הקליטה
+
+Polish (pl)
+-----------
+
+ Pchnąć w tę łódź jeża lub ośm skrzyń fig
+ (= To push a hedgehog or eight bins of figs in this boat)
+
+Russian (ru)
+------------
+
+ В чащах юга жил бы цитрус? Да, но фальшивый экземпляр!
+ (= Would a citrus live in the bushes of south? Yes, but only a fake one!)
+
+Thai (th)
+---------
+
+ [--------------------------|------------------------]
+ ๏ เป็นมนุษย์สุดประเสริฐเลิศคุณค่า กว่าบรรดาฝูงสัตว์เดรัจฉาน
+ จงฝ่าฟันพัฒนาวิชาการ อย่าล้างผลาญฤๅเข่นฆ่าบีฑาใคร
+ ไม่ถือโทษโกรธแช่งซัดฮึดฮัดด่า หัดอภัยเหมือนกีฬาอัชฌาสัย
+ ปฏิบัติประพฤติกฎกำหนดใจ พูดจาให้จ๊ะๆ จ๋าๆ น่าฟังเอย ฯ
+
+ [The copyright for the Thai example is owned by The Computer
+ Association of Thailand under the Royal Patronage of His Majesty the
+ King.]
+
+Please let me know if you find others! Special thanks to the people
+from all over the world who contributed these sentences.
+? *Unicode Transcriptions* Notes <#Notes>
+
+Glyphs | Samples
+ | Charts
+ | UTF
+ | Forms
+ |
+Home .
+
+
+Name Text Image
+Arabic (Arabic) يونِكود ?
+Arabic (Persian) یونیکُد / ?/
+Armenian Յունիկօդ
+Bengali য়ূনিকোড
+Bopomofo ㄊㄨㄥ˅ ㄧˋ ㄇㄚ˅
+ㄨㄢˋ ㄍㄨㄛˊ ㄇㄚ˅
+Braille
+Buhid
+Canadian Aboriginal ᔫᗂᑰᑦ
+Cherokee ᏳᏂᎪᏛ
+Cypriot
+Cyrillic (Russian) Юникод ?
+Deseret (English) ???????
+Devanagari (Hindi) यूनिकोड ?
+Ethiopic ዩኒኮድ
+Georgian უნიკოდი ?
+Gothic
+Greek Γιούνικοντ
+Gujarati યૂનિકોડ
+Gurmukhi ਯੂਨਿਕੋਡ
+Han (Chinese) 统一码 ?
+統一碼 ?
+万国码 ?
+萬國碼 ?
+Hangul 유니코드
+Hanunoo
+Hebrew יוניקוד
+Hebrew (pointed) יוּנִיקוׁד
+Hebrew (Yiddish) יוניקאָד ?
+Hiragana (Japanese) ゆにこおど
+Katakana (Japanese) ユニコード ?
+Kannada ಯೂನಿಕೋಡ್
+Khmer យូនីគោដ
+Lao
+Latin Unicode Unicode
+Latin (IPA <#English_Pronunciation>) ˈjunɪˌkoːd ?
+Latin (Am. Dict. <#American_Dictionary>) Ūnĭcōde̽ ?
+Limbu
+Linear B
+Malayalam യൂനികോഡ്
+Mongolian
+Myanmar
+Ogham ᚔᚒᚅᚔᚉᚑᚇ / /
+Old Italic
+Oriya ୟୂନିକୋଡ
+Osmanya
+Runic (Anglo-Saxon) ᛡᚢᚾᛁᚳᚩᛞ
+Shavian
+Sinhala යණනිකෞද්
+Syriac ܝܘܢܝܩܘܕ
+Tagbanwa
+Tagalog
+Tai Le
+Tamil யூனிகோட்
+Telugu యూనికోడ్
+Thaana
+Thai ยูนืโคด
+Tibetan (Dzongkha) ཨུ་ནི་ཀོཌྲ།
+Ugaritic
+Yi
+
+
+ Notes:
+
+There are different ways to transcribe the word “Unicode”, depending on
+the language and script. In some cases there is only one language that
+customarily uses a given script; in others there are many languages. The
+goal here is at a minimum to collect at least one transcription for each
+script in a language customarily written in that script, with more
+languages if possible. If the transcription is the same for multiple
+languages in a script, then a single representative language is used.
+
+Still missing are transcriptions for the items above in RED (in at least
+one language). I would appreciate any other transcriptions, or
+corrections for the ones listed here. Send to mark3@macchiato.com
+, using the directions below:
+
+ * *Supplying Missing Items*
+ o Most Latin-script languages will follow the spelling, and
+ change the pronunciation. For any that would not, it would
+ be good to have the alternate spelling.
+ o For non-Latin scripts the goal is to match the English
+ pronunciation — /*not*/ spelling. Above is the IPA <#IPA>
+ (in phonemic transcription) that should be matched as
+ closely as possible (without sounding affected in the target
+ language)
+ o Text would be best in either the UTF-8 text, or the code
+ points in hex HTML. E.g. either of the following:
+ + "Юникод"
+ + "Юникод"
+ + Note: for / supplementary characters/
+ ,
+ there should be one hex number per code point, not two
+ surrogates
+ :
+ # 𐀀 /*not*/ &xDC00;
+ o If you have a good font, I'd also appreciate a GIF. It
+ should be *96 x 24* bits, with the text centered, in black
+ on white (plus grays if smoothed).
+ * *Other Comments*
+ o Because some browsers won't handle the text, both text and
+ GIF image are supplied. If you can’t read the text columns,
+ see Display Problems
+ .
+ o The Chinese versions (inc. Bopomofo) are translations, not
+ transcriptions, since "transcription in Chinese is pretty
+ lame" [J. Becker].
+ o There are other "translations" of Unicode that may be in
+ use, such as the Vietnamese "Thống Nhất Mã".
+ o For sample pages in different languages on the Unicode site,
+ see What is Unicode?
+
+ o Americans are not generally used to IPA, and find a variety
+ of different systems in their dictionaries. This one leaves
+ the base letters as they are, and uses diacritics for
+ pronunciation.
+ * *Etymology of /Unicode/*
+ o Coined by J. Becker. Not related to previous usages, such as:
+ + A telegraphic code in which one word or set of letters
+ represents a sentence or phrase; a telegram or message
+ in this. (late 19th century, OED)
+ o According to my references, the prefix "uni" is directly
+ from Latin while the word "code" is through French.
+ o The original Indo-European apparently would have been
+ *oino-kau-do ("one strike give"): *kau apparently being
+ related to such English words as: hew, haggle, hoe, hag,
+ hay, hack, caudad, caudal, caudate, caudex, coda, codex,
+ codicil, coward, incus, and Kovač (personal name: "smith").
+ + I will leave the exact derivations to the exegetes,
+ but I like the association with "haggle" myself.
+ * *Contributions*
+ o This draws on contributions or comments from:
+ + Dixon Au
+ + Joe Becker
+ + Maurice Bauhahn
+ + Abel Cheung
+ + Peter Constable
+ + Michael Everson
+ + Christopher John Fynn
+ + Michael Kaplan
+ + George Kiraz
+ + Abdul Malik
+ + Siva Nataraja
+ + Roozbeh Pournader
+ + Jonathan Rosenne
+ + Jungshik Shin
+
+------------------------------------------------------------------------
+
+
+Terms of Use . Last updated:
+MED - 04/20/2003 15:30:33.
+
+
+
+
+
+UTF-8 encoded sample plain-text file
+‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾
+
+Markus Kuhn [ˈmaʳkʊs kuːn] — 2002-07-25
+
+
+The ASCII compatible UTF-8 encoding used in this plain-text file
+is defined in Unicode, ISO 10646-1, and RFC 2279.
+
+
+Using Unicode/UTF-8, you can write in emails and source code things such as
+
+Mathematics and sciences:
+
+ ∮ E⋅da = Q, n → ∞, ∑ f(i) = ∏ g(i), ⎧⎡⎛┌─────┐⎞⎤⎫
+ ⎪⎢⎜│a²+b³ ⎟⎥⎪
+ ∀x∈ℝ: ⌈x⌉ = −⌊−x⌋, α ∧ ¬β = ¬(¬α ∨ β), ⎪⎢⎜│───── ⎟⎥⎪
+ ⎪⎢⎜⎷ c₈ ⎟⎥⎪
+ ℕ ⊆ ℕ₀ ⊂ ℤ ⊂ ℚ ⊂ ℝ ⊂ ℂ, ⎨⎢⎜ ⎟⎥⎬
+ ⎪⎢⎜ ∞ ⎟⎥⎪
+ ⊥ < a ≠ b ≡ c ≤ d ≪ ⊤ ⇒ (⟦A⟧ ⇔ ⟪B⟫), ⎪⎢⎜ ⎲ ⎟⎥⎪
+ ⎪⎢⎜ ⎳aⁱ-bⁱ⎟⎥⎪
+ 2H₂ + O₂ ⇌ 2H₂O, R = 4.7 kΩ, ⌀ 200 mm ⎩⎣⎝i=1 ⎠⎦⎭
+
+Linguistics and dictionaries:
+
+ ði ıntəˈnæʃənəl fəˈnɛtık əsoʊsiˈeıʃn
+ Y [ˈʏpsilɔn], Yen [jɛn], Yoga [ˈjoːgɑ]
+
+APL:
+
+ ((V⍳V)=⍳⍴V)/V←,V ⌷←⍳→⍴∆∇⊃‾⍎⍕⌈
+
+Nicer typography in plain text files:
+
+ ╔══════════════════════════════════════════╗
+ ║ ║
+ ║ • ‘single’ and “double” quotes ║
+ ║ ║
+ ║ • Curly apostrophes: “We’ve been here” ║
+ ║ ║
+ ║ • Latin-1 apostrophe and accents: '´` ║
+ ║ ║
+ ║ • ‚deutsche‘ „Anführungszeichen“ ║
+ ║ ║
+ ║ • †, ‡, ‰, •, 3–4, —, −5/+5, ™, … ║
+ ║ ║
+ ║ • ASCII safety test: 1lI|, 0OD, 8B ║
+ ║ ╭─────────╮ ║
+ ║ • the euro symbol: │ 14.95 € │ ║
+ ║ ╰─────────╯ ║
+ ╚══════════════════════════════════════════╝
+
+Combining characters:
+
+ STARGΛ̊TE SG-1, a = v̇ = r̈, a⃑ ⊥ b⃑
+
+Greek (in Polytonic):
+
+ The Greek anthem:
+
+ Σὲ γνωρίζω ἀπὸ τὴν κόψη
+ τοῦ σπαθιοῦ τὴν τρομερή,
+ σὲ γνωρίζω ἀπὸ τὴν ὄψη
+ ποὺ μὲ βία μετράει τὴ γῆ.
+
+ ᾿Απ᾿ τὰ κόκκαλα βγαλμένη
+ τῶν ῾Ελλήνων τὰ ἱερά
+ καὶ σὰν πρῶτα ἀνδρειωμένη
+ χαῖρε, ὦ χαῖρε, ᾿Ελευθεριά!
+
+ From a speech of Demosthenes in the 4th century BC:
+
+ Οὐχὶ ταὐτὰ παρίσταταί μοι γιγνώσκειν, ὦ ἄνδρες ᾿Αθηναῖοι,
+ ὅταν τ᾿ εἰς τὰ πράγματα ἀποβλέψω καὶ ὅταν πρὸς τοὺς
+ λόγους οὓς ἀκούω· τοὺς μὲν γὰρ λόγους περὶ τοῦ
+ τιμωρήσασθαι Φίλιππον ὁρῶ γιγνομένους, τὰ δὲ πράγματ᾿
+ εἰς τοῦτο προήκοντα, ὥσθ᾿ ὅπως μὴ πεισόμεθ᾿ αὐτοὶ
+ πρότερον κακῶς σκέψασθαι δέον. οὐδέν οὖν ἄλλο μοι δοκοῦσιν
+ οἱ τὰ τοιαῦτα λέγοντες ἢ τὴν ὑπόθεσιν, περὶ ἧς βουλεύεσθαι,
+ οὐχὶ τὴν οὖσαν παριστάντες ὑμῖν ἁμαρτάνειν. ἐγὼ δέ, ὅτι μέν
+ ποτ᾿ ἐξῆν τῇ πόλει καὶ τὰ αὑτῆς ἔχειν ἀσφαλῶς καὶ Φίλιππον
+ τιμωρήσασθαι, καὶ μάλ᾿ ἀκριβῶς οἶδα· ἐπ᾿ ἐμοῦ γάρ, οὐ πάλαι
+ γέγονεν ταῦτ᾿ ἀμφότερα· νῦν μέντοι πέπεισμαι τοῦθ᾿ ἱκανὸν
+ προλαβεῖν ἡμῖν εἶναι τὴν πρώτην, ὅπως τοὺς συμμάχους
+ σώσομεν. ἐὰν γὰρ τοῦτο βεβαίως ὑπάρξῃ, τότε καὶ περὶ τοῦ
+ τίνα τιμωρήσεταί τις καὶ ὃν τρόπον ἐξέσται σκοπεῖν· πρὶν δὲ
+ τὴν ἀρχὴν ὀρθῶς ὑποθέσθαι, μάταιον ἡγοῦμαι περὶ τῆς
+ τελευτῆς ὁντινοῦν ποιεῖσθαι λόγον.
+
+ Δημοσθένους, Γ´ ᾿Ολυνθιακὸς
+
+Georgian:
+
+ From a Unicode conference invitation:
+
+ გთხოვთ ახლავე გაიაროთ რეგისტრაცია Unicode-ის მეათე საერთაშორისო
+ კონფერენციაზე დასასწრებად, რომელიც გაიმართება 10-12 მარტს,
+ ქ. მაინცში, გერმანიაში. კონფერენცია შეჰკრებს ერთად მსოფლიოს
+ ექსპერტებს ისეთ დარგებში როგორიცაა ინტერნეტი და Unicode-ი,
+ ინტერნაციონალიზაცია და ლოკალიზაცია, Unicode-ის გამოყენება
+ ოპერაციულ სისტემებსა, და გამოყენებით პროგრამებში, შრიფტებში,
+ ტექსტების დამუშავებასა და მრავალენოვან კომპიუტერულ სისტემებში.
+
+Russian:
+
+ From a Unicode conference invitation:
+
+ Зарегистрируйтесь сейчас на Десятую Международную Конференцию по
+ Unicode, которая состоится 10-12 марта 1997 года в Майнце в Германии.
+ Конференция соберет широкий круг экспертов по вопросам глобального
+ Интернета и Unicode, локализации и интернационализации, воплощению и
+ применению Unicode в различных операционных системах и программных
+ приложениях, шрифтах, верстке и многоязычных компьютерных системах.
+
+Thai (UCS Level 2):
+
+ Excerpt from a poetry on The Romance of The Three Kingdoms (a Chinese
+ classic 'San Gua'):
+
+ [----------------------------|------------------------]
+ ๏ แผ่นดินฮั่นเสื่อมโทรมแสนสังเวช พระปกเกศกองบู๊กู้ขึ้นใหม่
+ สิบสองกษัตริย์ก่อนหน้าแลถัดไป สององค์ไซร้โง่เขลาเบาปัญญา
+ ทรงนับถือขันทีเป็นที่พึ่ง บ้านเมืองจึงวิปริตเป็นนักหนา
+ โฮจิ๋นเรียกทัพทั่วหัวเมืองมา หมายจะฆ่ามดชั่วตัวสำคัญ
+ เหมือนขับไสไล่เสือจากเคหา รับหมาป่าเข้ามาเลยอาสัญ
+ ฝ่ายอ้องอุ้นยุแยกให้แตกกัน ใช้สาวนั้นเป็นชนวนชื่นชวนใจ
+ พลันลิฉุยกุยกีกลับก่อเหตุ ช่างอาเพศจริงหนาฟ้าร้องไห้
+ ต้องรบราฆ่าฟันจนบรรลัย ฤๅหาใครค้ำชูกู้บรรลังก์ ฯ
+
+ (The above is a two-column text. If combining characters are handled
+ correctly, the lines of the second column should be aligned with the
+ | character above.)
+
+Ethiopian:
+
+ Proverbs in the Amharic language:
+
+ ሰማይ አይታረስ ንጉሥ አይከሰስ።
+ ብላ ካለኝ እንደአባቴ በቆመጠኝ።
+ ጌጥ ያለቤቱ ቁምጥና ነው።
+ ደሀ በሕልሙ ቅቤ ባይጠጣ ንጣት በገደለው።
+ የአፍ ወለምታ በቅቤ አይታሽም።
+ አይጥ በበላ ዳዋ ተመታ።
+ ሲተረጉሙ ይደረግሙ።
+ ቀስ በቀስ፥ ዕንቁላል በእግሩ ይሄዳል።
+ ድር ቢያብር አንበሳ ያስር።
+ ሰው እንደቤቱ እንጅ እንደ ጉረቤቱ አይተዳደርም።
+ እግዜር የከፈተውን ጉሮሮ ሳይዘጋው አይድርም።
+ የጎረቤት ሌባ፥ ቢያዩት ይስቅ ባያዩት ያጠልቅ።
+ ሥራ ከመፍታት ልጄን ላፋታት።
+ ዓባይ ማደሪያ የለው፥ ግንድ ይዞ ይዞራል።
+ የእስላም አገሩ መካ የአሞራ አገሩ ዋርካ።
+ ተንጋሎ ቢተፉ ተመልሶ ባፉ።
+ ወዳጅህ ማር ቢሆን ጨርስህ አትላሰው።
+ እግርህን በፍራሽህ ልክ ዘርጋ።
+
+Runes:
+
+ ᚻᛖ ᚳᚹᚫᚦ ᚦᚫᛏ ᚻᛖ ᛒᚢᛞᛖ ᚩᚾ ᚦᚫᛗ ᛚᚪᚾᛞᛖ ᚾᚩᚱᚦᚹᛖᚪᚱᛞᚢᛗ ᚹᛁᚦ ᚦᚪ ᚹᛖᛥᚫ
+
+ (Old English, which transcribed into Latin reads 'He cwaeth that he
+ bude thaem lande northweardum with tha Westsae.' and means 'He said
+ that he lived in the northern land near the Western Sea.')
+
+Braille:
+
+ ⡌⠁⠧⠑ ⠼⠁⠒ ⡍⠜⠇⠑⠹⠰⠎ ⡣⠕⠌
+
+ ⡍⠜⠇⠑⠹ ⠺⠁⠎ ⠙⠑⠁⠙⠒ ⠞⠕ ⠃⠑⠛⠔ ⠺⠊⠹⠲ ⡹⠻⠑ ⠊⠎ ⠝⠕ ⠙⠳⠃⠞
+ ⠱⠁⠞⠑⠧⠻ ⠁⠃⠳⠞ ⠹⠁⠞⠲ ⡹⠑ ⠗⠑⠛⠊⠌⠻ ⠕⠋ ⠙⠊⠎ ⠃⠥⠗⠊⠁⠇ ⠺⠁⠎
+ ⠎⠊⠛⠝⠫ ⠃⠹ ⠹⠑ ⠊⠇⠻⠛⠹⠍⠁⠝⠂ ⠹⠑ ⠊⠇⠻⠅⠂ ⠹⠑ ⠥⠝⠙⠻⠞⠁⠅⠻⠂
+ ⠁⠝⠙ ⠹⠑ ⠡⠊⠑⠋ ⠍⠳⠗⠝⠻⠲ ⡎⠊⠗⠕⠕⠛⠑ ⠎⠊⠛⠝⠫ ⠊⠞⠲ ⡁⠝⠙
+ ⡎⠊⠗⠕⠕⠛⠑⠰⠎ ⠝⠁⠍⠑ ⠺⠁⠎ ⠛⠕⠕⠙ ⠥⠏⠕⠝ ⠰⡡⠁⠝⠛⠑⠂ ⠋⠕⠗ ⠁⠝⠹⠹⠔⠛ ⠙⠑
+ ⠡⠕⠎⠑ ⠞⠕ ⠏⠥⠞ ⠙⠊⠎ ⠙⠁⠝⠙ ⠞⠕⠲
+
+ ⡕⠇⠙ ⡍⠜⠇⠑⠹ ⠺⠁⠎ ⠁⠎ ⠙⠑⠁⠙ ⠁⠎ ⠁ ⠙⠕⠕⠗⠤⠝⠁⠊⠇⠲
+
+ ⡍⠔⠙⠖ ⡊ ⠙⠕⠝⠰⠞ ⠍⠑⠁⠝ ⠞⠕ ⠎⠁⠹ ⠹⠁⠞ ⡊ ⠅⠝⠪⠂ ⠕⠋ ⠍⠹
+ ⠪⠝ ⠅⠝⠪⠇⠫⠛⠑⠂ ⠱⠁⠞ ⠹⠻⠑ ⠊⠎ ⠏⠜⠞⠊⠊⠥⠇⠜⠇⠹ ⠙⠑⠁⠙ ⠁⠃⠳⠞
+ ⠁ ⠙⠕⠕⠗⠤⠝⠁⠊⠇⠲ ⡊ ⠍⠊⠣⠞ ⠙⠁⠧⠑ ⠃⠑⠲ ⠔⠊⠇⠔⠫⠂ ⠍⠹⠎⠑⠇⠋⠂ ⠞⠕
+ ⠗⠑⠛⠜⠙ ⠁ ⠊⠕⠋⠋⠔⠤⠝⠁⠊⠇ ⠁⠎ ⠹⠑ ⠙⠑⠁⠙⠑⠌ ⠏⠊⠑⠊⠑ ⠕⠋ ⠊⠗⠕⠝⠍⠕⠝⠛⠻⠹
+ ⠔ ⠹⠑ ⠞⠗⠁⠙⠑⠲ ⡃⠥⠞ ⠹⠑ ⠺⠊⠎⠙⠕⠍ ⠕⠋ ⠳⠗ ⠁⠝⠊⠑⠌⠕⠗⠎
+ ⠊⠎ ⠔ ⠹⠑ ⠎⠊⠍⠊⠇⠑⠆ ⠁⠝⠙ ⠍⠹ ⠥⠝⠙⠁⠇⠇⠪⠫ ⠙⠁⠝⠙⠎
+ ⠩⠁⠇⠇ ⠝⠕⠞ ⠙⠊⠌⠥⠗⠃ ⠊⠞⠂ ⠕⠗ ⠹⠑ ⡊⠳⠝⠞⠗⠹⠰⠎ ⠙⠕⠝⠑ ⠋⠕⠗⠲ ⡹⠳
+ ⠺⠊⠇⠇ ⠹⠻⠑⠋⠕⠗⠑ ⠏⠻⠍⠊⠞ ⠍⠑ ⠞⠕ ⠗⠑⠏⠑⠁⠞⠂ ⠑⠍⠏⠙⠁⠞⠊⠊⠁⠇⠇⠹⠂ ⠹⠁⠞
+ ⡍⠜⠇⠑⠹ ⠺⠁⠎ ⠁⠎ ⠙⠑⠁⠙ ⠁⠎ ⠁ ⠙⠕⠕⠗⠤⠝⠁⠊⠇⠲
+
+ (The first couple of paragraphs of "A Christmas Carol" by Dickens)
+
+Compact font selection example text:
+
+ ABCDEFGHIJKLMNOPQRSTUVWXYZ /0123456789
+ abcdefghijklmnopqrstuvwxyz £©µÀÆÖÞßéöÿ
+ –—‘“”„†•…‰™œŠŸž€ ΑΒΓΔΩαβγδω АБВГДабвгд
+ ∀∂∈ℝ∧∪≡∞ ↑↗↨↻⇣ ┐┼╔╘░►☺♀ fi�⑀₂ἠḂӥẄɐː⍎אԱა
+
+Greetings in various languages:
+
+ Hello world, Καλημέρα κόσμε, コンニチハ
+
+Box drawing alignment tests: █
+ ▉
+ ╔══╦══╗ ┌──┬──┐ ╭──┬──╮ ╭──┬──╮ ┏━━┳━━┓ ┎┒┏┑ ╷ ╻ ┏┯┓ ┌┰┐ ▊ ╱╲╱╲╳╳╳
+ ║┌─╨─┐║ │╔═╧═╗│ │╒═╪═╕│ │╓─╁─╖│ ┃┌─╂─┐┃ ┗╃╄┙ ╶┼╴╺╋╸┠┼┨ ┝╋┥ ▋ ╲╱╲╱╳╳╳
+ ║│╲ ╱│║ │║ ║│ ││ │ ││ │║ ┃ ║│ ┃│ ╿ │┃ ┍╅╆┓ ╵ ╹ ┗┷┛ └┸┘ ▌ ╱╲╱╲╳╳╳
+ ╠╡ ╳ ╞╣ ├╢ ╟┤ ├┼─┼─┼┤ ├╫─╂─╫┤ ┣┿╾┼╼┿┫ ┕┛┖┚ ┌┄┄┐ ╎ ┏┅┅┓ ┋ ▍ ╲╱╲╱╳╳╳
+ ║│╱ ╲│║ │║ ║│ ││ │ ││ │║ ┃ ║│ ┃│ ╽ │┃ ░░▒▒▓▓██ ┊ ┆ ╎ ╏ ┇ ┋ ▎
+ ║└─╥─┘║ │╚═╤═╝│ │╘═╪═╛│ │╙─╀─╜│ ┃└─╂─┘┃ ░░▒▒▓▓██ ┊ ┆ ╎ ╏ ┇ ┋ ▏
+ ╚══╩══╝ └──┴──┘ ╰──┴──╯ ╰──┴──╯ ┗━━┻━━┛ ▗▄▖▛▀▜ └╌╌┘ ╎ ┗╍╍┛ ┋ ▁▂▃▄▅▆▇█
+ ▝▀▘▙▄▟
+Sentences that contain all letters commonly used in a language
+--------------------------------------------------------------
+
+Markus Kuhn -- 2001-09-02
+
+This file is UTF-8 encoded.
+
+
+Danish (da)
+---------
+
+ Quizdeltagerne spiste jordbær med fløde, mens cirkusklovnen
+ Wolther spillede på xylofon.
+ (= Quiz contestants were eating strawbery with cream while Wolther
+ the circus clown played on xylophone.)
+
+German (de)
+-----------
+
+ Falsches Üben von Xylophonmusik quält jeden größeren Zwerg
+ (= Wrongful practicing of xylophone music tortures every larger dwarf)
+
+ Zwölf Boxkämpfer jagten Eva quer über den Sylter Deich
+ (= Twelve boxing fighters hunted Eva across the dike of Sylt)
+
+ Heizölrückstoßabdämpfung
+ (= fuel oil recoil absorber)
+ (jqvwxy missing, but all non-ASCII letters in one word)
+
+English (en)
+------------
+
+ The quick brown fox jumps over the lazy dog
+
+Spanish (es)
+------------
+
+ El pingüino Wenceslao hizo kilómetros bajo exhaustiva lluvia y
+ frío, añoraba a su querido cachorro.
+ (Contains every letter and every accent, but not every combination
+ of vowel + acute.)
+
+French (fr)
+-----------
+
+ Portez ce vieux whisky au juge blond qui fume sur son île intérieure, à
+ côté de l'alcôve ovoïde, où les bûches se consument dans l'âtre, ce
+ qui lui permet de penser à la cænogenèse de l'être dont il est question
+ dans la cause ambiguë entendue à Moÿ, dans un capharnaüm qui,
+ pense-t-il, diminue çà et là la qualité de son œuvre.
+
+ l'île exiguë
+ Où l'obèse jury mûr
+ Fête l'haï volapük,
+ Âne ex aéquo au whist,
+ Ôtez ce vœu déçu.
+
+ Le cœur déçu mais l'âme plutôt naïve, Louÿs rêva de crapaüter en
+ canoë au delà des îles, près du mälström où brûlent les novæ.
+
+Irish Gaelic (ga)
+-----------------
+
+ D'fhuascail Íosa, Úrmhac na hÓighe Beannaithe, pór Éava agus Ádhaimh
+
+Hungarian (hu)
+--------------
+
+ Árvíztűrő tükörfúrógép
+ (= flood-proof mirror-drilling machine, only all non-ASCII letters)
+
+Icelandic (is)
+--------------
+
+ Kæmi ný öxi hér ykist þjófum nú bæði víl og ádrepa
+
+ Sævör grét áðan því úlpan var ónýt
+ (some ASCII letters missing)
+
+Japanese (jp)
+-------------
+
+ Hiragana: (Iroha)
+
+ いろはにほへとちりぬるを
+ わかよたれそつねならむ
+ うゐのおくやまけふこえて
+ あさきゆめみしゑひもせす
+
+ Katakana:
+
+ イロハニホヘト チリヌルヲ ワカヨタレソ ツネナラム
+ ウヰノオクヤマ ケフコエテ アサキユメミシ ヱヒモセスン
+
+Hebrew (iw)
+-----------
+
+ ? דג סקרן שט בים מאוכזב ולפתע מצא לו חברה איך הקליטה
+
+Polish (pl)
+-----------
+
+ Pchnąć w tę łódź jeża lub ośm skrzyń fig
+ (= To push a hedgehog or eight bins of figs in this boat)
+
+Russian (ru)
+------------
+
+ В чащах юга жил бы цитрус? Да, но фальшивый экземпляр!
+ (= Would a citrus live in the bushes of south? Yes, but only a fake one!)
+
+Thai (th)
+---------
+
+ [--------------------------|------------------------]
+ ๏ เป็นมนุษย์สุดประเสริฐเลิศคุณค่า กว่าบรรดาฝูงสัตว์เดรัจฉาน
+ จงฝ่าฟันพัฒนาวิชาการ อย่าล้างผลาญฤๅเข่นฆ่าบีฑาใคร
+ ไม่ถือโทษโกรธแช่งซัดฮึดฮัดด่า หัดอภัยเหมือนกีฬาอัชฌาสัย
+ ปฏิบัติประพฤติกฎกำหนดใจ พูดจาให้จ๊ะๆ จ๋าๆ น่าฟังเอย ฯ
+
+ [The copyright for the Thai example is owned by The Computer
+ Association of Thailand under the Royal Patronage of His Majesty the
+ King.]
+
+Please let me know if you find others! Special thanks to the people
+from all over the world who contributed these sentences.
+? *Unicode Transcriptions* Notes <#Notes>
+
+Glyphs | Samples
+ | Charts
+ | UTF
+ | Forms
+ |
+Home .
+
+
+Name Text Image
+Arabic (Arabic) يونِكود ?
+Arabic (Persian) یونیکُد / ?/
+Armenian Յունիկօդ
+Bengali য়ূনিকোড
+Bopomofo ㄊㄨㄥ˅ ㄧˋ ㄇㄚ˅
+ㄨㄢˋ ㄍㄨㄛˊ ㄇㄚ˅
+Braille
+Buhid
+Canadian Aboriginal ᔫᗂᑰᑦ
+Cherokee ᏳᏂᎪᏛ
+Cypriot
+Cyrillic (Russian) Юникод ?
+Deseret (English) ???????
+Devanagari (Hindi) यूनिकोड ?
+Ethiopic ዩኒኮድ
+Georgian უნიკოდი ?
+Gothic
+Greek Γιούνικοντ
+Gujarati યૂનિકોડ
+Gurmukhi ਯੂਨਿਕੋਡ
+Han (Chinese) 统一码 ?
+統一碼 ?
+万国码 ?
+萬國碼 ?
+Hangul 유니코드
+Hanunoo
+Hebrew יוניקוד
+Hebrew (pointed) יוּנִיקוׁד
+Hebrew (Yiddish) יוניקאָד ?
+Hiragana (Japanese) ゆにこおど
+Katakana (Japanese) ユニコード ?
+Kannada ಯೂನಿಕೋಡ್
+Khmer យូនីគោដ
+Lao
+Latin Unicode Unicode
+Latin (IPA <#English_Pronunciation>) ˈjunɪˌkoːd ?
+Latin (Am. Dict. <#American_Dictionary>) Ūnĭcōde̽ ?
+Limbu
+Linear B
+Malayalam യൂനികോഡ്
+Mongolian
+Myanmar
+Ogham ᚔᚒᚅᚔᚉᚑᚇ / /
+Old Italic
+Oriya ୟୂନିକୋଡ
+Osmanya
+Runic (Anglo-Saxon) ᛡᚢᚾᛁᚳᚩᛞ
+Shavian
+Sinhala යණනිකෞද්
+Syriac ܝܘܢܝܩܘܕ
+Tagbanwa
+Tagalog
+Tai Le
+Tamil யூனிகோட்
+Telugu యూనికోడ్
+Thaana
+Thai ยูนืโคด
+Tibetan (Dzongkha) ཨུ་ནི་ཀོཌྲ།
+Ugaritic
+Yi
+
+
+ Notes:
+
+There are different ways to transcribe the word “Unicode”, depending on
+the language and script. In some cases there is only one language that
+customarily uses a given script; in others there are many languages. The
+goal here is at a minimum to collect at least one transcription for each
+script in a language customarily written in that script, with more
+languages if possible. If the transcription is the same for multiple
+languages in a script, then a single representative language is used.
+
+Still missing are transcriptions for the items above in RED (in at least
+one language). I would appreciate any other transcriptions, or
+corrections for the ones listed here. Send to mark3@macchiato.com
+, using the directions below:
+
+ * *Supplying Missing Items*
+ o Most Latin-script languages will follow the spelling, and
+ change the pronunciation. For any that would not, it would
+ be good to have the alternate spelling.
+ o For non-Latin scripts the goal is to match the English
+ pronunciation — /*not*/ spelling. Above is the IPA <#IPA>
+ (in phonemic transcription) that should be matched as
+ closely as possible (without sounding affected in the target
+ language)
+ o Text would be best in either the UTF-8 text, or the code
+ points in hex HTML. E.g. either of the following:
+ + "Юникод"
+ + "Юникод"
+ + Note: for / supplementary characters/
+ ,
+ there should be one hex number per code point, not two
+ surrogates
+ :
+ # 𐀀 /*not*/ &xDC00;
+ o If you have a good font, I'd also appreciate a GIF. It
+ should be *96 x 24* bits, with the text centered, in black
+ on white (plus grays if smoothed).
+ * *Other Comments*
+ o Because some browsers won't handle the text, both text and
+ GIF image are supplied. If you can’t read the text columns,
+ see Display Problems
+ .
+ o The Chinese versions (inc. Bopomofo) are translations, not
+ transcriptions, since "transcription in Chinese is pretty
+ lame" [J. Becker].
+ o There are other "translations" of Unicode that may be in
+ use, such as the Vietnamese "Thống Nhất Mã".
+ o For sample pages in different languages on the Unicode site,
+ see What is Unicode?
+
+ o Americans are not generally used to IPA, and find a variety
+ of different systems in their dictionaries. This one leaves
+ the base letters as they are, and uses diacritics for
+ pronunciation.
+ * *Etymology of /Unicode/*
+ o Coined by J. Becker. Not related to previous usages, such as:
+ + A telegraphic code in which one word or set of letters
+ represents a sentence or phrase; a telegram or message
+ in this. (late 19th century, OED)
+ o According to my references, the prefix "uni" is directly
+ from Latin while the word "code" is through French.
+ o The original Indo-European apparently would have been
+ *oino-kau-do ("one strike give"): *kau apparently being
+ related to such English words as: hew, haggle, hoe, hag,
+ hay, hack, caudad, caudal, caudate, caudex, coda, codex,
+ codicil, coward, incus, and Kovač (personal name: "smith").
+ + I will leave the exact derivations to the exegetes,
+ but I like the association with "haggle" myself.
+ * *Contributions*
+ o This draws on contributions or comments from:
+ + Dixon Au
+ + Joe Becker
+ + Maurice Bauhahn
+ + Abel Cheung
+ + Peter Constable
+ + Michael Everson
+ + Christopher John Fynn
+ + Michael Kaplan
+ + George Kiraz
+ + Abdul Malik
+ + Siva Nataraja
+ + Roozbeh Pournader
+ + Jonathan Rosenne
+ + Jungshik Shin
+
+------------------------------------------------------------------------
+
+
+Terms of Use . Last updated:
+MED - 04/20/2003 15:30:33.
+
+
+
+
+
+UTF-8 encoded sample plain-text file
+‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾
+
+Markus Kuhn [ˈmaʳkʊs kuːn] — 2002-07-25
+
+
+The ASCII compatible UTF-8 encoding used in this plain-text file
+is defined in Unicode, ISO 10646-1, and RFC 2279.
+
+
+Using Unicode/UTF-8, you can write in emails and source code things such as
+
+Mathematics and sciences:
+
+ ∮ E⋅da = Q, n → ∞, ∑ f(i) = ∏ g(i), ⎧⎡⎛┌─────┐⎞⎤⎫
+ ⎪⎢⎜│a²+b³ ⎟⎥⎪
+ ∀x∈ℝ: ⌈x⌉ = −⌊−x⌋, α ∧ ¬β = ¬(¬α ∨ β), ⎪⎢⎜│───── ⎟⎥⎪
+ ⎪⎢⎜⎷ c₈ ⎟⎥⎪
+ ℕ ⊆ ℕ₀ ⊂ ℤ ⊂ ℚ ⊂ ℝ ⊂ ℂ, ⎨⎢⎜ ⎟⎥⎬
+ ⎪⎢⎜ ∞ ⎟⎥⎪
+ ⊥ < a ≠ b ≡ c ≤ d ≪ ⊤ ⇒ (⟦A⟧ ⇔ ⟪B⟫), ⎪⎢⎜ ⎲ ⎟⎥⎪
+ ⎪⎢⎜ ⎳aⁱ-bⁱ⎟⎥⎪
+ 2H₂ + O₂ ⇌ 2H₂O, R = 4.7 kΩ, ⌀ 200 mm ⎩⎣⎝i=1 ⎠⎦⎭
+
+Linguistics and dictionaries:
+
+ ði ıntəˈnæʃənəl fəˈnɛtık əsoʊsiˈeıʃn
+ Y [ˈʏpsilɔn], Yen [jɛn], Yoga [ˈjoːgɑ]
+
+APL:
+
+ ((V⍳V)=⍳⍴V)/V←,V ⌷←⍳→⍴∆∇⊃‾⍎⍕⌈
+
+Nicer typography in plain text files:
+
+ ╔══════════════════════════════════════════╗
+ ║ ║
+ ║ • ‘single’ and “double” quotes ║
+ ║ ║
+ ║ • Curly apostrophes: “We’ve been here” ║
+ ║ ║
+ ║ • Latin-1 apostrophe and accents: '´` ║
+ ║ ║
+ ║ • ‚deutsche‘ „Anführungszeichen“ ║
+ ║ ║
+ ║ • †, ‡, ‰, •, 3–4, —, −5/+5, ™, … ║
+ ║ ║
+ ║ • ASCII safety test: 1lI|, 0OD, 8B ║
+ ║ ╭─────────╮ ║
+ ║ • the euro symbol: │ 14.95 € │ ║
+ ║ ╰─────────╯ ║
+ ╚══════════════════════════════════════════╝
+
+Combining characters:
+
+ STARGΛ̊TE SG-1, a = v̇ = r̈, a⃑ ⊥ b⃑
+
+Greek (in Polytonic):
+
+ The Greek anthem:
+
+ Σὲ γνωρίζω ἀπὸ τὴν κόψη
+ τοῦ σπαθιοῦ τὴν τρομερή,
+ σὲ γνωρίζω ἀπὸ τὴν ὄψη
+ ποὺ μὲ βία μετράει τὴ γῆ.
+
+ ᾿Απ᾿ τὰ κόκκαλα βγαλμένη
+ τῶν ῾Ελλήνων τὰ ἱερά
+ καὶ σὰν πρῶτα ἀνδρειωμένη
+ χαῖρε, ὦ χαῖρε, ᾿Ελευθεριά!
+
+ From a speech of Demosthenes in the 4th century BC:
+
+ Οὐχὶ ταὐτὰ παρίσταταί μοι γιγνώσκειν, ὦ ἄνδρες ᾿Αθηναῖοι,
+ ὅταν τ᾿ εἰς τὰ πράγματα ἀποβλέψω καὶ ὅταν πρὸς τοὺς
+ λόγους οὓς ἀκούω· τοὺς μὲν γὰρ λόγους περὶ τοῦ
+ τιμωρήσασθαι Φίλιππον ὁρῶ γιγνομένους, τὰ δὲ πράγματ᾿
+ εἰς τοῦτο προήκοντα, ὥσθ᾿ ὅπως μὴ πεισόμεθ᾿ αὐτοὶ
+ πρότερον κακῶς σκέψασθαι δέον. οὐδέν οὖν ἄλλο μοι δοκοῦσιν
+ οἱ τὰ τοιαῦτα λέγοντες ἢ τὴν ὑπόθεσιν, περὶ ἧς βουλεύεσθαι,
+ οὐχὶ τὴν οὖσαν παριστάντες ὑμῖν ἁμαρτάνειν. ἐγὼ δέ, ὅτι μέν
+ ποτ᾿ ἐξῆν τῇ πόλει καὶ τὰ αὑτῆς ἔχειν ἀσφαλῶς καὶ Φίλιππον
+ τιμωρήσασθαι, καὶ μάλ᾿ ἀκριβῶς οἶδα· ἐπ᾿ ἐμοῦ γάρ, οὐ πάλαι
+ γέγονεν ταῦτ᾿ ἀμφότερα· νῦν μέντοι πέπεισμαι τοῦθ᾿ ἱκανὸν
+ προλαβεῖν ἡμῖν εἶναι τὴν πρώτην, ὅπως τοὺς συμμάχους
+ σώσομεν. ἐὰν γὰρ τοῦτο βεβαίως ὑπάρξῃ, τότε καὶ περὶ τοῦ
+ τίνα τιμωρήσεταί τις καὶ ὃν τρόπον ἐξέσται σκοπεῖν· πρὶν δὲ
+ τὴν ἀρχὴν ὀρθῶς ὑποθέσθαι, μάταιον ἡγοῦμαι περὶ τῆς
+ τελευτῆς ὁντινοῦν ποιεῖσθαι λόγον.
+
+ Δημοσθένους, Γ´ ᾿Ολυνθιακὸς
+
+Georgian:
+
+ From a Unicode conference invitation:
+
+ გთხოვთ ახლავე გაიაროთ რეგისტრაცია Unicode-ის მეათე საერთაშორისო
+ კონფერენციაზე დასასწრებად, რომელიც გაიმართება 10-12 მარტს,
+ ქ. მაინცში, გერმანიაში. კონფერენცია შეჰკრებს ერთად მსოფლიოს
+ ექსპერტებს ისეთ დარგებში როგორიცაა ინტერნეტი და Unicode-ი,
+ ინტერნაციონალიზაცია და ლოკალიზაცია, Unicode-ის გამოყენება
+ ოპერაციულ სისტემებსა, და გამოყენებით პროგრამებში, შრიფტებში,
+ ტექსტების დამუშავებასა და მრავალენოვან კომპიუტერულ სისტემებში.
+
+Russian:
+
+ From a Unicode conference invitation:
+
+ Зарегистрируйтесь сейчас на Десятую Международную Конференцию по
+ Unicode, которая состоится 10-12 марта 1997 года в Майнце в Германии.
+ Конференция соберет широкий круг экспертов по вопросам глобального
+ Интернета и Unicode, локализации и интернационализации, воплощению и
+ применению Unicode в различных операционных системах и программных
+ приложениях, шрифтах, верстке и многоязычных компьютерных системах.
+
+Thai (UCS Level 2):
+
+ Excerpt from a poetry on The Romance of The Three Kingdoms (a Chinese
+ classic 'San Gua'):
+
+ [----------------------------|------------------------]
+ ๏ แผ่นดินฮั่นเสื่อมโทรมแสนสังเวช พระปกเกศกองบู๊กู้ขึ้นใหม่
+ สิบสองกษัตริย์ก่อนหน้าแลถัดไป สององค์ไซร้โง่เขลาเบาปัญญา
+ ทรงนับถือขันทีเป็นที่พึ่ง บ้านเมืองจึงวิปริตเป็นนักหนา
+ โฮจิ๋นเรียกทัพทั่วหัวเมืองมา หมายจะฆ่ามดชั่วตัวสำคัญ
+ เหมือนขับไสไล่เสือจากเคหา รับหมาป่าเข้ามาเลยอาสัญ
+ ฝ่ายอ้องอุ้นยุแยกให้แตกกัน ใช้สาวนั้นเป็นชนวนชื่นชวนใจ
+ พลันลิฉุยกุยกีกลับก่อเหตุ ช่างอาเพศจริงหนาฟ้าร้องไห้
+ ต้องรบราฆ่าฟันจนบรรลัย ฤๅหาใครค้ำชูกู้บรรลังก์ ฯ
+
+ (The above is a two-column text. If combining characters are handled
+ correctly, the lines of the second column should be aligned with the
+ | character above.)
+
+Ethiopian:
+
+ Proverbs in the Amharic language:
+
+ ሰማይ አይታረስ ንጉሥ አይከሰስ።
+ ብላ ካለኝ እንደአባቴ በቆመጠኝ።
+ ጌጥ ያለቤቱ ቁምጥና ነው።
+ ደሀ በሕልሙ ቅቤ ባይጠጣ ንጣት በገደለው።
+ የአፍ ወለምታ በቅቤ አይታሽም።
+ አይጥ በበላ ዳዋ ተመታ።
+ ሲተረጉሙ ይደረግሙ።
+ ቀስ በቀስ፥ ዕንቁላል በእግሩ ይሄዳል።
+ ድር ቢያብር አንበሳ ያስር።
+ ሰው እንደቤቱ እንጅ እንደ ጉረቤቱ አይተዳደርም።
+ እግዜር የከፈተውን ጉሮሮ ሳይዘጋው አይድርም።
+ የጎረቤት ሌባ፥ ቢያዩት ይስቅ ባያዩት ያጠልቅ።
+ ሥራ ከመፍታት ልጄን ላፋታት።
+ ዓባይ ማደሪያ የለው፥ ግንድ ይዞ ይዞራል።
+ የእስላም አገሩ መካ የአሞራ አገሩ ዋርካ።
+ ተንጋሎ ቢተፉ ተመልሶ ባፉ።
+ ወዳጅህ ማር ቢሆን ጨርስህ አትላሰው።
+ እግርህን በፍራሽህ ልክ ዘርጋ።
+
+Runes:
+
+ ᚻᛖ ᚳᚹᚫᚦ ᚦᚫᛏ ᚻᛖ ᛒᚢᛞᛖ ᚩᚾ ᚦᚫᛗ ᛚᚪᚾᛞᛖ ᚾᚩᚱᚦᚹᛖᚪᚱᛞᚢᛗ ᚹᛁᚦ ᚦᚪ ᚹᛖᛥᚫ
+
+ (Old English, which transcribed into Latin reads 'He cwaeth that he
+ bude thaem lande northweardum with tha Westsae.' and means 'He said
+ that he lived in the northern land near the Western Sea.')
+
+Braille:
+
+ ⡌⠁⠧⠑ ⠼⠁⠒ ⡍⠜⠇⠑⠹⠰⠎ ⡣⠕⠌
+
+ ⡍⠜⠇⠑⠹ ⠺⠁⠎ ⠙⠑⠁⠙⠒ ⠞⠕ ⠃⠑⠛⠔ ⠺⠊⠹⠲ ⡹⠻⠑ ⠊⠎ ⠝⠕ ⠙⠳⠃⠞
+ ⠱⠁⠞⠑⠧⠻ ⠁⠃⠳⠞ ⠹⠁⠞⠲ ⡹⠑ ⠗⠑⠛⠊⠌⠻ ⠕⠋ ⠙⠊⠎ ⠃⠥⠗⠊⠁⠇ ⠺⠁⠎
+ ⠎⠊⠛⠝⠫ ⠃⠹ ⠹⠑ ⠊⠇⠻⠛⠹⠍⠁⠝⠂ ⠹⠑ ⠊⠇⠻⠅⠂ ⠹⠑ ⠥⠝⠙⠻⠞⠁⠅⠻⠂
+ ⠁⠝⠙ ⠹⠑ ⠡⠊⠑⠋ ⠍⠳⠗⠝⠻⠲ ⡎⠊⠗⠕⠕⠛⠑ ⠎⠊⠛⠝⠫ ⠊⠞⠲ ⡁⠝⠙
+ ⡎⠊⠗⠕⠕⠛⠑⠰⠎ ⠝⠁⠍⠑ ⠺⠁⠎ ⠛⠕⠕⠙ ⠥⠏⠕⠝ ⠰⡡⠁⠝⠛⠑⠂ ⠋⠕⠗ ⠁⠝⠹⠹⠔⠛ ⠙⠑
+ ⠡⠕⠎⠑ ⠞⠕ ⠏⠥⠞ ⠙⠊⠎ ⠙⠁⠝⠙ ⠞⠕⠲
+
+ ⡕⠇⠙ ⡍⠜⠇⠑⠹ ⠺⠁⠎ ⠁⠎ ⠙⠑⠁⠙ ⠁⠎ ⠁ ⠙⠕⠕⠗⠤⠝⠁⠊⠇⠲
+
+ ⡍⠔⠙⠖ ⡊ ⠙⠕⠝⠰⠞ ⠍⠑⠁⠝ ⠞⠕ ⠎⠁⠹ ⠹⠁⠞ ⡊ ⠅⠝⠪⠂ ⠕⠋ ⠍⠹
+ ⠪⠝ ⠅⠝⠪⠇⠫⠛⠑⠂ ⠱⠁⠞ ⠹⠻⠑ ⠊⠎ ⠏⠜⠞⠊⠊⠥⠇⠜⠇⠹ ⠙⠑⠁⠙ ⠁⠃⠳⠞
+ ⠁ ⠙⠕⠕⠗⠤⠝⠁⠊⠇⠲ ⡊ ⠍⠊⠣⠞ ⠙⠁⠧⠑ ⠃⠑⠲ ⠔⠊⠇⠔⠫⠂ ⠍⠹⠎⠑⠇⠋⠂ ⠞⠕
+ ⠗⠑⠛⠜⠙ ⠁ ⠊⠕⠋⠋⠔⠤⠝⠁⠊⠇ ⠁⠎ ⠹⠑ ⠙⠑⠁⠙⠑⠌ ⠏⠊⠑⠊⠑ ⠕⠋ ⠊⠗⠕⠝⠍⠕⠝⠛⠻⠹
+ ⠔ ⠹⠑ ⠞⠗⠁⠙⠑⠲ ⡃⠥⠞ ⠹⠑ ⠺⠊⠎⠙⠕⠍ ⠕⠋ ⠳⠗ ⠁⠝⠊⠑⠌⠕⠗⠎
+ ⠊⠎ ⠔ ⠹⠑ ⠎⠊⠍⠊⠇⠑⠆ ⠁⠝⠙ ⠍⠹ ⠥⠝⠙⠁⠇⠇⠪⠫ ⠙⠁⠝⠙⠎
+ ⠩⠁⠇⠇ ⠝⠕⠞ ⠙⠊⠌⠥⠗⠃ ⠊⠞⠂ ⠕⠗ ⠹⠑ ⡊⠳⠝⠞⠗⠹⠰⠎ ⠙⠕⠝⠑ ⠋⠕⠗⠲ ⡹⠳
+ ⠺⠊⠇⠇ ⠹⠻⠑⠋⠕⠗⠑ ⠏⠻⠍⠊⠞ ⠍⠑ ⠞⠕ ⠗⠑⠏⠑⠁⠞⠂ ⠑⠍⠏⠙⠁⠞⠊⠊⠁⠇⠇⠹⠂ ⠹⠁⠞
+ ⡍⠜⠇⠑⠹ ⠺⠁⠎ ⠁⠎ ⠙⠑⠁⠙ ⠁⠎ ⠁ ⠙⠕⠕⠗⠤⠝⠁⠊⠇⠲
+
+ (The first couple of paragraphs of "A Christmas Carol" by Dickens)
+
+Compact font selection example text:
+
+ ABCDEFGHIJKLMNOPQRSTUVWXYZ /0123456789
+ abcdefghijklmnopqrstuvwxyz £©µÀÆÖÞßéöÿ
+ –—‘“”„†•…‰™œŠŸž€ ΑΒΓΔΩαβγδω АБВГДабвгд
+ ∀∂∈ℝ∧∪≡∞ ↑↗↨↻⇣ ┐┼╔╘░►☺♀ fi�⑀₂ἠḂӥẄɐː⍎אԱა
+
+Greetings in various languages:
+
+ Hello world, Καλημέρα κόσμε, コンニチハ
+
+Box drawing alignment tests: █
+ ▉
+ ╔══╦══╗ ┌──┬──┐ ╭──┬──╮ ╭──┬──╮ ┏━━┳━━┓ ┎┒┏┑ ╷ ╻ ┏┯┓ ┌┰┐ ▊ ╱╲╱╲╳╳╳
+ ║┌─╨─┐║ │╔═╧═╗│ │╒═╪═╕│ │╓─╁─╖│ ┃┌─╂─┐┃ ┗╃╄┙ ╶┼╴╺╋╸┠┼┨ ┝╋┥ ▋ ╲╱╲╱╳╳╳
+ ║│╲ ╱│║ │║ ║│ ││ │ ││ │║ ┃ ║│ ┃│ ╿ │┃ ┍╅╆┓ ╵ ╹ ┗┷┛ └┸┘ ▌ ╱╲╱╲╳╳╳
+ ╠╡ ╳ ╞╣ ├╢ ╟┤ ├┼─┼─┼┤ ├╫─╂─╫┤ ┣┿╾┼╼┿┫ ┕┛┖┚ ┌┄┄┐ ╎ ┏┅┅┓ ┋ ▍ ╲╱╲╱╳╳╳
+ ║│╱ ╲│║ │║ ║│ ││ │ ││ │║ ┃ ║│ ┃│ ╽ │┃ ░░▒▒▓▓██ ┊ ┆ ╎ ╏ ┇ ┋ ▎
+ ║└─╥─┘║ │╚═╤═╝│ │╘═╪═╛│ │╙─╀─╜│ ┃└─╂─┘┃ ░░▒▒▓▓██ ┊ ┆ ╎ ╏ ┇ ┋ ▏
+ ╚══╩══╝ └──┴──┘ ╰──┴──╯ ╰──┴──╯ ┗━━┻━━┛ ▗▄▖▛▀▜ └╌╌┘ ╎ ┗╍╍┛ ┋ ▁▂▃▄▅▆▇█
+ ▝▀▘▙▄▟
+Sentences that contain all letters commonly used in a language
+--------------------------------------------------------------
+
+Markus Kuhn -- 2001-09-02
+
+This file is UTF-8 encoded.
+
+
+Danish (da)
+---------
+
+ Quizdeltagerne spiste jordbær med fløde, mens cirkusklovnen
+ Wolther spillede på xylofon.
+ (= Quiz contestants were eating strawbery with cream while Wolther
+ the circus clown played on xylophone.)
+
+German (de)
+-----------
+
+ Falsches Üben von Xylophonmusik quält jeden größeren Zwerg
+ (= Wrongful practicing of xylophone music tortures every larger dwarf)
+
+ Zwölf Boxkämpfer jagten Eva quer über den Sylter Deich
+ (= Twelve boxing fighters hunted Eva across the dike of Sylt)
+
+ Heizölrückstoßabdämpfung
+ (= fuel oil recoil absorber)
+ (jqvwxy missing, but all non-ASCII letters in one word)
+
+English (en)
+------------
+
+ The quick brown fox jumps over the lazy dog
+
+Spanish (es)
+------------
+
+ El pingüino Wenceslao hizo kilómetros bajo exhaustiva lluvia y
+ frío, añoraba a su querido cachorro.
+ (Contains every letter and every accent, but not every combination
+ of vowel + acute.)
+
+French (fr)
+-----------
+
+ Portez ce vieux whisky au juge blond qui fume sur son île intérieure, à
+ côté de l'alcôve ovoïde, où les bûches se consument dans l'âtre, ce
+ qui lui permet de penser à la cænogenèse de l'être dont il est question
+ dans la cause ambiguë entendue à Moÿ, dans un capharnaüm qui,
+ pense-t-il, diminue çà et là la qualité de son œuvre.
+
+ l'île exiguë
+ Où l'obèse jury mûr
+ Fête l'haï volapük,
+ Âne ex aéquo au whist,
+ Ôtez ce vœu déçu.
+
+ Le cœur déçu mais l'âme plutôt naïve, Louÿs rêva de crapaüter en
+ canoë au delà des îles, près du mälström où brûlent les novæ.
+
+Irish Gaelic (ga)
+-----------------
+
+ D'fhuascail Íosa, Úrmhac na hÓighe Beannaithe, pór Éava agus Ádhaimh
+
+Hungarian (hu)
+--------------
+
+ Árvíztűrő tükörfúrógép
+ (= flood-proof mirror-drilling machine, only all non-ASCII letters)
+
+Icelandic (is)
+--------------
+
+ Kæmi ný öxi hér ykist þjófum nú bæði víl og ádrepa
+
+ Sævör grét áðan því úlpan var ónýt
+ (some ASCII letters missing)
+
+Japanese (jp)
+-------------
+
+ Hiragana: (Iroha)
+
+ いろはにほへとちりぬるを
+ わかよたれそつねならむ
+ うゐのおくやまけふこえて
+ あさきゆめみしゑひもせす
+
+ Katakana:
+
+ イロハニホヘト チリヌルヲ ワカヨタレソ ツネナラム
+ ウヰノオクヤマ ケフコエテ アサキユメミシ ヱヒモセスン
+
+Hebrew (iw)
+-----------
+
+ ? דג סקרן שט בים מאוכזב ולפתע מצא לו חברה איך הקליטה
+
+Polish (pl)
+-----------
+
+ Pchnąć w tę łódź jeża lub ośm skrzyń fig
+ (= To push a hedgehog or eight bins of figs in this boat)
+
+Russian (ru)
+------------
+
+ В чащах юга жил бы цитрус? Да, но фальшивый экземпляр!
+ (= Would a citrus live in the bushes of south? Yes, but only a fake one!)
+
+Thai (th)
+---------
+
+ [--------------------------|------------------------]
+ ๏ เป็นมนุษย์สุดประเสริฐเลิศคุณค่า กว่าบรรดาฝูงสัตว์เดรัจฉาน
+ จงฝ่าฟันพัฒนาวิชาการ อย่าล้างผลาญฤๅเข่นฆ่าบีฑาใคร
+ ไม่ถือโทษโกรธแช่งซัดฮึดฮัดด่า หัดอภัยเหมือนกีฬาอัชฌาสัย
+ ปฏิบัติประพฤติกฎกำหนดใจ พูดจาให้จ๊ะๆ จ๋าๆ น่าฟังเอย ฯ
+
+ [The copyright for the Thai example is owned by The Computer
+ Association of Thailand under the Royal Patronage of His Majesty the
+ King.]
+
+Please let me know if you find others! Special thanks to the people
+from all over the world who contributed these sentences.
+? *Unicode Transcriptions* Notes <#Notes>
+
+Glyphs | Samples
+ | Charts
+ | UTF
+ | Forms
+ |
+Home .
+
+
+Name Text Image
+Arabic (Arabic) يونِكود ?
+Arabic (Persian) یونیکُد / ?/
+Armenian Յունիկօդ
+Bengali য়ূনিকোড
+Bopomofo ㄊㄨㄥ˅ ㄧˋ ㄇㄚ˅
+ㄨㄢˋ ㄍㄨㄛˊ ㄇㄚ˅
+Braille
+Buhid
+Canadian Aboriginal ᔫᗂᑰᑦ
+Cherokee ᏳᏂᎪᏛ
+Cypriot
+Cyrillic (Russian) Юникод ?
+Deseret (English) ???????
+Devanagari (Hindi) यूनिकोड ?
+Ethiopic ዩኒኮድ
+Georgian უნიკოდი ?
+Gothic
+Greek Γιούνικοντ
+Gujarati યૂનિકોડ
+Gurmukhi ਯੂਨਿਕੋਡ
+Han (Chinese) 统一码 ?
+統一碼 ?
+万国码 ?
+萬國碼 ?
+Hangul 유니코드
+Hanunoo
+Hebrew יוניקוד
+Hebrew (pointed) יוּנִיקוׁד
+Hebrew (Yiddish) יוניקאָד ?
+Hiragana (Japanese) ゆにこおど
+Katakana (Japanese) ユニコード ?
+Kannada ಯೂನಿಕೋಡ್
+Khmer យូនីគោដ
+Lao
+Latin Unicode Unicode
+Latin (IPA <#English_Pronunciation>) ˈjunɪˌkoːd ?
+Latin (Am. Dict. <#American_Dictionary>) Ūnĭcōde̽ ?
+Limbu
+Linear B
+Malayalam യൂനികോഡ്
+Mongolian
+Myanmar
+Ogham ᚔᚒᚅᚔᚉᚑᚇ / /
+Old Italic
+Oriya ୟୂନିକୋଡ
+Osmanya
+Runic (Anglo-Saxon) ᛡᚢᚾᛁᚳᚩᛞ
+Shavian
+Sinhala යණනිකෞද්
+Syriac ܝܘܢܝܩܘܕ
+Tagbanwa
+Tagalog
+Tai Le
+Tamil யூனிகோட்
+Telugu యూనికోడ్
+Thaana
+Thai ยูนืโคด
+Tibetan (Dzongkha) ཨུ་ནི་ཀོཌྲ།
+Ugaritic
+Yi
+
+
+ Notes:
+
+There are different ways to transcribe the word “Unicode”, depending on
+the language and script. In some cases there is only one language that
+customarily uses a given script; in others there are many languages. The
+goal here is at a minimum to collect at least one transcription for each
+script in a language customarily written in that script, with more
+languages if possible. If the transcription is the same for multiple
+languages in a script, then a single representative language is used.
+
+Still missing are transcriptions for the items above in RED (in at least
+one language). I would appreciate any other transcriptions, or
+corrections for the ones listed here. Send to mark3@macchiato.com
+, using the directions below:
+
+ * *Supplying Missing Items*
+ o Most Latin-script languages will follow the spelling, and
+ change the pronunciation. For any that would not, it would
+ be good to have the alternate spelling.
+ o For non-Latin scripts the goal is to match the English
+ pronunciation — /*not*/ spelling. Above is the IPA <#IPA>
+ (in phonemic transcription) that should be matched as
+ closely as possible (without sounding affected in the target
+ language)
+ o Text would be best in either the UTF-8 text, or the code
+ points in hex HTML. E.g. either of the following:
+ + "Юникод"
+ + "Юникод"
+ + Note: for / supplementary characters/
+ ,
+ there should be one hex number per code point, not two
+ surrogates
+ :
+ # 𐀀 /*not*/ &xDC00;
+ o If you have a good font, I'd also appreciate a GIF. It
+ should be *96 x 24* bits, with the text centered, in black
+ on white (plus grays if smoothed).
+ * *Other Comments*
+ o Because some browsers won't handle the text, both text and
+ GIF image are supplied. If you can’t read the text columns,
+ see Display Problems
+ .
+ o The Chinese versions (inc. Bopomofo) are translations, not
+ transcriptions, since "transcription in Chinese is pretty
+ lame" [J. Becker].
+ o There are other "translations" of Unicode that may be in
+ use, such as the Vietnamese "Thống Nhất Mã".
+ o For sample pages in different languages on the Unicode site,
+ see What is Unicode?
+
+ o Americans are not generally used to IPA, and find a variety
+ of different systems in their dictionaries. This one leaves
+ the base letters as they are, and uses diacritics for
+ pronunciation.
+ * *Etymology of /Unicode/*
+ o Coined by J. Becker. Not related to previous usages, such as:
+ + A telegraphic code in which one word or set of letters
+ represents a sentence or phrase; a telegram or message
+ in this. (late 19th century, OED)
+ o According to my references, the prefix "uni" is directly
+ from Latin while the word "code" is through French.
+ o The original Indo-European apparently would have been
+ *oino-kau-do ("one strike give"): *kau apparently being
+ related to such English words as: hew, haggle, hoe, hag,
+ hay, hack, caudad, caudal, caudate, caudex, coda, codex,
+ codicil, coward, incus, and Kovač (personal name: "smith").
+ + I will leave the exact derivations to the exegetes,
+ but I like the association with "haggle" myself.
+ * *Contributions*
+ o This draws on contributions or comments from:
+ + Dixon Au
+ + Joe Becker
+ + Maurice Bauhahn
+ + Abel Cheung
+ + Peter Constable
+ + Michael Everson
+ + Christopher John Fynn
+ + Michael Kaplan
+ + George Kiraz
+ + Abdul Malik
+ + Siva Nataraja
+ + Roozbeh Pournader
+ + Jonathan Rosenne
+ + Jungshik Shin
+
+------------------------------------------------------------------------
+
+
+Terms of Use . Last updated:
+MED - 04/20/2003 15:30:33.
+
+
+
+
+
+UTF-8 encoded sample plain-text file
+‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾
+
+Markus Kuhn [ˈmaʳkʊs kuːn] — 2002-07-25
+
+
+The ASCII compatible UTF-8 encoding used in this plain-text file
+is defined in Unicode, ISO 10646-1, and RFC 2279.
+
+
+Using Unicode/UTF-8, you can write in emails and source code things such as
+
+Mathematics and sciences:
+
+ ∮ E⋅da = Q, n → ∞, ∑ f(i) = ∏ g(i), ⎧⎡⎛┌─────┐⎞⎤⎫
+ ⎪⎢⎜│a²+b³ ⎟⎥⎪
+ ∀x∈ℝ: ⌈x⌉ = −⌊−x⌋, α ∧ ¬β = ¬(¬α ∨ β), ⎪⎢⎜│───── ⎟⎥⎪
+ ⎪⎢⎜⎷ c₈ ⎟⎥⎪
+ ℕ ⊆ ℕ₀ ⊂ ℤ ⊂ ℚ ⊂ ℝ ⊂ ℂ, ⎨⎢⎜ ⎟⎥⎬
+ ⎪⎢⎜ ∞ ⎟⎥⎪
+ ⊥ < a ≠ b ≡ c ≤ d ≪ ⊤ ⇒ (⟦A⟧ ⇔ ⟪B⟫), ⎪⎢⎜ ⎲ ⎟⎥⎪
+ ⎪⎢⎜ ⎳aⁱ-bⁱ⎟⎥⎪
+ 2H₂ + O₂ ⇌ 2H₂O, R = 4.7 kΩ, ⌀ 200 mm ⎩⎣⎝i=1 ⎠⎦⎭
+
+Linguistics and dictionaries:
+
+ ði ıntəˈnæʃənəl fəˈnɛtık əsoʊsiˈeıʃn
+ Y [ˈʏpsilɔn], Yen [jɛn], Yoga [ˈjoːgɑ]
+
+APL:
+
+ ((V⍳V)=⍳⍴V)/V←,V ⌷←⍳→⍴∆∇⊃‾⍎⍕⌈
+
+Nicer typography in plain text files:
+
+ ╔══════════════════════════════════════════╗
+ ║ ║
+ ║ • ‘single’ and “double” quotes ║
+ ║ ║
+ ║ • Curly apostrophes: “We’ve been here” ║
+ ║ ║
+ ║ • Latin-1 apostrophe and accents: '´` ║
+ ║ ║
+ ║ • ‚deutsche‘ „Anführungszeichen“ ║
+ ║ ║
+ ║ • †, ‡, ‰, •, 3–4, —, −5/+5, ™, … ║
+ ║ ║
+ ║ • ASCII safety test: 1lI|, 0OD, 8B ║
+ ║ ╭─────────╮ ║
+ ║ • the euro symbol: │ 14.95 € │ ║
+ ║ ╰─────────╯ ║
+ ╚══════════════════════════════════════════╝
+
+Combining characters:
+
+ STARGΛ̊TE SG-1, a = v̇ = r̈, a⃑ ⊥ b⃑
+
+Greek (in Polytonic):
+
+ The Greek anthem:
+
+ Σὲ γνωρίζω ἀπὸ τὴν κόψη
+ τοῦ σπαθιοῦ τὴν τρομερή,
+ σὲ γνωρίζω ἀπὸ τὴν ὄψη
+ ποὺ μὲ βία μετράει τὴ γῆ.
+
+ ᾿Απ᾿ τὰ κόκκαλα βγαλμένη
+ τῶν ῾Ελλήνων τὰ ἱερά
+ καὶ σὰν πρῶτα ἀνδρειωμένη
+ χαῖρε, ὦ χαῖρε, ᾿Ελευθεριά!
+
+ From a speech of Demosthenes in the 4th century BC:
+
+ Οὐχὶ ταὐτὰ παρίσταταί μοι γιγνώσκειν, ὦ ἄνδρες ᾿Αθηναῖοι,
+ ὅταν τ᾿ εἰς τὰ πράγματα ἀποβλέψω καὶ ὅταν πρὸς τοὺς
+ λόγους οὓς ἀκούω· τοὺς μὲν γὰρ λόγους περὶ τοῦ
+ τιμωρήσασθαι Φίλιππον ὁρῶ γιγνομένους, τὰ δὲ πράγματ᾿
+ εἰς τοῦτο προήκοντα, ὥσθ᾿ ὅπως μὴ πεισόμεθ᾿ αὐτοὶ
+ πρότερον κακῶς σκέψασθαι δέον. οὐδέν οὖν ἄλλο μοι δοκοῦσιν
+ οἱ τὰ τοιαῦτα λέγοντες ἢ τὴν ὑπόθεσιν, περὶ ἧς βουλεύεσθαι,
+ οὐχὶ τὴν οὖσαν παριστάντες ὑμῖν ἁμαρτάνειν. ἐγὼ δέ, ὅτι μέν
+ ποτ᾿ ἐξῆν τῇ πόλει καὶ τὰ αὑτῆς ἔχειν ἀσφαλῶς καὶ Φίλιππον
+ τιμωρήσασθαι, καὶ μάλ᾿ ἀκριβῶς οἶδα· ἐπ᾿ ἐμοῦ γάρ, οὐ πάλαι
+ γέγονεν ταῦτ᾿ ἀμφότερα· νῦν μέντοι πέπεισμαι τοῦθ᾿ ἱκανὸν
+ προλαβεῖν ἡμῖν εἶναι τὴν πρώτην, ὅπως τοὺς συμμάχους
+ σώσομεν. ἐὰν γὰρ τοῦτο βεβαίως ὑπάρξῃ, τότε καὶ περὶ τοῦ
+ τίνα τιμωρήσεταί τις καὶ ὃν τρόπον ἐξέσται σκοπεῖν· πρὶν δὲ
+ τὴν ἀρχὴν ὀρθῶς ὑποθέσθαι, μάταιον ἡγοῦμαι περὶ τῆς
+ τελευτῆς ὁντινοῦν ποιεῖσθαι λόγον.
+
+ Δημοσθένους, Γ´ ᾿Ολυνθιακὸς
+
+Georgian:
+
+ From a Unicode conference invitation:
+
+ გთხოვთ ახლავე გაიაროთ რეგისტრაცია Unicode-ის მეათე საერთაშორისო
+ კონფერენციაზე დასასწრებად, რომელიც გაიმართება 10-12 მარტს,
+ ქ. მაინცში, გერმანიაში. კონფერენცია შეჰკრებს ერთად მსოფლიოს
+ ექსპერტებს ისეთ დარგებში როგორიცაა ინტერნეტი და Unicode-ი,
+ ინტერნაციონალიზაცია და ლოკალიზაცია, Unicode-ის გამოყენება
+ ოპერაციულ სისტემებსა, და გამოყენებით პროგრამებში, შრიფტებში,
+ ტექსტების დამუშავებასა და მრავალენოვან კომპიუტერულ სისტემებში.
+
+Russian:
+
+ From a Unicode conference invitation:
+
+ Зарегистрируйтесь сейчас на Десятую Международную Конференцию по
+ Unicode, которая состоится 10-12 марта 1997 года в Майнце в Германии.
+ Конференция соберет широкий круг экспертов по вопросам глобального
+ Интернета и Unicode, локализации и интернационализации, воплощению и
+ применению Unicode в различных операционных системах и программных
+ приложениях, шрифтах, верстке и многоязычных компьютерных системах.
+
+Thai (UCS Level 2):
+
+ Excerpt from a poetry on The Romance of The Three Kingdoms (a Chinese
+ classic 'San Gua'):
+
+ [----------------------------|------------------------]
+ ๏ แผ่นดินฮั่นเสื่อมโทรมแสนสังเวช พระปกเกศกองบู๊กู้ขึ้นใหม่
+ สิบสองกษัตริย์ก่อนหน้าแลถัดไป สององค์ไซร้โง่เขลาเบาปัญญา
+ ทรงนับถือขันทีเป็นที่พึ่ง บ้านเมืองจึงวิปริตเป็นนักหนา
+ โฮจิ๋นเรียกทัพทั่วหัวเมืองมา หมายจะฆ่ามดชั่วตัวสำคัญ
+ เหมือนขับไสไล่เสือจากเคหา รับหมาป่าเข้ามาเลยอาสัญ
+ ฝ่ายอ้องอุ้นยุแยกให้แตกกัน ใช้สาวนั้นเป็นชนวนชื่นชวนใจ
+ พลันลิฉุยกุยกีกลับก่อเหตุ ช่างอาเพศจริงหนาฟ้าร้องไห้
+ ต้องรบราฆ่าฟันจนบรรลัย ฤๅหาใครค้ำชูกู้บรรลังก์ ฯ
+
+ (The above is a two-column text. If combining characters are handled
+ correctly, the lines of the second column should be aligned with the
+ | character above.)
+
+Ethiopian:
+
+ Proverbs in the Amharic language:
+
+ ሰማይ አይታረስ ንጉሥ አይከሰስ።
+ ብላ ካለኝ እንደአባቴ በቆመጠኝ።
+ ጌጥ ያለቤቱ ቁምጥና ነው።
+ ደሀ በሕልሙ ቅቤ ባይጠጣ ንጣት በገደለው።
+ የአፍ ወለምታ በቅቤ አይታሽም።
+ አይጥ በበላ ዳዋ ተመታ።
+ ሲተረጉሙ ይደረግሙ።
+ ቀስ በቀስ፥ ዕንቁላል በእግሩ ይሄዳል።
+ ድር ቢያብር አንበሳ ያስር።
+ ሰው እንደቤቱ እንጅ እንደ ጉረቤቱ አይተዳደርም።
+ እግዜር የከፈተውን ጉሮሮ ሳይዘጋው አይድርም።
+ የጎረቤት ሌባ፥ ቢያዩት ይስቅ ባያዩት ያጠልቅ።
+ ሥራ ከመፍታት ልጄን ላፋታት።
+ ዓባይ ማደሪያ የለው፥ ግንድ ይዞ ይዞራል።
+ የእስላም አገሩ መካ የአሞራ አገሩ ዋርካ።
+ ተንጋሎ ቢተፉ ተመልሶ ባፉ።
+ ወዳጅህ ማር ቢሆን ጨርስህ አትላሰው።
+ እግርህን በፍራሽህ ልክ ዘርጋ።
+
+Runes:
+
+ ᚻᛖ ᚳᚹᚫᚦ ᚦᚫᛏ ᚻᛖ ᛒᚢᛞᛖ ᚩᚾ ᚦᚫᛗ ᛚᚪᚾᛞᛖ ᚾᚩᚱᚦᚹᛖᚪᚱᛞᚢᛗ ᚹᛁᚦ ᚦᚪ ᚹᛖᛥᚫ
+
+ (Old English, which transcribed into Latin reads 'He cwaeth that he
+ bude thaem lande northweardum with tha Westsae.' and means 'He said
+ that he lived in the northern land near the Western Sea.')
+
+Braille:
+
+ ⡌⠁⠧⠑ ⠼⠁⠒ ⡍⠜⠇⠑⠹⠰⠎ ⡣⠕⠌
+
+ ⡍⠜⠇⠑⠹ ⠺⠁⠎ ⠙⠑⠁⠙⠒ ⠞⠕ ⠃⠑⠛⠔ ⠺⠊⠹⠲ ⡹⠻⠑ ⠊⠎ ⠝⠕ ⠙⠳⠃⠞
+ ⠱⠁⠞⠑⠧⠻ ⠁⠃⠳⠞ ⠹⠁⠞⠲ ⡹⠑ ⠗⠑⠛⠊⠌⠻ ⠕⠋ ⠙⠊⠎ ⠃⠥⠗⠊⠁⠇ ⠺⠁⠎
+ ⠎⠊⠛⠝⠫ ⠃⠹ ⠹⠑ ⠊⠇⠻⠛⠹⠍⠁⠝⠂ ⠹⠑ ⠊⠇⠻⠅⠂ ⠹⠑ ⠥⠝⠙⠻⠞⠁⠅⠻⠂
+ ⠁⠝⠙ ⠹⠑ ⠡⠊⠑⠋ ⠍⠳⠗⠝⠻⠲ ⡎⠊⠗⠕⠕⠛⠑ ⠎⠊⠛⠝⠫ ⠊⠞⠲ ⡁⠝⠙
+ ⡎⠊⠗⠕⠕⠛⠑⠰⠎ ⠝⠁⠍⠑ ⠺⠁⠎ ⠛⠕⠕⠙ ⠥⠏⠕⠝ ⠰⡡⠁⠝⠛⠑⠂ ⠋⠕⠗ ⠁⠝⠹⠹⠔⠛ ⠙⠑
+ ⠡⠕⠎⠑ ⠞⠕ ⠏⠥⠞ ⠙⠊⠎ ⠙⠁⠝⠙ ⠞⠕⠲
+
+ ⡕⠇⠙ ⡍⠜⠇⠑⠹ ⠺⠁⠎ ⠁⠎ ⠙⠑⠁⠙ ⠁⠎ ⠁ ⠙⠕⠕⠗⠤⠝⠁⠊⠇⠲
+
+ ⡍⠔⠙⠖ ⡊ ⠙⠕⠝⠰⠞ ⠍⠑⠁⠝ ⠞⠕ ⠎⠁⠹ ⠹⠁⠞ ⡊ ⠅⠝⠪⠂ ⠕⠋ ⠍⠹
+ ⠪⠝ ⠅⠝⠪⠇⠫⠛⠑⠂ ⠱⠁⠞ ⠹⠻⠑ ⠊⠎ ⠏⠜⠞⠊⠊⠥⠇⠜⠇⠹ ⠙⠑⠁⠙ ⠁⠃⠳⠞
+ ⠁ ⠙⠕⠕⠗⠤⠝⠁⠊⠇⠲ ⡊ ⠍⠊⠣⠞ ⠙⠁⠧⠑ ⠃⠑⠲ ⠔⠊⠇⠔⠫⠂ ⠍⠹⠎⠑⠇⠋⠂ ⠞⠕
+ ⠗⠑⠛⠜⠙ ⠁ ⠊⠕⠋⠋⠔⠤⠝⠁⠊⠇ ⠁⠎ ⠹⠑ ⠙⠑⠁⠙⠑⠌ ⠏⠊⠑⠊⠑ ⠕⠋ ⠊⠗⠕⠝⠍⠕⠝⠛⠻⠹
+ ⠔ ⠹⠑ ⠞⠗⠁⠙⠑⠲ ⡃⠥⠞ ⠹⠑ ⠺⠊⠎⠙⠕⠍ ⠕⠋ ⠳⠗ ⠁⠝⠊⠑⠌⠕⠗⠎
+ ⠊⠎ ⠔ ⠹⠑ ⠎⠊⠍⠊⠇⠑⠆ ⠁⠝⠙ ⠍⠹ ⠥⠝⠙⠁⠇⠇⠪⠫ ⠙⠁⠝⠙⠎
+ ⠩⠁⠇⠇ ⠝⠕⠞ ⠙⠊⠌⠥⠗⠃ ⠊⠞⠂ ⠕⠗ ⠹⠑ ⡊⠳⠝⠞⠗⠹⠰⠎ ⠙⠕⠝⠑ ⠋⠕⠗⠲ ⡹⠳
+ ⠺⠊⠇⠇ ⠹⠻⠑⠋⠕⠗⠑ ⠏⠻⠍⠊⠞ ⠍⠑ ⠞⠕ ⠗⠑⠏⠑⠁⠞⠂ ⠑⠍⠏⠙⠁⠞⠊⠊⠁⠇⠇⠹⠂ ⠹⠁⠞
+ ⡍⠜⠇⠑⠹ ⠺⠁⠎ ⠁⠎ ⠙⠑⠁⠙ ⠁⠎ ⠁ ⠙⠕⠕⠗⠤⠝⠁⠊⠇⠲
+
+ (The first couple of paragraphs of "A Christmas Carol" by Dickens)
+
+Compact font selection example text:
+
+ ABCDEFGHIJKLMNOPQRSTUVWXYZ /0123456789
+ abcdefghijklmnopqrstuvwxyz £©µÀÆÖÞßéöÿ
+ –—‘“”„†•…‰™œŠŸž€ ΑΒΓΔΩαβγδω АБВГДабвгд
+ ∀∂∈ℝ∧∪≡∞ ↑↗↨↻⇣ ┐┼╔╘░►☺♀ fi�⑀₂ἠḂӥẄɐː⍎אԱა
+
+Greetings in various languages:
+
+ Hello world, Καλημέρα κόσμε, コンニチハ
+
+Box drawing alignment tests: █
+ ▉
+ ╔══╦══╗ ┌──┬──┐ ╭──┬──╮ ╭──┬──╮ ┏━━┳━━┓ ┎┒┏┑ ╷ ╻ ┏┯┓ ┌┰┐ ▊ ╱╲╱╲╳╳╳
+ ║┌─╨─┐║ │╔═╧═╗│ │╒═╪═╕│ │╓─╁─╖│ ┃┌─╂─┐┃ ┗╃╄┙ ╶┼╴╺╋╸┠┼┨ ┝╋┥ ▋ ╲╱╲╱╳╳╳
+ ║│╲ ╱│║ │║ ║│ ││ │ ││ │║ ┃ ║│ ┃│ ╿ │┃ ┍╅╆┓ ╵ ╹ ┗┷┛ └┸┘ ▌ ╱╲╱╲╳╳╳
+ ╠╡ ╳ ╞╣ ├╢ ╟┤ ├┼─┼─┼┤ ├╫─╂─╫┤ ┣┿╾┼╼┿┫ ┕┛┖┚ ┌┄┄┐ ╎ ┏┅┅┓ ┋ ▍ ╲╱╲╱╳╳╳
+ ║│╱ ╲│║ │║ ║│ ││ │ ││ │║ ┃ ║│ ┃│ ╽ │┃ ░░▒▒▓▓██ ┊ ┆ ╎ ╏ ┇ ┋ ▎
+ ║└─╥─┘║ │╚═╤═╝│ │╘═╪═╛│ │╙─╀─╜│ ┃└─╂─┘┃ ░░▒▒▓▓██ ┊ ┆ ╎ ╏ ┇ ┋ ▏
+ ╚══╩══╝ └──┴──┘ ╰──┴──╯ ╰──┴──╯ ┗━━┻━━┛ ▗▄▖▛▀▜ └╌╌┘ ╎ ┗╍╍┛ ┋ ▁▂▃▄▅▆▇█
+ ▝▀▘▙▄▟
+Sentences that contain all letters commonly used in a language
+--------------------------------------------------------------
+
+Markus Kuhn -- 2001-09-02
+
+This file is UTF-8 encoded.
+
+
+Danish (da)
+---------
+
+ Quizdeltagerne spiste jordbær med fløde, mens cirkusklovnen
+ Wolther spillede på xylofon.
+ (= Quiz contestants were eating strawbery with cream while Wolther
+ the circus clown played on xylophone.)
+
+German (de)
+-----------
+
+ Falsches Üben von Xylophonmusik quält jeden größeren Zwerg
+ (= Wrongful practicing of xylophone music tortures every larger dwarf)
+
+ Zwölf Boxkämpfer jagten Eva quer über den Sylter Deich
+ (= Twelve boxing fighters hunted Eva across the dike of Sylt)
+
+ Heizölrückstoßabdämpfung
+ (= fuel oil recoil absorber)
+ (jqvwxy missing, but all non-ASCII letters in one word)
+
+English (en)
+------------
+
+ The quick brown fox jumps over the lazy dog
+
+Spanish (es)
+------------
+
+ El pingüino Wenceslao hizo kilómetros bajo exhaustiva lluvia y
+ frío, añoraba a su querido cachorro.
+ (Contains every letter and every accent, but not every combination
+ of vowel + acute.)
+
+French (fr)
+-----------
+
+ Portez ce vieux whisky au juge blond qui fume sur son île intérieure, à
+ côté de l'alcôve ovoïde, où les bûches se consument dans l'âtre, ce
+ qui lui permet de penser à la cænogenèse de l'être dont il est question
+ dans la cause ambiguë entendue à Moÿ, dans un capharnaüm qui,
+ pense-t-il, diminue çà et là la qualité de son œuvre.
+
+ l'île exiguë
+ Où l'obèse jury mûr
+ Fête l'haï volapük,
+ Âne ex aéquo au whist,
+ Ôtez ce vœu déçu.
+
+ Le cœur déçu mais l'âme plutôt naïve, Louÿs rêva de crapaüter en
+ canoë au delà des îles, près du mälström où brûlent les novæ.
+
+Irish Gaelic (ga)
+-----------------
+
+ D'fhuascail Íosa, Úrmhac na hÓighe Beannaithe, pór Éava agus Ádhaimh
+
+Hungarian (hu)
+--------------
+
+ Árvíztűrő tükörfúrógép
+ (= flood-proof mirror-drilling machine, only all non-ASCII letters)
+
+Icelandic (is)
+--------------
+
+ Kæmi ný öxi hér ykist þjófum nú bæði víl og ádrepa
+
+ Sævör grét áðan því úlpan var ónýt
+ (some ASCII letters missing)
+
+Japanese (jp)
+-------------
+
+ Hiragana: (Iroha)
+
+ いろはにほへとちりぬるを
+ わかよたれそつねならむ
+ うゐのおくやまけふこえて
+ あさきゆめみしゑひもせす
+
+ Katakana:
+
+ イロハニホヘト チリヌルヲ ワカヨタレソ ツネナラム
+ ウヰノオクヤマ ケフコエテ アサキユメミシ ヱヒモセスン
+
+Hebrew (iw)
+-----------
+
+ ? דג סקרן שט בים מאוכזב ולפתע מצא לו חברה איך הקליטה
+
+Polish (pl)
+-----------
+
+ Pchnąć w tę łódź jeża lub ośm skrzyń fig
+ (= To push a hedgehog or eight bins of figs in this boat)
+
+Russian (ru)
+------------
+
+ В чащах юга жил бы цитрус? Да, но фальшивый экземпляр!
+ (= Would a citrus live in the bushes of south? Yes, but only a fake one!)
+
+Thai (th)
+---------
+
+ [--------------------------|------------------------]
+ ๏ เป็นมนุษย์สุดประเสริฐเลิศคุณค่า กว่าบรรดาฝูงสัตว์เดรัจฉาน
+ จงฝ่าฟันพัฒนาวิชาการ อย่าล้างผลาญฤๅเข่นฆ่าบีฑาใคร
+ ไม่ถือโทษโกรธแช่งซัดฮึดฮัดด่า หัดอภัยเหมือนกีฬาอัชฌาสัย
+ ปฏิบัติประพฤติกฎกำหนดใจ พูดจาให้จ๊ะๆ จ๋าๆ น่าฟังเอย ฯ
+
+ [The copyright for the Thai example is owned by The Computer
+ Association of Thailand under the Royal Patronage of His Majesty the
+ King.]
+
+Please let me know if you find others! Special thanks to the people
+from all over the world who contributed these sentences.
+? *Unicode Transcriptions* Notes <#Notes>
+
+Glyphs | Samples
+ | Charts
+ | UTF
+ | Forms
+ |
+Home .
+
+
+Name Text Image
+Arabic (Arabic) يونِكود ?
+Arabic (Persian) یونیکُد / ?/
+Armenian Յունիկօդ
+Bengali য়ূনিকোড
+Bopomofo ㄊㄨㄥ˅ ㄧˋ ㄇㄚ˅
+ㄨㄢˋ ㄍㄨㄛˊ ㄇㄚ˅
+Braille
+Buhid
+Canadian Aboriginal ᔫᗂᑰᑦ
+Cherokee ᏳᏂᎪᏛ
+Cypriot
+Cyrillic (Russian) Юникод ?
+Deseret (English) ???????
+Devanagari (Hindi) यूनिकोड ?
+Ethiopic ዩኒኮድ
+Georgian უნიკოდი ?
+Gothic
+Greek Γιούνικοντ
+Gujarati યૂનિકોડ
+Gurmukhi ਯੂਨਿਕੋਡ
+Han (Chinese) 统一码 ?
+統一碼 ?
+万国码 ?
+萬國碼 ?
+Hangul 유니코드
+Hanunoo
+Hebrew יוניקוד
+Hebrew (pointed) יוּנִיקוׁד
+Hebrew (Yiddish) יוניקאָד ?
+Hiragana (Japanese) ゆにこおど
+Katakana (Japanese) ユニコード ?
+Kannada ಯೂನಿಕೋಡ್
+Khmer យូនីគោដ
+Lao
+Latin Unicode Unicode
+Latin (IPA <#English_Pronunciation>) ˈjunɪˌkoːd ?
+Latin (Am. Dict. <#American_Dictionary>) Ūnĭcōde̽ ?
+Limbu
+Linear B
+Malayalam യൂനികോഡ്
+Mongolian
+Myanmar
+Ogham ᚔᚒᚅᚔᚉᚑᚇ / /
+Old Italic
+Oriya ୟୂନିକୋଡ
+Osmanya
+Runic (Anglo-Saxon) ᛡᚢᚾᛁᚳᚩᛞ
+Shavian
+Sinhala යණනිකෞද්
+Syriac ܝܘܢܝܩܘܕ
+Tagbanwa
+Tagalog
+Tai Le
+Tamil யூனிகோட்
+Telugu యూనికోడ్
+Thaana
+Thai ยูนืโคด
+Tibetan (Dzongkha) ཨུ་ནི་ཀོཌྲ།
+Ugaritic
+Yi
+
+
+ Notes:
+
+There are different ways to transcribe the word “Unicode”, depending on
+the language and script. In some cases there is only one language that
+customarily uses a given script; in others there are many languages. The
+goal here is at a minimum to collect at least one transcription for each
+script in a language customarily written in that script, with more
+languages if possible. If the transcription is the same for multiple
+languages in a script, then a single representative language is used.
+
+Still missing are transcriptions for the items above in RED (in at least
+one language). I would appreciate any other transcriptions, or
+corrections for the ones listed here. Send to mark3@macchiato.com
+, using the directions below:
+
+ * *Supplying Missing Items*
+ o Most Latin-script languages will follow the spelling, and
+ change the pronunciation. For any that would not, it would
+ be good to have the alternate spelling.
+ o For non-Latin scripts the goal is to match the English
+ pronunciation — /*not*/ spelling. Above is the IPA <#IPA>
+ (in phonemic transcription) that should be matched as
+ closely as possible (without sounding affected in the target
+ language)
+ o Text would be best in either the UTF-8 text, or the code
+ points in hex HTML. E.g. either of the following:
+ + "Юникод"
+ + "Юникод"
+ + Note: for / supplementary characters/
+ ,
+ there should be one hex number per code point, not two
+ surrogates
+ :
+ # 𐀀 /*not*/ &xDC00;
+ o If you have a good font, I'd also appreciate a GIF. It
+ should be *96 x 24* bits, with the text centered, in black
+ on white (plus grays if smoothed).
+ * *Other Comments*
+ o Because some browsers won't handle the text, both text and
+ GIF image are supplied. If you can’t read the text columns,
+ see Display Problems
+ .
+ o The Chinese versions (inc. Bopomofo) are translations, not
+ transcriptions, since "transcription in Chinese is pretty
+ lame" [J. Becker].
+ o There are other "translations" of Unicode that may be in
+ use, such as the Vietnamese "Thống Nhất Mã".
+ o For sample pages in different languages on the Unicode site,
+ see What is Unicode?
+
+ o Americans are not generally used to IPA, and find a variety
+ of different systems in their dictionaries. This one leaves
+ the base letters as they are, and uses diacritics for
+ pronunciation.
+ * *Etymology of /Unicode/*
+ o Coined by J. Becker. Not related to previous usages, such as:
+ + A telegraphic code in which one word or set of letters
+ represents a sentence or phrase; a telegram or message
+ in this. (late 19th century, OED)
+ o According to my references, the prefix "uni" is directly
+ from Latin while the word "code" is through French.
+ o The original Indo-European apparently would have been
+ *oino-kau-do ("one strike give"): *kau apparently being
+ related to such English words as: hew, haggle, hoe, hag,
+ hay, hack, caudad, caudal, caudate, caudex, coda, codex,
+ codicil, coward, incus, and Kovač (personal name: "smith").
+ + I will leave the exact derivations to the exegetes,
+ but I like the association with "haggle" myself.
+ * *Contributions*
+ o This draws on contributions or comments from:
+ + Dixon Au
+ + Joe Becker
+ + Maurice Bauhahn
+ + Abel Cheung
+ + Peter Constable
+ + Michael Everson
+ + Christopher John Fynn
+ + Michael Kaplan
+ + George Kiraz
+ + Abdul Malik
+ + Siva Nataraja
+ + Roozbeh Pournader
+ + Jonathan Rosenne
+ + Jungshik Shin
+
+------------------------------------------------------------------------
+
+
+Terms of Use . Last updated:
+MED - 04/20/2003 15:30:33.
+
+
+
+
+
+UTF-8 encoded sample plain-text file
+‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾
+
+Markus Kuhn [ˈmaʳkʊs kuːn] — 2002-07-25
+
+
+The ASCII compatible UTF-8 encoding used in this plain-text file
+is defined in Unicode, ISO 10646-1, and RFC 2279.
+
+
+Using Unicode/UTF-8, you can write in emails and source code things such as
+
+Mathematics and sciences:
+
+ ∮ E⋅da = Q, n → ∞, ∑ f(i) = ∏ g(i), ⎧⎡⎛┌─────┐⎞⎤⎫
+ ⎪⎢⎜│a²+b³ ⎟⎥⎪
+ ∀x∈ℝ: ⌈x⌉ = −⌊−x⌋, α ∧ ¬β = ¬(¬α ∨ β), ⎪⎢⎜│───── ⎟⎥⎪
+ ⎪⎢⎜⎷ c₈ ⎟⎥⎪
+ ℕ ⊆ ℕ₀ ⊂ ℤ ⊂ ℚ ⊂ ℝ ⊂ ℂ, ⎨⎢⎜ ⎟⎥⎬
+ ⎪⎢⎜ ∞ ⎟⎥⎪
+ ⊥ < a ≠ b ≡ c ≤ d ≪ ⊤ ⇒ (⟦A⟧ ⇔ ⟪B⟫), ⎪⎢⎜ ⎲ ⎟⎥⎪
+ ⎪⎢⎜ ⎳aⁱ-bⁱ⎟⎥⎪
+ 2H₂ + O₂ ⇌ 2H₂O, R = 4.7 kΩ, ⌀ 200 mm ⎩⎣⎝i=1 ⎠⎦⎭
+
+Linguistics and dictionaries:
+
+ ði ıntəˈnæʃənəl fəˈnɛtık əsoʊsiˈeıʃn
+ Y [ˈʏpsilɔn], Yen [jɛn], Yoga [ˈjoːgɑ]
+
+APL:
+
+ ((V⍳V)=⍳⍴V)/V←,V ⌷←⍳→⍴∆∇⊃‾⍎⍕⌈
+
+Nicer typography in plain text files:
+
+ ╔══════════════════════════════════════════╗
+ ║ ║
+ ║ • ‘single’ and “double” quotes ║
+ ║ ║
+ ║ • Curly apostrophes: “We’ve been here” ║
+ ║ ║
+ ║ • Latin-1 apostrophe and accents: '´` ║
+ ║ ║
+ ║ • ‚deutsche‘ „Anführungszeichen“ ║
+ ║ ║
+ ║ • †, ‡, ‰, •, 3–4, —, −5/+5, ™, … ║
+ ║ ║
+ ║ • ASCII safety test: 1lI|, 0OD, 8B ║
+ ║ ╭─────────╮ ║
+ ║ • the euro symbol: │ 14.95 € │ ║
+ ║ ╰─────────╯ ║
+ ╚══════════════════════════════════════════╝
+
+Combining characters:
+
+ STARGΛ̊TE SG-1, a = v̇ = r̈, a⃑ ⊥ b⃑
+
+Greek (in Polytonic):
+
+ The Greek anthem:
+
+ Σὲ γνωρίζω ἀπὸ τὴν κόψη
+ τοῦ σπαθιοῦ τὴν τρομερή,
+ σὲ γνωρίζω ἀπὸ τὴν ὄψη
+ ποὺ μὲ βία μετράει τὴ γῆ.
+
+ ᾿Απ᾿ τὰ κόκκαλα βγαλμένη
+ τῶν ῾Ελλήνων τὰ ἱερά
+ καὶ σὰν πρῶτα ἀνδρειωμένη
+ χαῖρε, ὦ χαῖρε, ᾿Ελευθεριά!
+
+ From a speech of Demosthenes in the 4th century BC:
+
+ Οὐχὶ ταὐτὰ παρίσταταί μοι γιγνώσκειν, ὦ ἄνδρες ᾿Αθηναῖοι,
+ ὅταν τ᾿ εἰς τὰ πράγματα ἀποβλέψω καὶ ὅταν πρὸς τοὺς
+ λόγους οὓς ἀκούω· τοὺς μὲν γὰρ λόγους περὶ τοῦ
+ τιμωρήσασθαι Φίλιππον ὁρῶ γιγνομένους, τὰ δὲ πράγματ᾿
+ εἰς τοῦτο προήκοντα, ὥσθ᾿ ὅπως μὴ πεισόμεθ᾿ αὐτοὶ
+ πρότερον κακῶς σκέψασθαι δέον. οὐδέν οὖν ἄλλο μοι δοκοῦσιν
+ οἱ τὰ τοιαῦτα λέγοντες ἢ τὴν ὑπόθεσιν, περὶ ἧς βουλεύεσθαι,
+ οὐχὶ τὴν οὖσαν παριστάντες ὑμῖν ἁμαρτάνειν. ἐγὼ δέ, ὅτι μέν
+ ποτ᾿ ἐξῆν τῇ πόλει καὶ τὰ αὑτῆς ἔχειν ἀσφαλῶς καὶ Φίλιππον
+ τιμωρήσασθαι, καὶ μάλ᾿ ἀκριβῶς οἶδα· ἐπ᾿ ἐμοῦ γάρ, οὐ πάλαι
+ γέγονεν ταῦτ᾿ ἀμφότερα· νῦν μέντοι πέπεισμαι τοῦθ᾿ ἱκανὸν
+ προλαβεῖν ἡμῖν εἶναι τὴν πρώτην, ὅπως τοὺς συμμάχους
+ σώσομεν. ἐὰν γὰρ τοῦτο βεβαίως ὑπάρξῃ, τότε καὶ περὶ τοῦ
+ τίνα τιμωρήσεταί τις καὶ ὃν τρόπον ἐξέσται σκοπεῖν· πρὶν δὲ
+ τὴν ἀρχὴν ὀρθῶς ὑποθέσθαι, μάταιον ἡγοῦμαι περὶ τῆς
+ τελευτῆς ὁντινοῦν ποιεῖσθαι λόγον.
+
+ Δημοσθένους, Γ´ ᾿Ολυνθιακὸς
+
+Georgian:
+
+ From a Unicode conference invitation:
+
+ გთხოვთ ახლავე გაიაროთ რეგისტრაცია Unicode-ის მეათე საერთაშორისო
+ კონფერენციაზე დასასწრებად, რომელიც გაიმართება 10-12 მარტს,
+ ქ. მაინცში, გერმანიაში. კონფერენცია შეჰკრებს ერთად მსოფლიოს
+ ექსპერტებს ისეთ დარგებში როგორიცაა ინტერნეტი და Unicode-ი,
+ ინტერნაციონალიზაცია და ლოკალიზაცია, Unicode-ის გამოყენება
+ ოპერაციულ სისტემებსა, და გამოყენებით პროგრამებში, შრიფტებში,
+ ტექსტების დამუშავებასა და მრავალენოვან კომპიუტერულ სისტემებში.
+
+Russian:
+
+ From a Unicode conference invitation:
+
+ Зарегистрируйтесь сейчас на Десятую Международную Конференцию по
+ Unicode, которая состоится 10-12 марта 1997 года в Майнце в Германии.
+ Конференция соберет широкий круг экспертов по вопросам глобального
+ Интернета и Unicode, локализации и интернационализации, воплощению и
+ применению Unicode в различных операционных системах и программных
+ приложениях, шрифтах, верстке и многоязычных компьютерных системах.
+
+Thai (UCS Level 2):
+
+ Excerpt from a poetry on The Romance of The Three Kingdoms (a Chinese
+ classic 'San Gua'):
+
+ [----------------------------|------------------------]
+ ๏ แผ่นดินฮั่นเสื่อมโทรมแสนสังเวช พระปกเกศกองบู๊กู้ขึ้นใหม่
+ สิบสองกษัตริย์ก่อนหน้าแลถัดไป สององค์ไซร้โง่เขลาเบาปัญญา
+ ทรงนับถือขันทีเป็นที่พึ่ง บ้านเมืองจึงวิปริตเป็นนักหนา
+ โฮจิ๋นเรียกทัพทั่วหัวเมืองมา หมายจะฆ่ามดชั่วตัวสำคัญ
+ เหมือนขับไสไล่เสือจากเคหา รับหมาป่าเข้ามาเลยอาสัญ
+ ฝ่ายอ้องอุ้นยุแยกให้แตกกัน ใช้สาวนั้นเป็นชนวนชื่นชวนใจ
+ พลันลิฉุยกุยกีกลับก่อเหตุ ช่างอาเพศจริงหนาฟ้าร้องไห้
+ ต้องรบราฆ่าฟันจนบรรลัย ฤๅหาใครค้ำชูกู้บรรลังก์ ฯ
+
+ (The above is a two-column text. If combining characters are handled
+ correctly, the lines of the second column should be aligned with the
+ | character above.)
+
+Ethiopian:
+
+ Proverbs in the Amharic language:
+
+ ሰማይ አይታረስ ንጉሥ አይከሰስ።
+ ብላ ካለኝ እንደአባቴ በቆመጠኝ።
+ ጌጥ ያለቤቱ ቁምጥና ነው።
+ ደሀ በሕልሙ ቅቤ ባይጠጣ ንጣት በገደለው።
+ የአፍ ወለምታ በቅቤ አይታሽም።
+ አይጥ በበላ ዳዋ ተመታ።
+ ሲተረጉሙ ይደረግሙ።
+ ቀስ በቀስ፥ ዕንቁላል በእግሩ ይሄዳል።
+ ድር ቢያብር አንበሳ ያስር።
+ ሰው እንደቤቱ እንጅ እንደ ጉረቤቱ አይተዳደርም።
+ እግዜር የከፈተውን ጉሮሮ ሳይዘጋው አይድርም።
+ የጎረቤት ሌባ፥ ቢያዩት ይስቅ ባያዩት ያጠልቅ።
+ ሥራ ከመፍታት ልጄን ላፋታት።
+ ዓባይ ማደሪያ የለው፥ ግንድ ይዞ ይዞራል።
+ የእስላም አገሩ መካ የአሞራ አገሩ ዋርካ።
+ ተንጋሎ ቢተፉ ተመልሶ ባፉ።
+ ወዳጅህ ማር ቢሆን ጨርስህ አትላሰው።
+ እግርህን በፍራሽህ ልክ ዘርጋ።
+
+Runes:
+
+ ᚻᛖ ᚳᚹᚫᚦ ᚦᚫᛏ ᚻᛖ ᛒᚢᛞᛖ ᚩᚾ ᚦᚫᛗ ᛚᚪᚾᛞᛖ ᚾᚩᚱᚦᚹᛖᚪᚱᛞᚢᛗ ᚹᛁᚦ ᚦᚪ ᚹᛖᛥᚫ
+
+ (Old English, which transcribed into Latin reads 'He cwaeth that he
+ bude thaem lande northweardum with tha Westsae.' and means 'He said
+ that he lived in the northern land near the Western Sea.')
+
+Braille:
+
+ ⡌⠁⠧⠑ ⠼⠁⠒ ⡍⠜⠇⠑⠹⠰⠎ ⡣⠕⠌
+
+ ⡍⠜⠇⠑⠹ ⠺⠁⠎ ⠙⠑⠁⠙⠒ ⠞⠕ ⠃⠑⠛⠔ ⠺⠊⠹⠲ ⡹⠻⠑ ⠊⠎ ⠝⠕ ⠙⠳⠃⠞
+ ⠱⠁⠞⠑⠧⠻ ⠁⠃⠳⠞ ⠹⠁⠞⠲ ⡹⠑ ⠗⠑⠛⠊⠌⠻ ⠕⠋ ⠙⠊⠎ ⠃⠥⠗⠊⠁⠇ ⠺⠁⠎
+ ⠎⠊⠛⠝⠫ ⠃⠹ ⠹⠑ ⠊⠇⠻⠛⠹⠍⠁⠝⠂ ⠹⠑ ⠊⠇⠻⠅⠂ ⠹⠑ ⠥⠝⠙⠻⠞⠁⠅⠻⠂
+ ⠁⠝⠙ ⠹⠑ ⠡⠊⠑⠋ ⠍⠳⠗⠝⠻⠲ ⡎⠊⠗⠕⠕⠛⠑ ⠎⠊⠛⠝⠫ ⠊⠞⠲ ⡁⠝⠙
+ ⡎⠊⠗⠕⠕⠛⠑⠰⠎ ⠝⠁⠍⠑ ⠺⠁⠎ ⠛⠕⠕⠙ ⠥⠏⠕⠝ ⠰⡡⠁⠝⠛⠑⠂ ⠋⠕⠗ ⠁⠝⠹⠹⠔⠛ ⠙⠑
+ ⠡⠕⠎⠑ ⠞⠕ ⠏⠥⠞ ⠙⠊⠎ ⠙⠁⠝⠙ ⠞⠕⠲
+
+ ⡕⠇⠙ ⡍⠜⠇⠑⠹ ⠺⠁⠎ ⠁⠎ ⠙⠑⠁⠙ ⠁⠎ ⠁ ⠙⠕⠕⠗⠤⠝⠁⠊⠇⠲
+
+ ⡍⠔⠙⠖ ⡊ ⠙⠕⠝⠰⠞ ⠍⠑⠁⠝ ⠞⠕ ⠎⠁⠹ ⠹⠁⠞ ⡊ ⠅⠝⠪⠂ ⠕⠋ ⠍⠹
+ ⠪⠝ ⠅⠝⠪⠇⠫⠛⠑⠂ ⠱⠁⠞ ⠹⠻⠑ ⠊⠎ ⠏⠜⠞⠊⠊⠥⠇⠜⠇⠹ ⠙⠑⠁⠙ ⠁⠃⠳⠞
+ ⠁ ⠙⠕⠕⠗⠤⠝⠁⠊⠇⠲ ⡊ ⠍⠊⠣⠞ ⠙⠁⠧⠑ ⠃⠑⠲ ⠔⠊⠇⠔⠫⠂ ⠍⠹⠎⠑⠇⠋⠂ ⠞⠕
+ ⠗⠑⠛⠜⠙ ⠁ ⠊⠕⠋⠋⠔⠤⠝⠁⠊⠇ ⠁⠎ ⠹⠑ ⠙⠑⠁⠙⠑⠌ ⠏⠊⠑⠊⠑ ⠕⠋ ⠊⠗⠕⠝⠍⠕⠝⠛⠻⠹
+ ⠔ ⠹⠑ ⠞⠗⠁⠙⠑⠲ ⡃⠥⠞ ⠹⠑ ⠺⠊⠎⠙⠕⠍ ⠕⠋ ⠳⠗ ⠁⠝⠊⠑⠌⠕⠗⠎
+ ⠊⠎ ⠔ ⠹⠑ ⠎⠊⠍⠊⠇⠑⠆ ⠁⠝⠙ ⠍⠹ ⠥⠝⠙⠁⠇⠇⠪⠫ ⠙⠁⠝⠙⠎
+ ⠩⠁⠇⠇ ⠝⠕⠞ ⠙⠊⠌⠥⠗⠃ ⠊⠞⠂ ⠕⠗ ⠹⠑ ⡊⠳⠝⠞⠗⠹⠰⠎ ⠙⠕⠝⠑ ⠋⠕⠗⠲ ⡹⠳
+ ⠺⠊⠇⠇ ⠹⠻⠑⠋⠕⠗⠑ ⠏⠻⠍⠊⠞ ⠍⠑ ⠞⠕ ⠗⠑⠏⠑⠁⠞⠂ ⠑⠍⠏⠙⠁⠞⠊⠊⠁⠇⠇⠹⠂ ⠹⠁⠞
+ ⡍⠜⠇⠑⠹ ⠺⠁⠎ ⠁⠎ ⠙⠑⠁⠙ ⠁⠎ ⠁ ⠙⠕⠕⠗⠤⠝⠁⠊⠇⠲
+
+ (The first couple of paragraphs of "A Christmas Carol" by Dickens)
+
+Compact font selection example text:
+
+ ABCDEFGHIJKLMNOPQRSTUVWXYZ /0123456789
+ abcdefghijklmnopqrstuvwxyz £©µÀÆÖÞßéöÿ
+ –—‘“”„†•…‰™œŠŸž€ ΑΒΓΔΩαβγδω АБВГДабвгд
+ ∀∂∈ℝ∧∪≡∞ ↑↗↨↻⇣ ┐┼╔╘░►☺♀ fi�⑀₂ἠḂӥẄɐː⍎אԱა
+
+Greetings in various languages:
+
+ Hello world, Καλημέρα κόσμε, コンニチハ
+
+Box drawing alignment tests: █
+ ▉
+ ╔══╦══╗ ┌──┬──┐ ╭──┬──╮ ╭──┬──╮ ┏━━┳━━┓ ┎┒┏┑ ╷ ╻ ┏┯┓ ┌┰┐ ▊ ╱╲╱╲╳╳╳
+ ║┌─╨─┐║ │╔═╧═╗│ │╒═╪═╕│ │╓─╁─╖│ ┃┌─╂─┐┃ ┗╃╄┙ ╶┼╴╺╋╸┠┼┨ ┝╋┥ ▋ ╲╱╲╱╳╳╳
+ ║│╲ ╱│║ │║ ║│ ││ │ ││ │║ ┃ ║│ ┃│ ╿ │┃ ┍╅╆┓ ╵ ╹ ┗┷┛ └┸┘ ▌ ╱╲╱╲╳╳╳
+ ╠╡ ╳ ╞╣ ├╢ ╟┤ ├┼─┼─┼┤ ├╫─╂─╫┤ ┣┿╾┼╼┿┫ ┕┛┖┚ ┌┄┄┐ ╎ ┏┅┅┓ ┋ ▍ ╲╱╲╱╳╳╳
+ ║│╱ ╲│║ │║ ║│ ││ │ ││ │║ ┃ ║│ ┃│ ╽ │┃ ░░▒▒▓▓██ ┊ ┆ ╎ ╏ ┇ ┋ ▎
+ ║└─╥─┘║ │╚═╤═╝│ │╘═╪═╛│ │╙─╀─╜│ ┃└─╂─┘┃ ░░▒▒▓▓██ ┊ ┆ ╎ ╏ ┇ ┋ ▏
+ ╚══╩══╝ └──┴──┘ ╰──┴──╯ ╰──┴──╯ ┗━━┻━━┛ ▗▄▖▛▀▜ └╌╌┘ ╎ ┗╍╍┛ ┋ ▁▂▃▄▅▆▇█
+ ▝▀▘▙▄▟
+Sentences that contain all letters commonly used in a language
+--------------------------------------------------------------
+
+Markus Kuhn -- 2001-09-02
+
+This file is UTF-8 encoded.
+
+
+Danish (da)
+---------
+
+ Quizdeltagerne spiste jordbær med fløde, mens cirkusklovnen
+ Wolther spillede på xylofon.
+ (= Quiz contestants were eating strawbery with cream while Wolther
+ the circus clown played on xylophone.)
+
+German (de)
+-----------
+
+ Falsches Üben von Xylophonmusik quält jeden größeren Zwerg
+ (= Wrongful practicing of xylophone music tortures every larger dwarf)
+
+ Zwölf Boxkämpfer jagten Eva quer über den Sylter Deich
+ (= Twelve boxing fighters hunted Eva across the dike of Sylt)
+
+ Heizölrückstoßabdämpfung
+ (= fuel oil recoil absorber)
+ (jqvwxy missing, but all non-ASCII letters in one word)
+
+English (en)
+------------
+
+ The quick brown fox jumps over the lazy dog
+
+Spanish (es)
+------------
+
+ El pingüino Wenceslao hizo kilómetros bajo exhaustiva lluvia y
+ frío, añoraba a su querido cachorro.
+ (Contains every letter and every accent, but not every combination
+ of vowel + acute.)
+
+French (fr)
+-----------
+
+ Portez ce vieux whisky au juge blond qui fume sur son île intérieure, à
+ côté de l'alcôve ovoïde, où les bûches se consument dans l'âtre, ce
+ qui lui permet de penser à la cænogenèse de l'être dont il est question
+ dans la cause ambiguë entendue à Moÿ, dans un capharnaüm qui,
+ pense-t-il, diminue çà et là la qualité de son œuvre.
+
+ l'île exiguë
+ Où l'obèse jury mûr
+ Fête l'haï volapük,
+ Âne ex aéquo au whist,
+ Ôtez ce vœu déçu.
+
+ Le cœur déçu mais l'âme plutôt naïve, Louÿs rêva de crapaüter en
+ canoë au delà des îles, près du mälström où brûlent les novæ.
+
+Irish Gaelic (ga)
+-----------------
+
+ D'fhuascail Íosa, Úrmhac na hÓighe Beannaithe, pór Éava agus Ádhaimh
+
+Hungarian (hu)
+--------------
+
+ Árvíztűrő tükörfúrógép
+ (= flood-proof mirror-drilling machine, only all non-ASCII letters)
+
+Icelandic (is)
+--------------
+
+ Kæmi ný öxi hér ykist þjófum nú bæði víl og ádrepa
+
+ Sævör grét áðan því úlpan var ónýt
+ (some ASCII letters missing)
+
+Japanese (jp)
+-------------
+
+ Hiragana: (Iroha)
+
+ いろはにほへとちりぬるを
+ わかよたれそつねならむ
+ うゐのおくやまけふこえて
+ あさきゆめみしゑひもせす
+
+ Katakana:
+
+ イロハニホヘト チリヌルヲ ワカヨタレソ ツネナラム
+ ウヰノオクヤマ ケフコエテ アサキユメミシ ヱヒモセスン
+
+Hebrew (iw)
+-----------
+
+ ? דג סקרן שט בים מאוכזב ולפתע מצא לו חברה איך הקליטה
+
+Polish (pl)
+-----------
+
+ Pchnąć w tę łódź jeża lub ośm skrzyń fig
+ (= To push a hedgehog or eight bins of figs in this boat)
+
+Russian (ru)
+------------
+
+ В чащах юга жил бы цитрус? Да, но фальшивый экземпляр!
+ (= Would a citrus live in the bushes of south? Yes, but only a fake one!)
+
+Thai (th)
+---------
+
+ [--------------------------|------------------------]
+ ๏ เป็นมนุษย์สุดประเสริฐเลิศคุณค่า กว่าบรรดาฝูงสัตว์เดรัจฉาน
+ จงฝ่าฟันพัฒนาวิชาการ อย่าล้างผลาญฤๅเข่นฆ่าบีฑาใคร
+ ไม่ถือโทษโกรธแช่งซัดฮึดฮัดด่า หัดอภัยเหมือนกีฬาอัชฌาสัย
+ ปฏิบัติประพฤติกฎกำหนดใจ พูดจาให้จ๊ะๆ จ๋าๆ น่าฟังเอย ฯ
+
+ [The copyright for the Thai example is owned by The Computer
+ Association of Thailand under the Royal Patronage of His Majesty the
+ King.]
+
+Please let me know if you find others! Special thanks to the people
+from all over the world who contributed these sentences.
+? *Unicode Transcriptions* Notes <#Notes>
+
+Glyphs | Samples
+ | Charts
+ | UTF
+