Get rid of the cp parameter.

This is an intermediate commit and the build might be broken.
The library will not be functional.
This commit is contained in:
King_DuckZ 2014-06-30 10:43:52 +02:00
parent fd3a7a40e9
commit 1f805eb858
6 changed files with 234 additions and 184 deletions

View file

@ -29,6 +29,7 @@ DEALINGS IN THE SOFTWARE.
#define UTF8_FOR_CPP_2675DCD0_9480_4c0c_B92A_CC14C027B731
#include "utf8/functions.hpp"
#include "utf8/iterator.hpp"
#include "utf8/unchecked.hpp"
#endif // header guard

View file

@ -131,44 +131,33 @@ namespace internal
return false;
}
enum utf_error {UTF8_OK, NOT_ENOUGH_ROOM, INVALID_LEAD, INCOMPLETE_SEQUENCE, OVERLONG_SEQUENCE, INVALID_CODE_POINT};
enum utf_error {UTF8_OK, INVALID_LEAD, INCOMPLETE_SEQUENCE, OVERLONG_SEQUENCE, INVALID_CODE_POINT};
/// Helper for get_sequence_x
template <typename octet_iterator>
utf_error increase_safely(octet_iterator& it, octet_iterator end)
utf_error increase_safely(octet_iterator& it)
{
if (++it == end)
return NOT_ENOUGH_ROOM;
if (!utf8::internal::is_trail(*it))
return INCOMPLETE_SEQUENCE;
return UTF8_OK;
}
#define UTF8_CPP_INCREASE_AND_RETURN_ON_ERROR(IT, END) {utf_error ret = increase_safely(IT, END); if (ret != UTF8_OK) return ret;}
#define UTF8_CPP_INCREASE_AND_RETURN_ON_ERROR(IT) {utf_error ret = increase_safely(IT); if (ret != UTF8_OK) return ret;}
/// get_sequence_x functions decode utf-8 sequences of the length x
template <typename octet_iterator>
utf_error get_sequence_1(octet_iterator& it, octet_iterator end, uint32_t& code_point)
void get_sequence_1(octet_iterator& it, uint32_t& code_point)
{
if (it == end)
return NOT_ENOUGH_ROOM;
code_point = utf8::internal::mask8(*it);
return UTF8_OK;
}
template <typename octet_iterator>
utf_error get_sequence_2(octet_iterator& it, octet_iterator end, uint32_t& code_point)
utf_error get_sequence_2(octet_iterator& it, uint32_t& code_point)
{
if (it == end)
return NOT_ENOUGH_ROOM;
code_point = utf8::internal::mask8(*it);
UTF8_CPP_INCREASE_AND_RETURN_ON_ERROR(it, end)
UTF8_CPP_INCREASE_AND_RETURN_ON_ERROR(it)
code_point = ((code_point << 6) & 0x7ff) + ((*it) & 0x3f);
@ -176,18 +165,15 @@ namespace internal
}
template <typename octet_iterator>
utf_error get_sequence_3(octet_iterator& it, octet_iterator end, uint32_t& code_point)
utf_error get_sequence_3(octet_iterator& it, uint32_t& code_point)
{
if (it == end)
return NOT_ENOUGH_ROOM;
code_point = utf8::internal::mask8(*it);
UTF8_CPP_INCREASE_AND_RETURN_ON_ERROR(it, end)
UTF8_CPP_INCREASE_AND_RETURN_ON_ERROR(it)
code_point = ((code_point << 12) & 0xffff) + ((utf8::internal::mask8(*it) << 6) & 0xfff);
UTF8_CPP_INCREASE_AND_RETURN_ON_ERROR(it, end)
UTF8_CPP_INCREASE_AND_RETURN_ON_ERROR(it)
code_point += (*it) & 0x3f;
@ -195,22 +181,19 @@ namespace internal
}
template <typename octet_iterator>
utf_error get_sequence_4(octet_iterator& it, octet_iterator end, uint32_t& code_point)
utf_error get_sequence_4(octet_iterator& it, uint32_t& code_point)
{
if (it == end)
return NOT_ENOUGH_ROOM;
code_point = utf8::internal::mask8(*it);
UTF8_CPP_INCREASE_AND_RETURN_ON_ERROR(it, end)
UTF8_CPP_INCREASE_AND_RETURN_ON_ERROR(it)
code_point = ((code_point << 18) & 0x1fffff) + ((utf8::internal::mask8(*it) << 12) & 0x3ffff);
UTF8_CPP_INCREASE_AND_RETURN_ON_ERROR(it, end)
UTF8_CPP_INCREASE_AND_RETURN_ON_ERROR(it)
code_point += (utf8::internal::mask8(*it) << 6) & 0xfff;
UTF8_CPP_INCREASE_AND_RETURN_ON_ERROR(it, end)
UTF8_CPP_INCREASE_AND_RETURN_ON_ERROR(it)
code_point += (*it) & 0x3f;
@ -220,7 +203,7 @@ namespace internal
#undef UTF8_CPP_INCREASE_AND_RETURN_ON_ERROR
template <typename octet_iterator>
utf_error validate_next(octet_iterator& it, octet_iterator end, uint32_t& code_point)
utf_error validate_next(octet_iterator& it, uint32_t& code_point)
{
// Save the original value of it so we can go back in case of failure
// Of course, it does not make much sense with e.g. stream iterators
@ -237,16 +220,16 @@ namespace internal
case 0:
return INVALID_LEAD;
case 1:
err = utf8::internal::get_sequence_1(it, end, cp);
utf8::internal::get_sequence_1(it, cp);
break;
case 2:
err = utf8::internal::get_sequence_2(it, end, cp);
err = utf8::internal::get_sequence_2(it, cp);
break;
case 3:
err = utf8::internal::get_sequence_3(it, end, cp);
err = utf8::internal::get_sequence_3(it, cp);
break;
case 4:
err = utf8::internal::get_sequence_4(it, end, cp);
err = utf8::internal::get_sequence_4(it, cp);
break;
}
@ -272,9 +255,9 @@ namespace internal
}
template <typename octet_iterator>
inline utf_error validate_next(octet_iterator& it, octet_iterator end) {
inline utf_error validate_next(octet_iterator& it) {
uint32_t ignored;
return utf8::internal::validate_next(it, end, ignored);
return utf8::internal::validate_next(it, ignored);
}
} // namespace internal
@ -289,7 +272,7 @@ namespace internal
{
octet_iterator result = start;
while (result != end) {
utf8::internal::utf_error err_code = utf8::internal::validate_next(result, end);
utf8::internal::utf_error err_code = utf8::internal::validate_next(result);
if (err_code != internal::UTF8_OK)
return result;
}

View file

@ -1,4 +1,5 @@
// Copyright 2006 Nemanja Trifunovic
// Copyright 2014 Michele Santullo
/*
Permission is hereby granted, free of charge, to any person or organization
@ -30,13 +31,10 @@ DEALINGS IN THE SOFTWARE.
#include "core.hpp"
#include "exception.hpp"
#include <cassert>
#include <cstddef>
namespace utf8
{
/// The library API - functions intended to be called by the users
template <typename octet_iterator>
octet_iterator append(uint32_t cp, octet_iterator result)
{
@ -68,14 +66,12 @@ namespace utf8
{
while (start != end) {
octet_iterator sequence_start = start;
internal::utf_error err_code = utf8::internal::validate_next(start, end);
internal::utf_error err_code = utf8::internal::validate_next(start);
switch (err_code) {
case internal::UTF8_OK :
for (octet_iterator it = sequence_start; it != start; ++it)
*out++ = *it;
break;
case internal::NOT_ENOUGH_ROOM:
throw not_enough_room();
case internal::INVALID_LEAD:
out = utf8::append (replacement, out);
++start;
@ -102,15 +98,13 @@ namespace utf8
}
template <typename octet_iterator>
uint32_t next(octet_iterator& it, octet_iterator end)
uint32_t next(octet_iterator& it)
{
uint32_t cp = 0;
internal::utf_error err_code = utf8::internal::validate_next(it, end, cp);
internal::utf_error err_code = utf8::internal::validate_next(it, cp);
switch (err_code) {
case internal::UTF8_OK :
break;
case internal::NOT_ENOUGH_ROOM :
throw not_enough_room();
case internal::INVALID_LEAD :
case internal::INCOMPLETE_SEQUENCE :
case internal::OVERLONG_SEQUENCE :
@ -122,43 +116,25 @@ namespace utf8
}
template <typename octet_iterator>
uint32_t peek_next(octet_iterator it, octet_iterator end)
uint32_t peek_next(octet_iterator it)
{
return utf8::next(it, end);
return utf8::next(it);
}
template <typename octet_iterator>
uint32_t prior(octet_iterator& it, octet_iterator start)
uint32_t prior(octet_iterator& it)
{
// can't do much if it == start
if (it == start)
throw not_enough_room();
octet_iterator end = it;
// Go back until we hit either a lead octet or start
while (utf8::internal::is_trail(*(--it)))
if (it == start)
throw invalid_utf8(*it); // error - no lead byte in the sequence
return utf8::peek_next(it, end);
}
/// Deprecated in versions that include "prior"
template <typename octet_iterator>
uint32_t previous(octet_iterator& it, octet_iterator pass_start)
{
octet_iterator end = it;
while (utf8::internal::is_trail(*(--it)))
if (it == pass_start)
throw invalid_utf8(*it); // error - no lead byte in the sequence
octet_iterator temp = it;
return utf8::next(temp, end);
while (utf8::internal::is_trail(*(--it)));
return utf8::peek_next(it);
}
template <typename octet_iterator, typename distance_type>
void advance (octet_iterator& it, distance_type n, octet_iterator end)
void advance (octet_iterator& it, distance_type n)
{
for (distance_type i = 0; i < n; ++i)
utf8::next(it, end);
utf8::next(it);
}
template <typename octet_iterator>
@ -167,7 +143,7 @@ namespace utf8
{
typename std::iterator_traits<octet_iterator>::difference_type dist;
for (dist = 0; first < last; ++dist)
utf8::next(first, last);
utf8::next(first);
return dist;
}
@ -202,7 +178,7 @@ namespace utf8
u16bit_iterator utf8to16 (octet_iterator start, octet_iterator end, u16bit_iterator result)
{
while (start != end) {
uint32_t cp = utf8::next(start, end);
uint32_t cp = utf8::next(start);
if (cp > 0xffff) { //make a surrogate pair
*result++ = static_cast<uint16_t>((cp >> 10) + internal::LEAD_OFFSET);
*result++ = static_cast<uint16_t>((cp & 0x3ff) + internal::TRAIL_SURROGATE_MIN);
@ -226,110 +202,10 @@ namespace utf8
u32bit_iterator utf8to32 (octet_iterator start, octet_iterator end, u32bit_iterator result)
{
while (start != end)
(*result++) = utf8::next(start, end);
(*result++) = utf8::next(start);
return result;
}
// Error policies for the iterator class
template <typename I>
class ErrorPolicyThrow {
public:
static void check_in_range(const I& it, const I& range_start, const I& range_end)
{
if (it < range_start || it > range_end)
throw std::out_of_range("Invalid utf-8 iterator position");
}
static void check_same_range(const I& range_start_a, const I& range_start_b, const I& range_end_a, const I& range_end_b)
{
if (range_start_a != range_start_b || range_end_a != range_end_b)
throw std::logic_error("Comparing utf-8 iterators defined with different ranges");
}
};
template <typename I>
class ErrorPolicyAssert {
public:
static void check_in_range(const I& it, const I& range_start, const I& range_end)
{
#if defined(NDEBUG)
(void)it;
(void)range_start;
(void)range_end;
#else
assert(it >= range_start && it <= range_end);
#endif
}
static void check_same_range(const I& range_start_a, const I& range_start_b, const I& range_end_a, const I& range_end_b)
{
#if defined(NDEBUG)
(void)range_start_a;
(void)range_start_b;
(void)range_end_a;
(void)range_end_b;
#else
assert(range_start_a == range_start_b && range_end_a == range_end_b);
#endif
}
};
// The iterator class
template <
typename octet_iterator,
typename error_policy=ErrorPolicyThrow<octet_iterator>
>
class iterator : public std::iterator <std::bidirectional_iterator_tag, uint32_t, std::ptrdiff_t, uint32_t*, uint32_t> {
octet_iterator it;
octet_iterator range_start;
octet_iterator range_end;
public:
iterator () {}
explicit iterator (const octet_iterator& octet_it,
const octet_iterator& range_start,
const octet_iterator& range_end) :
it(octet_it), range_start(range_start), range_end(range_end)
{
error_policy::check_in_range(it, range_start, range_end);
}
// the default "big three" are OK
octet_iterator base () const { return it; }
uint32_t operator * () const
{
octet_iterator temp = it;
return utf8::next(temp, range_end);
}
bool operator == (const iterator& rhs) const
{
error_policy::check_same_range(range_start, rhs.range_start, range_end, rhs.range_end);
return (it == rhs.it);
}
bool operator != (const iterator& rhs) const
{
return !(operator == (rhs));
}
iterator& operator ++ ()
{
utf8::next(it, range_end);
return *this;
}
iterator operator ++ (int)
{
iterator temp = *this;
utf8::next(it, range_end);
return temp;
}
iterator& operator -- ()
{
utf8::prior(it, range_start);
return *this;
}
iterator operator -- (int)
{
iterator temp = *this;
utf8::prior(it, range_start);
return temp;
}
}; // class iterator
} // namespace utf8
#endif //header guard

187
src/utf8/iterator.hpp Normal file
View file

@ -0,0 +1,187 @@
// Copyright 2014 Michele Santullo
/*
Permission is hereby granted, free of charge, to any person or organization
obtaining a copy of the software and accompanying documentation covered by
this license (the "Software") to use, reproduce, display, distribute,
execute, and transmit the Software, and to prepare derivative works of the
Software, and to permit third-parties to whom the Software is furnished to
do so, all subject to the following:
The copyright notices in the Software and this entire statement, including
the above license grant, this restriction and the following disclaimer,
must be included in all copies of the Software, in whole or in part, and
all derivative works of the Software, unless such copies or derivative
works are solely in the form of machine-executable object code generated by
a source language processor.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT
SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE
FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE,
ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
DEALINGS IN THE SOFTWARE.
*/
#ifndef idBC7032D05A9B41979D44D8E6BF40BAA9
#define idBC7032D05A9B41979D44D8E6BF40BAA9
#include <cassert>
#include <iterator>
#include <stdexcept>
#include <utility>
namespace utf8 {
// Error policies for the iterator class
/// Range-checking policy that reports violations by throwing.
///
/// An instance stores one pair of iterators of type I, taken at construction,
/// and offers two checks through its call operators:
///  - one argument: the iterator must not compare less than the stored start
///    nor greater than the stored end (the end position itself is accepted);
///  - two arguments: the given pair must compare equal to the stored pair.
///
/// Instances can only be created from an explicit range; default
/// construction, copy/move construction and copy assignment are disabled.
template <typename I>
class range_policy_throw {
public:
	range_policy_throw ( void ) = delete;
	range_policy_throw ( const range_policy_throw& ) = delete;
	range_policy_throw ( range_policy_throw&& ) = delete;
	range_policy_throw& operator= ( const range_policy_throw& ) = delete;

	/// Remember [range_start, range_end] as the range to check against.
	range_policy_throw ( const I& range_start, const I& range_end ) :
		m_range_start(range_start),
		m_range_end(range_end)
	{
	}

	/// Check that `it` lies inside the stored range (both ends included).
	/// @throws std::out_of_range if `it` is before the start or past the end.
	void operator() ( const I& it ) const {
		if (it < m_range_start || it > m_range_end)
			throw std::out_of_range("Invalid utf-8 iterator position");
	}

	/// Check that another iterator's range is the same as the stored one.
	/// Const-qualified like the single-argument check: it only reads the
	/// stored range, so it must be callable on a const policy object too.
	/// @throws std::logic_error if either bound differs from the stored one.
	void operator() ( const I& range_start, const I& range_end ) const {
		if (m_range_start != range_start || m_range_end != range_end)
			throw std::logic_error("Comparing utf-8 iterators defined with different ranges");
	}

private:
	I m_range_start;
	I m_range_end;
};
// template <typename I>
// class range_policy_assert {
// public:
// range_policy_assert (
// static void check_in_range(const I& it, const I& range_start, const I& range_end)
// {
//#if defined(NDEBUG)
// (void)it;
// (void)range_start;
// (void)range_end;
//#else
// assert(it >= range_start && it <= range_end);
//#endif
// }
// static void check_same_range(const I& range_start_a, const I& range_start_b, const I& range_end_a, const I& range_end_b)
// {
//#if defined(NDEBUG)
// (void)range_start_a;
// (void)range_start_b;
// (void)range_end_a;
// (void)range_end_b;
//#else
// assert(range_start_a == range_start_b && range_end_a == range_end_b);
//#endif
// }
// };
/// Error policy that answers every offending code unit with a replacement.
///
/// C is the code-unit type. The call operator does not look at the value it
/// is given; it always yields the character '?' converted to C.
template <typename C>
struct utf_policy_replace {
	C operator() ( C /*value*/ ) const {
		const C replacement = static_cast<C>('?');
		return replacement;
	}
};
/// Error policy that reports an offending code unit by throwing.
///
/// The primary template is only declared; the policy exists for the three
/// code-unit types specialized below. Each call operator returns the same
/// type it receives, matching utf_policy_replace<C>, although in practice it
/// always leaves by exception.
template <typename C> struct utf_policy_throw;

/// 8-bit code units: throws utf8::invalid_utf8 built from the offending value.
template <> struct utf_policy_throw<uint8_t> : private utf_policy_replace<uint8_t> {
	uint8_t operator() ( uint8_t value ) const {
		throw utf8::invalid_utf8(value);
		// Never reached: kept so the non-void function ends in a return.
		return utf_policy_replace<uint8_t>::operator()(value);
	}
};

/// 16-bit code units: throws utf8::invalid_utf16 built from the offending value.
template <> struct utf_policy_throw<uint16_t> : private utf_policy_replace<uint16_t> {
	uint16_t operator() ( uint16_t value ) const {
		throw utf8::invalid_utf16(value);
		// Never reached: kept so the non-void function ends in a return.
		return utf_policy_replace<uint16_t>::operator()(value);
	}
};

/// 32-bit code points: throws utf8::invalid_code_point built from the offending value.
template <> struct utf_policy_throw<uint32_t> : private utf_policy_replace<uint32_t> {
	uint32_t operator() ( uint32_t value ) const {
		throw utf8::invalid_code_point(value);
		// Never reached: kept so the non-void function ends in a return.
		return utf_policy_replace<uint32_t>::operator()(value);
	}
};
/// Error policy that treats an offending code unit as a programming error.
///
/// The call operator asserts unconditionally. When assertions are compiled
/// out (NDEBUG) execution continues and the replacement produced by
/// utf_policy_replace<C> is returned instead.
///
/// The operator takes and returns the policy's own code-unit type C, in line
/// with utf_policy_replace<C>, rather than being fixed to 8-bit units.
template <typename C>
struct utf_policy_assert : private utf_policy_replace<C> {
	C operator() ( C value ) const {
		assert(false);
		return utf_policy_replace<C>::operator()(value);
	}
};
/// The iterator class
///
/// Wraps an iterator over UTF-8 octets and presents the sequence as code
/// points: dereferencing decodes the code point at the current position,
/// incrementing/decrementing moves by one whole code point using
/// utf8::next / utf8::prior.
///
/// The iterator category is taken over from octet_iterator. Because
/// dereferencing yields a freshly decoded value, `reference` is uint32_t
/// (by value), not uint32_t&.
///
/// NOTE(review): utf_error_policy is accepted as a template parameter but is
/// not consulted yet; utf8::next currently takes no policy. The default
/// policy type is only specialized for uint8_t/uint16_t/uint32_t, so it is an
/// incomplete type for e.g. char-based iterators - confirm the intended
/// design before starting to instantiate it.
template <
	typename octet_iterator,
	typename utf_error_policy=utf_policy_throw<typename std::iterator_traits<octet_iterator>::value_type>
>
class iterator : public std::iterator <
	typename std::iterator_traits<octet_iterator>::iterator_category,
	uint32_t,
	typename std::iterator_traits<octet_iterator>::difference_type,
	uint32_t*,
	uint32_t
> {
public:
	typedef typename std::iterator_traits<octet_iterator>::difference_type difference_type;

	/// Value-initializes the wrapped octet iterator.
	iterator () : m_it() { }
	iterator ( const iterator& rhs ) = default;
	iterator ( iterator&& rhs ) = default;
	// Declared explicitly: a user-declared move constructor would otherwise
	// leave the copy assignment operator implicitly deleted.
	iterator& operator= ( const iterator& rhs ) = default;
	iterator& operator= ( iterator&& rhs ) = default;
	~iterator ( void ) = default;

	/// Wrap an octet iterator, taking over a temporary by move.
	explicit iterator (octet_iterator&& octet_it) :
		m_it(std::move(octet_it))
	{
	}
	/// Wrap a copy of an octet iterator.
	explicit iterator (const octet_iterator& octet_it) :
		m_it(octet_it)
	{
	}

	/// The wrapped octet iterator at its current position.
	octet_iterator base () const { return m_it; }

	/// Decode the code point at the current position without moving.
	/// Decoding is done on a copy of the wrapped iterator.
	uint32_t operator* () const {
		octet_iterator temp = m_it;
		// TODO: route decoding errors through utf_error_policy once
		// utf8::next accepts a policy.
		return utf8::next(temp);
	}

	bool operator== (const iterator& rhs) const {
		return (m_it == rhs.m_it);
	}
	bool operator!= (const iterator& rhs) const {
		return !(operator== (rhs));
	}

	/// Step forward by one code point.
	iterator& operator++ () {
		utf8::next(m_it);
		return *this;
	}
	iterator operator++ (int) {
		iterator temp = *this;
		++*this;
		return temp;
	}

	/// Step backward by one code point.
	iterator& operator-- () {
		utf8::prior(m_it);
		return *this;
	}
	iterator operator-- (int) {
		iterator temp = *this;
		--*this;
		return temp;
	}

	/// Signed number of code points from rhs to this iterator.
	/// Counted in code points (not octets) so that it agrees with
	/// operator+= and with stepping via operator++.
	difference_type operator- (const iterator& rhs) const {
		return (m_it < rhs.m_it)
			? -count_code_points(m_it, rhs.m_it)
			: count_code_points(rhs.m_it, m_it);
	}

	/// Move forward by inc code points through utf8::advance.
	iterator& operator+= (difference_type inc) {
		utf8::advance(m_it, inc);
		return *this;
	}

private:
	/// Number of code points decoded while walking from first up to last.
	static difference_type count_code_points (octet_iterator first, const octet_iterator& last) {
		difference_type count = 0;
		while (first < last) {
			utf8::next(first);
			++count;
		}
		return count;
	}

public:
	// NOTE(review): left public as in the original; base() already exposes
	// it read-only, so this was possibly meant to be private - confirm.
	octet_iterator m_it;
}; // class iterator
} // namespace utf8
#endif

View file

@ -1,5 +1,8 @@
cmake_minimum_required(VERSION 2.6.4 FATAL_ERROR)
set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} -g -O0 -std=c++11")
set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -O3 -std=c++11")
add_subdirectory(gtest-1.7.0)
set(GTEST_MAIN_CPP "${CMAKE_SOURCE_DIR}/gtest-1.7.0/src/gtest_main.cc")
set(UNITTEST_DATA_DIR "${CMAKE_SOURCE_DIR}/../data")

View file

@ -47,32 +47,32 @@ namespace {
unsigned char_count = 0;
std::string::iterator it = line_start;
while (it != line_end) {
unsigned int next_cp = utf8::peek_next(it, line_end);
EXPECT_EQ(utf8::next(it, line_end), next_cp) << "Line " << line_count << ": Error: peek_next gave a different result than next";
unsigned int next_cp = utf8::peek_next(it);
EXPECT_EQ(utf8::next(it), next_cp) << "Line " << line_count << ": Error: peek_next gave a different result than next";
char_count++;
}
EXPECT_EQ(char_count, utf32_line.size()) << "Line " << line_count << ": Error in iterating with next - wrong number of characters";
std::string::iterator adv_it = line_start;
utf8::advance(adv_it, char_count, line_end);
utf8::advance(adv_it, char_count);
EXPECT_EQ(adv_it, line_end) << "Line " << line_count << ": Error in advance function";
EXPECT_EQ(std::string::size_type(utf8::distance(line_start, line_end)), char_count) << "Line " << line_count << ": Error in distance function";
while (it != line_start) {
utf8::previous(it, line.rend().base());
utf8::prior(it);
char_count--;
}
EXPECT_EQ(char_count, 0) << "Line " << line_count << ": Error in iterating with previous - wrong number of characters";
EXPECT_EQ(char_count, 0) << "Line " << line_count << ": Error in iterating with prior - wrong number of characters";
// Try utf8::iterator
utf8::iterator<std::string::iterator> u8it(line_start, line_start, line_end);
utf8::iterator<std::string::iterator> u8it(line_start);
EXPECT_FALSE(not utf32_line.empty() and *u8it != utf32_line.at(0)) << "Line " << line_count << ": Error in utf::iterator * operator";
const size_t calculatedDist = std::distance(u8it, utf8::iterator<std::string::iterator>(line_end, line_start, line_end));
const size_t calculatedDist = std::distance(u8it, utf8::iterator<std::string::iterator>(line_end));
EXPECT_EQ(calculatedDist, static_cast<int>(utf32_line.size())) <<"Line " << line_count << ": Error in using utf::iterator with std::distance - wrong number of characters";
std::advance(u8it, utf32_line.size());
EXPECT_EQ(u8it, utf8::iterator<std::string::iterator>(line_end, line_start, line_end)) << "Line " << line_count << ": Error in using utf::iterator with std::advance";
EXPECT_EQ(u8it, utf8::iterator<std::string::iterator>(line_end)) << "Line " << line_count << ": Error in using utf::iterator with std::advance";
}
}
} //unnamed namespace