Browse Source

bundle UTF handling functions as static member functions of a UTF class template to circumvent prohibition of partial function specialization. *sigh*

master
roker 8 months ago
parent
commit
8205b7b9de
4 changed files with 113 additions and 69 deletions
  1. +69
    -52
      src/nfc.cc
  2. +36
    -9
      src/nfc.hh
  3. +4
    -4
      test/unittest_nfc.cc
  4. +4
    -4
      test/unittest_nfc16.cc

+ 69
- 52
src/nfc.cc View File

@ -352,52 +352,50 @@ uint32_t parseUtf16(const char16_t*& c, const char16_t* end)
throw unexpected_end(-1);
}
template<class CharT>
uint32_t parseUtf(const CharT*& c, const CharT* end);
template<>
inline
uint32_t parseUtf<char>(const char*& c, const char* end)
uint32_t UTF<char>::parse(const char*& c, const char* end)
{
return parseUtf8(c,end);
}
template<>
inline
uint32_t parseUtf<char16_t>(const char16_t*& c, const char16_t* end)
uint32_t UTF<char16_t>::parse(const char16_t*& c, const char16_t* end)
{
return parseUtf16(c,end);
}
template<>
void toUtf<char>(const char32_t c, std::string& ret)
template<class OutIter>
void UTF<char>::generate(const char32_t c, OutIter& out)
{
if(c<=0x7F)
{
ret += char(c);
*out++ = char(c);
}else if(c<=0x7FF)
{
ret += char( 0xC0 + (c>>6) );
ret += char( 0x80 + (c & 63));
*out++ = char( 0xC0 + (c>>6) );
*out++ = char( 0x80 + (c & 63));
}else if(c<=0xFFFF)
{
ret += char( 0xE0 + (c>>12) );
ret += char( 0x80 + ((c>>6) & 63));
ret += char( 0x80 + (c & 63));
*out++ = char( 0xE0 + (c>>12) );
*out++ = char( 0x80 + ((c>>6) & 63));
*out++ = char( 0x80 + (c & 63));
}else if(c<=0x10FFFF)
{
ret += char( 0xF0 + (c>>18) );
ret += char( 0x80 + ((c>>12) & 63));
ret += char( 0x80 + ((c>>6) & 63));
ret += char( 0x80 + (c & 63));
*out++ = char( 0xF0 + (c>>18) );
*out++ = char( 0x80 + ((c>>12) & 63));
*out++ = char( 0x80 + ((c>>6) & 63));
*out++ = char( 0x80 + (c & 63));
}else{
throw too_big(0, c);
}
}
template<>
void toUtf<char16_t>(const char32_t c, std::u16string& ret)
template<class OutIter>
void UTF<char16_t>::generate(const char32_t c, OutIter& out)
{
if(c <= 0xFFFF)
{
@ -405,7 +403,7 @@ void toUtf<char16_t>(const char32_t c, std::u16string& ret)
{
throw unexpected_surrogate(c);
}else{
ret += char16_t(c);
*out++ = char16_t(c);
}
}else{ // surrogate pair
if(c>0x10FFFF)
@ -413,19 +411,20 @@ void toUtf<char16_t>(const char32_t c, std::u16string& ret)
throw too_big(0, c);
}else{
const uint32_t c_reduced = c - 0x10000;
ret += char16_t(0xD800 + (c_reduced >> 10)); // High Surrogate
ret += char16_t(0xDC00 + (c_reduced & 0x3FF)); // Low Surrogate
*out++ = char16_t(0xD800 + (c_reduced >> 10)); // High Surrogate
*out++ = char16_t(0xDC00 + (c_reduced & 0x3FF)); // Low Surrogate
}
}
}
template<class CharT>
std::basic_string<CharT> toUtf(const std::u32string& u32)
std::basic_string<CharT> UTF<CharT>::generate(const std::u32string& u32)
{
std::basic_string<CharT> ret;
auto out = std::back_inserter(ret);
for(char32_t c : u32)
{
toUtf<CharT>(c, ret);
generate(c, out);
}
return ret;
}
@ -454,7 +453,7 @@ void assert_utf8(string_view s)
{
while(begin<end)
{
parseUtf8(begin, end); // ignore the output
UTF8::parse(begin, end); // ignore the output
++begin;
}
}
@ -467,7 +466,7 @@ void assert_utf8(string_view s)
// creates a NFD string from s
template<class CharT>
std::u32string fromUtf_decompose(basic_string_view<CharT> s)
std::u32string UTF<CharT>::fromUtf_decompose(basic_string_view<CharT> s)
{
std::u32string u32s;
u32s.reserve( static_cast<std::size_t>(s.size()*1.25) );
@ -475,7 +474,7 @@ std::u32string fromUtf_decompose(basic_string_view<CharT> s)
const CharT* end = s.data() + s.size();
for(; begin<end; ++begin)
{
unsigned u = parseUtf(begin, end);
unsigned u = parse(begin, end);
u32s += decompose_full(u);
}
canonicalOrdering(u32s); // works inplace.
@ -552,7 +551,7 @@ std::u32string createNFC(std::u32string nfd)
template<class CharT>
IsNFC isNFC_quick_check(basic_string_view<CharT> s)
IsNFC UTF<CharT>::isNFC_quick_check(basic_string_view<CharT> s)
{
const CharT* begin = s.data();
const CharT* const end = s.data() + s.size();
@ -561,7 +560,7 @@ IsNFC isNFC_quick_check(basic_string_view<CharT> s)
unsigned last_cc = 0;
while(begin<end)
{
const uint32_t u = parseUtf(begin, end);
const uint32_t u = parse(begin, end);
const unsigned cc = canonicalClass(u);
if( (cc!=0) && (last_cc > cc) )
{
@ -582,7 +581,7 @@ IsNFC isNFC_quick_check(basic_string_view<CharT> s)
template<class CharT>
bool isNFC(basic_string_view<CharT> s)
bool UTF<CharT>::isNFC(basic_string_view<CharT> s)
{
switch( isNFC_quick_check(s) )
{
@ -598,19 +597,13 @@ bool isNFC(basic_string_view<CharT> s)
}
template bool isNFC<char>(string_view);
template bool isNFC<char16_t>(u16string_view);
// should be unnecessary, but... well...
template std::string toNFC<char>(string_view);
template std::u16string toNFC<char16_t>(u16string_view);
bool isUtf8(const char* begin, const char* end)
template<>
bool UTF<char>::isUtf(const char* begin, const char* end)
try{
for(; begin<end; ++begin)
{
(void)parseUtf8(begin, end);
(void)parse(begin, end);
}
return true;
}catch(const illegal_utf&)
@ -621,12 +614,39 @@ try{
// s is ''moved'' to the return value if possible so no copy is done here.
template<class CharT>
std::basic_string<CharT> toNFC(basic_string_view<CharT> s)
std::basic_string<CharT> UTF<CharT>::toNFC(basic_string_view<CharT> s)
{
if(isNFC_quick_check(s)==IsNFC::Yes)
return std::basic_string<CharT>{s};
return toUtf<CharT>( createNFC( fromUtf_decompose(s) ));
return generate( createNFC( fromUtf_decompose(s) ));
}
// Counts the UTF-8 code units (bytes) required to encode every code point in s.
// Code points up to 0x7f take 1 unit, up to 0x7ff take 2, up to 0xffff take 3,
// and up to 0x10ffff take 4. Anything larger raises too_big.
template<>
size_t UTF<char>::utf_length(u32string_view s)
{
    size_t total = 0;
    for(const char32_t cp : s)
    {
        // Reject out-of-range values first so the size selection below is total.
        if(cp > 0x10ffff)
        {
            throw too_big(0, cp);
        }

        total += (cp <= 0x7f)   ? 1
               : (cp <= 0x7ff)  ? 2
               : (cp <= 0xffff) ? 3
               :                  4;
    }
    return total;
}
@ -634,19 +654,17 @@ std::basic_string<CharT> toNFC(basic_string_view<CharT> s)
// and unnecessary temporary std::string etc.
char* strdup_NFC(string_view s)
{
if(isNFC_quick_check(s)==IsNFC::Yes)
if(UTF8::isNFC_quick_check(s)==IsNFC::Yes)
return ::new_string(s.data(), s.size());
// implement the hard way more efficient
/********** FIXME: need more re-work, so I'll do the dumb way first
const std::u32string& u32 = createNFC( fromUtf_decompose(s) );
const size_t out_len = utf8len(u32);
const std::u32string& u32 = createNFC( UTF8::fromUtf_decompose(s) );
const size_t out_len = UTF8::utf_length(u32);
char* ret = ::new_string(nullptr, out_len );
char* iter{ret};
for(const char32_t c : u32)
{
toUtf<char, char*>(c, iter);
UTF8::generate(c, iter);
}
if(iter > ret+out_len) // should never happen. ;)
@ -655,14 +673,13 @@ char* strdup_NFC(string_view s)
}
return ret;
********************/
// Correct but inefficient:
const std::string ret = toNFC<char>(s);
return ::new_string(ret.data(), 0);
}
template class UTF<char>;
template class UTF<char16_t>;
// used only to initialize the NFC Compose mapping:
std::map< std::pair<unsigned, unsigned>, unsigned> generate_nfc_compose()
{


+ 36
- 9
src/nfc.hh View File

@ -38,30 +38,56 @@ class UTF
public:
/// parses a sequence of input code units into one Unicode code point and updates the input iterator c.
/// \todo change to iterator templates?
static
uint32_t parse(const CharT*& c, const CharT* end);
/// generates a UTF sequence from a given Unicode code point
/// generates a UTF sequence from a given Unicode code point.
template<class OutIter>
static
void generate(const char32_t c, OutIter& out);
/// return No or Maybe, if at least one character with NFC_Quickcheck class is "No" or "Maybe"
/// might throw illegal_utf exception
static
IsNFC isNFC_quick_check(basic_string_view<CharT> s);
/// runs first quick check and a deep test if quick check returns "Maybe".
static
bool isNFC(basic_string_view<CharT> s);
/// returns true if the code unit sequence [begin, end) parses as valid UTF
/// (UTF-8 for CharT = char). Declared static like every other member of this
/// class, so it can be called as UTF8::isUtf(b, e) without an instance.
static
bool isUtf(const CharT* begin, const CharT* end);
/// converts a C++ string (in UTF-8/-16) into NFC form
static
std::basic_string<CharT> toNFC(basic_string_view<CharT> s);
/// calculates the number of "code units" in the target Unicode Transfer Format.
static
size_t utf_length(u32string_view s);
/// generates a whole u32string at once
static
std::basic_string<CharT> generate(const std::u32string& s);
/// creates an NFD u32string from UTF-8/UTF-16 input string s
static
std::u32string fromUtf_decompose(basic_string_view<CharT> s);
};
// scans the char sequences and parses UTF-8 sequences. Detects UTF-8 errors and throws exceptions.
uint32_t parseUtf8(const char*& c, const char* end);
using UTF8 = UTF<char>;
using UTF16 = UTF<char16_t>;
// converts 'c' into a UTF-8/UTF-16 sequence and adds that to 'ret'
template<class CharT>
void toUtf(const char32_t c, std::basic_string<CharT>& ret);
// throws illegal_utf8 exception if s is not valid UTF-8
void assert_utf8(string_view s);
// creates an NFD u32string from UTF-8/UTF-16 input string s
template<class CharT>
std::u32string fromUtf_decompose(basic_string_view<CharT> s);
// convert NFD to NFC
std::u32string createNFC(std::u32string nfd_string);
/*
// return No or Maybe, if at least one character with NFC_Quickcheck class is "No" or "Maybe"
// might throw illegal_utf exception
template<class CharT>
@ -78,6 +104,7 @@ bool isUtf8(const char* begin, const char* end);
// s is ''moved'' to the return value if possible so no copy is done here.
template<class CharT>
std::basic_string<CharT> toNFC(basic_string_view<CharT> s);
*/
// creates a UTF-8-encoded NFC string from s
std::string toNFC_8(u16string_view s);


+ 4
- 4
test/unittest_nfc.cc View File

@ -64,13 +64,13 @@ INSTANTIATE_TEST_SUITE_P(NfcTestInstance, NfcTest, testing::ValuesIn(testValues)
TEST_P( NfcTest, Meh )
{
const auto& v = GetParam();
EXPECT_EQ( v.quick, isNFC_quick_check(v.input) );
EXPECT_EQ( v.quick, UTF8::isNFC_quick_check(v.input) );
EXPECT_EQ( v.is_nfc, isNFC(v.input) );
EXPECT_EQ( v.nfc , toNFC(v.input) );
EXPECT_EQ( v.is_nfc, UTF8::isNFC(v.input) );
EXPECT_EQ( v.nfc , UTF8::toNFC(v.input) );
if(v.is_nfc)
{
EXPECT_EQ( v.input, toNFC(v.input) );
EXPECT_EQ( v.input, UTF8::toNFC(v.input) );
}
}

+ 4
- 4
test/unittest_nfc16.cc View File

@ -71,13 +71,13 @@ INSTANTIATE_TEST_SUITE_P(Nfc16TestInstance, Nfc16Test, testing::ValuesIn(testVal
TEST_P( Nfc16Test, Meh )
{
const auto& v = GetParam();
EXPECT_EQ( v.quick, isNFC_quick_check(v.input) );
EXPECT_EQ( v.quick, UTF16::isNFC_quick_check(v.input) );
EXPECT_EQ( v.is_nfc, isNFC(v.input) );
EXPECT_EQ( v.nfc , toNFC(v.input) );
EXPECT_EQ( v.is_nfc, UTF16::isNFC(v.input) );
EXPECT_EQ( v.nfc , UTF16::toNFC(v.input) );
if(v.is_nfc)
{
EXPECT_EQ( v.input, toNFC(v.input) );
EXPECT_EQ( v.input, UTF16::toNFC(v.input) );
}
}

Loading…
Cancel
Save