|
|
|
@ -352,52 +352,50 @@ uint32_t parseUtf16(const char16_t*& c, const char16_t* end)
|
|
|
|
|
throw unexpected_end(-1);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
template<class CharT>
|
|
|
|
|
uint32_t parseUtf(const CharT*& c, const CharT* end);
|
|
|
|
|
|
|
|
|
|
template<>
|
|
|
|
|
inline
|
|
|
|
|
uint32_t parseUtf<char>(const char*& c, const char* end)
|
|
|
|
|
uint32_t UTF<char>::parse(const char*& c, const char* end)
|
|
|
|
|
{
|
|
|
|
|
return parseUtf8(c,end);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
template<>
|
|
|
|
|
inline
|
|
|
|
|
uint32_t parseUtf<char16_t>(const char16_t*& c, const char16_t* end)
|
|
|
|
|
uint32_t UTF<char16_t>::parse(const char16_t*& c, const char16_t* end)
|
|
|
|
|
{
|
|
|
|
|
return parseUtf16(c,end);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
template<>
|
|
|
|
|
void toUtf<char>(const char32_t c, std::string& ret)
|
|
|
|
|
template<class OutIter>
|
|
|
|
|
void UTF<char>::generate(const char32_t c, OutIter& out)
|
|
|
|
|
{
|
|
|
|
|
if(c<=0x7F)
|
|
|
|
|
{
|
|
|
|
|
ret += char(c);
|
|
|
|
|
*out++ = char(c);
|
|
|
|
|
}else if(c<=0x7FF)
|
|
|
|
|
{
|
|
|
|
|
ret += char( 0xC0 + (c>>6) );
|
|
|
|
|
ret += char( 0x80 + (c & 63));
|
|
|
|
|
*out++ = char( 0xC0 + (c>>6) );
|
|
|
|
|
*out++ = char( 0x80 + (c & 63));
|
|
|
|
|
}else if(c<=0xFFFF)
|
|
|
|
|
{
|
|
|
|
|
ret += char( 0xE0 + (c>>12) );
|
|
|
|
|
ret += char( 0x80 + ((c>>6) & 63));
|
|
|
|
|
ret += char( 0x80 + (c & 63));
|
|
|
|
|
*out++ = char( 0xE0 + (c>>12) );
|
|
|
|
|
*out++ = char( 0x80 + ((c>>6) & 63));
|
|
|
|
|
*out++ = char( 0x80 + (c & 63));
|
|
|
|
|
}else if(c<=0x10FFFF)
|
|
|
|
|
{
|
|
|
|
|
ret += char( 0xF0 + (c>>18) );
|
|
|
|
|
ret += char( 0x80 + ((c>>12) & 63));
|
|
|
|
|
ret += char( 0x80 + ((c>>6) & 63));
|
|
|
|
|
ret += char( 0x80 + (c & 63));
|
|
|
|
|
*out++ = char( 0xF0 + (c>>18) );
|
|
|
|
|
*out++ = char( 0x80 + ((c>>12) & 63));
|
|
|
|
|
*out++ = char( 0x80 + ((c>>6) & 63));
|
|
|
|
|
*out++ = char( 0x80 + (c & 63));
|
|
|
|
|
}else{
|
|
|
|
|
throw too_big(0, c);
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
template<>
|
|
|
|
|
void toUtf<char16_t>(const char32_t c, std::u16string& ret)
|
|
|
|
|
template<class OutIter>
|
|
|
|
|
void UTF<char16_t>::generate(const char32_t c, OutIter& out)
|
|
|
|
|
{
|
|
|
|
|
if(c <= 0xFFFF)
|
|
|
|
|
{
|
|
|
|
@ -405,7 +403,7 @@ void toUtf<char16_t>(const char32_t c, std::u16string& ret)
|
|
|
|
|
{
|
|
|
|
|
throw unexpected_surrogate(c);
|
|
|
|
|
}else{
|
|
|
|
|
ret += char16_t(c);
|
|
|
|
|
*out++ = char16_t(c);
|
|
|
|
|
}
|
|
|
|
|
}else{ // surrogate pair
|
|
|
|
|
if(c>0x10FFFF)
|
|
|
|
@ -413,19 +411,20 @@ void toUtf<char16_t>(const char32_t c, std::u16string& ret)
|
|
|
|
|
throw too_big(0, c);
|
|
|
|
|
}else{
|
|
|
|
|
const uint32_t c_reduced = c - 0x10000;
|
|
|
|
|
ret += char16_t(0xD800 + (c_reduced >> 10)); // High Surrogate
|
|
|
|
|
ret += char16_t(0xDC00 + (c_reduced & 0x3FF)); // Low Surrogate
|
|
|
|
|
*out++ = char16_t(0xD800 + (c_reduced >> 10)); // High Surrogate
|
|
|
|
|
*out++ = char16_t(0xDC00 + (c_reduced & 0x3FF)); // Low Surrogate
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
template<class CharT>
|
|
|
|
|
std::basic_string<CharT> toUtf(const std::u32string& u32)
|
|
|
|
|
std::basic_string<CharT> UTF<CharT>::generate(const std::u32string& u32)
|
|
|
|
|
{
|
|
|
|
|
std::basic_string<CharT> ret;
|
|
|
|
|
auto out = std::back_inserter(ret);
|
|
|
|
|
for(char32_t c : u32)
|
|
|
|
|
{
|
|
|
|
|
toUtf<CharT>(c, ret);
|
|
|
|
|
generate(c, out);
|
|
|
|
|
}
|
|
|
|
|
return ret;
|
|
|
|
|
}
|
|
|
|
@ -454,7 +453,7 @@ void assert_utf8(string_view s)
|
|
|
|
|
{
|
|
|
|
|
while(begin<end)
|
|
|
|
|
{
|
|
|
|
|
parseUtf8(begin, end); // ignore the output
|
|
|
|
|
UTF8::parse(begin, end); // ignore the output
|
|
|
|
|
++begin;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
@ -467,7 +466,7 @@ void assert_utf8(string_view s)
|
|
|
|
|
|
|
|
|
|
// creates a NFD string from s
|
|
|
|
|
template<class CharT>
|
|
|
|
|
std::u32string fromUtf_decompose(basic_string_view<CharT> s)
|
|
|
|
|
std::u32string UTF<CharT>::fromUtf_decompose(basic_string_view<CharT> s)
|
|
|
|
|
{
|
|
|
|
|
std::u32string u32s;
|
|
|
|
|
u32s.reserve( static_cast<std::size_t>(s.size()*1.25) );
|
|
|
|
@ -475,7 +474,7 @@ std::u32string fromUtf_decompose(basic_string_view<CharT> s)
|
|
|
|
|
const CharT* end = s.data() + s.size();
|
|
|
|
|
for(; begin<end; ++begin)
|
|
|
|
|
{
|
|
|
|
|
unsigned u = parseUtf(begin, end);
|
|
|
|
|
unsigned u = parse(begin, end);
|
|
|
|
|
u32s += decompose_full(u);
|
|
|
|
|
}
|
|
|
|
|
canonicalOrdering(u32s); // works inplace.
|
|
|
|
@ -552,7 +551,7 @@ std::u32string createNFC(std::u32string nfd)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
template<class CharT>
|
|
|
|
|
IsNFC isNFC_quick_check(basic_string_view<CharT> s)
|
|
|
|
|
IsNFC UTF<CharT>::isNFC_quick_check(basic_string_view<CharT> s)
|
|
|
|
|
{
|
|
|
|
|
const CharT* begin = s.data();
|
|
|
|
|
const CharT* const end = s.data() + s.size();
|
|
|
|
@ -561,7 +560,7 @@ IsNFC isNFC_quick_check(basic_string_view<CharT> s)
|
|
|
|
|
unsigned last_cc = 0;
|
|
|
|
|
while(begin<end)
|
|
|
|
|
{
|
|
|
|
|
const uint32_t u = parseUtf(begin, end);
|
|
|
|
|
const uint32_t u = parse(begin, end);
|
|
|
|
|
const unsigned cc = canonicalClass(u);
|
|
|
|
|
if( (cc!=0) && (last_cc > cc) )
|
|
|
|
|
{
|
|
|
|
@ -582,7 +581,7 @@ IsNFC isNFC_quick_check(basic_string_view<CharT> s)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
template<class CharT>
|
|
|
|
|
bool isNFC(basic_string_view<CharT> s)
|
|
|
|
|
bool UTF<CharT>::isNFC(basic_string_view<CharT> s)
|
|
|
|
|
{
|
|
|
|
|
switch( isNFC_quick_check(s) )
|
|
|
|
|
{
|
|
|
|
@ -598,19 +597,13 @@ bool isNFC(basic_string_view<CharT> s)
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
template bool isNFC<char>(string_view);
|
|
|
|
|
template bool isNFC<char16_t>(u16string_view);
|
|
|
|
|
|
|
|
|
|
// should be unecessary, but... well...
|
|
|
|
|
template std::string toNFC<char>(string_view);
|
|
|
|
|
template std::u16string toNFC<char16_t>(u16string_view);
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
bool isUtf8(const char* begin, const char* end)
|
|
|
|
|
template<>
|
|
|
|
|
bool UTF<char>::isUtf(const char* begin, const char* end)
|
|
|
|
|
try{
|
|
|
|
|
for(; begin<end; ++begin)
|
|
|
|
|
{
|
|
|
|
|
(void)parseUtf8(begin, end);
|
|
|
|
|
(void)parse(begin, end);
|
|
|
|
|
}
|
|
|
|
|
return true;
|
|
|
|
|
}catch(const illegal_utf&)
|
|
|
|
@ -621,12 +614,39 @@ try{
|
|
|
|
|
|
|
|
|
|
// s is ''moved'' to the return value if possible so no copy is done here.
|
|
|
|
|
template<class CharT>
|
|
|
|
|
std::basic_string<CharT> toNFC(basic_string_view<CharT> s)
|
|
|
|
|
std::basic_string<CharT> UTF<CharT>::toNFC(basic_string_view<CharT> s)
|
|
|
|
|
{
|
|
|
|
|
if(isNFC_quick_check(s)==IsNFC::Yes)
|
|
|
|
|
return std::basic_string<CharT>{s};
|
|
|
|
|
|
|
|
|
|
return toUtf<CharT>( createNFC( fromUtf_decompose(s) ));
|
|
|
|
|
return generate( createNFC( fromUtf_decompose(s) ));
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
template<>
|
|
|
|
|
size_t UTF<char>::utf_length(u32string_view s)
|
|
|
|
|
{
|
|
|
|
|
size_t len = 0;
|
|
|
|
|
for(const char32_t c : s)
|
|
|
|
|
{
|
|
|
|
|
if(c <= 0x7f)
|
|
|
|
|
{
|
|
|
|
|
len += 1;
|
|
|
|
|
}else if(c<=0x7ff)
|
|
|
|
|
{
|
|
|
|
|
len += 2;
|
|
|
|
|
}else if(c<=0xffff)
|
|
|
|
|
{
|
|
|
|
|
len += 3;
|
|
|
|
|
}else if(c<=0x10ffff)
|
|
|
|
|
{
|
|
|
|
|
len += 4;
|
|
|
|
|
}else{
|
|
|
|
|
throw too_big(0, c);
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
return len;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
@ -634,19 +654,17 @@ std::basic_string<CharT> toNFC(basic_string_view<CharT> s)
|
|
|
|
|
// and unecessary temporary std::string etc.
|
|
|
|
|
char* strdup_NFC(string_view s)
|
|
|
|
|
{
|
|
|
|
|
if(isNFC_quick_check(s)==IsNFC::Yes)
|
|
|
|
|
if(UTF8::isNFC_quick_check(s)==IsNFC::Yes)
|
|
|
|
|
return ::new_string(s.data(), s.size());
|
|
|
|
|
|
|
|
|
|
// implement the hard way more efficient
|
|
|
|
|
/********** FIXME: need more re-work, so I'll do the dumb way first
|
|
|
|
|
|
|
|
|
|
const std::u32string& u32 = createNFC( fromUtf_decompose(s) );
|
|
|
|
|
const size_t out_len = utf8len(u32);
|
|
|
|
|
const std::u32string& u32 = createNFC( UTF8::fromUtf_decompose(s) );
|
|
|
|
|
const size_t out_len = UTF8::utf_length(u32);
|
|
|
|
|
char* ret = ::new_string(nullptr, out_len );
|
|
|
|
|
char* iter{ret};
|
|
|
|
|
for(const char32_t c : u32)
|
|
|
|
|
{
|
|
|
|
|
toUtf<char, char*>(c, iter);
|
|
|
|
|
UTF8::generate(c, iter);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
if(iter > ret+out_len) // should never happen. ;)
|
|
|
|
@ -655,14 +673,13 @@ char* strdup_NFC(string_view s)
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
return ret;
|
|
|
|
|
********************/
|
|
|
|
|
|
|
|
|
|
// Correct but inefficient:
|
|
|
|
|
const std::string ret = toNFC<char>(s);
|
|
|
|
|
return ::new_string(ret.data(), 0);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
template class UTF<char>;
|
|
|
|
|
template class UTF<char16_t>;
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
// used only to initialize the NFC Compose mapping:
|
|
|
|
|
std::map< std::pair<unsigned, unsigned>, unsigned> generate_nfc_compose()
|
|
|
|
|
{
|
|
|
|
|