implement NFC composing. UNTESTED!

JSON-95
Roker 5 years ago
parent 900aac34cb
commit ab83a192b7

@ -270,6 +270,36 @@ uint32_t parseUtf8(const char*& c, const char* end)
}
std::string toUtf8(const std::u32string& u32)
{
std::string ret;
for(char32_t c : u32)
{
if(c<=0x7F)
{
ret += char(c);
}else if(c<=0x7FF)
{
ret += char( 0xC0 + (c>>6) );
ret += char( 0x80 + (c & 63));
}else if(c<=0xFFFF)
{
ret += char( 0xE0 + (c>>12) );
ret += char( 0x80 + ((c>>6) & 63));
ret += char( 0x80 + (c & 63));
}else if(c<=0x10FFFF)
{
ret += char( 0xF0 + (c>>18) );
ret += char( 0x80 + ((c>>12) & 63));
ret += char( 0x80 + ((c>>6) & 63));
ret += char( 0x80 + (c & 63));
}else{
throw too_big(0, c);
}
}
return ret;
}
illegal_utf8::illegal_utf8( const std::string& s, unsigned position, const std::string& reason)
: std::runtime_error( "Illegal UTF-8 string \"" + escape(s) + "\" at position " + std::to_string(position) + ": " + reason )
@ -317,6 +347,71 @@ std::u32string fromUtf8_decompose(const std::string& s)
}
template<class Iter>
bool blocked(Iter L, Iter C)
{
Iter B = L; ++B;
for(;B!=C;++B)
{
if(canonicalClass(*B)==0 || canonicalClass(*B)==canonicalClass(*C))
return true;
}
return false;
}
template<class Iter>
void combine(std::u32string& nfc, Iter starter, Iter next_starter)
{
Iter c = starter; ++c;
for(;c!=next_starter; ++c)
{
if(!blocked(starter, c))
{
auto q = NFC_Compose.find( std::make_pair<unsigned>(*starter,*c) );
if(q!=NFC_Compose.end())
{
*starter = q->second;
*c = -1;
}
}
}
// now add the remaining/changed characters to the NFC string:
for(Iter c = starter; c!=next_starter; ++c)
{
if( int(*c) >= 0)
{
nfc += *c;
}
}
}
// the nfd string is changed during composing process. So it works on a copy or call with std::move().
std::u32string createNFC(std::u32string nfd)
{
if(nfd.size()<=1)
return nfd;
std::u32string nfc;
nfc.reserve(nfd.size());
auto starter = nfd.begin();
while( starter != nfd.end() )
{
if( canonicalClass(*starter)!=0 )
{
nfc += *starter;
++starter;
}else{
auto next_starter = std::find_if(starter+1, nfd.end(), [](char32_t c){return canonicalClass(c)==0;} );
combine(nfc, starter, next_starter);
starter = next_starter;
}
}
return nfc;
}
IsNFC isNFC_quick_check(const std::string& s)
{
const char* begin = s.data();
@ -354,7 +449,7 @@ bool isNFC(const std::string& s)
case IsNFC::No : return false;
case IsNFC::Maybe:
{
throw std::logic_error("Deep NGC check is not yet implemented. Sorry.");
return s == toNFC(s); // very expensive!
}
}
@ -368,8 +463,7 @@ std::string toNFC(std::string s)
if(isNFC_quick_check(s)==IsNFC::Yes)
return s;
// TODO:
throw std::logic_error("NFC normalization is necessary, but unimplemented. Sorry.");
return toUtf8( createNFC( fromUtf8_decompose(s) ));
}

@ -34,6 +34,9 @@ void assert_utf8(const std::string& s);
// creates an NFD u32string from UTF-8 input string s
std::u32string fromUtf8_decompose(const std::string& s);
// convert NFD to NFC
std::u32string createNFC(std::u32string nfd_string);
// return No or Maybe, if at least one character with NFC_Quickcheck class is "No" or "Maybe"
// might throw illegal_utf8 exception
IsNFC isNFC_quick_check(const std::string& s);

Loading…
Cancel
Save