implement CanonicalDecompose according to Unicode standard
parent
9da47a8cd9
commit
f86b67545d
|
@ -86,7 +86,7 @@ $(TARGET)-static: main.o libjson-adapter.a
|
|||
$(TARGET_TEST): servertest.o libjson-adapter.a
|
||||
$(CXX) $(CPPFLAGS) $^ $(LDFLAGS) $(LDLIBS) -o $@
|
||||
|
||||
$(TARGET_GTEST): unittest_json.o unittest_nfc.o gtest-all.o gtest_main.o libjson-adapter.a
|
||||
$(TARGET_GTEST): unittest_json.o unittest_nfc.o unittest_decompose.o gtest-all.o gtest_main.o libjson-adapter.a
|
||||
$(CXX) $(CPPFLAGS) $^ $(LDFLAGS) $(LDLIBS) -o $@
|
||||
|
||||
libjson-adapter.a: $(ALL_OBJECTS)
|
||||
|
|
|
@ -115,9 +115,65 @@ namespace
|
|||
}
|
||||
}
|
||||
|
||||
std::pair<int,int> decompose(unsigned u)
|
||||
{
|
||||
const auto q = NFC_Decompose.find(u);
|
||||
if(q==NFC_Decompose.end())
|
||||
{
|
||||
return std::make_pair(-1, -1);
|
||||
}else{
|
||||
return q->second;
|
||||
}
|
||||
}
|
||||
|
||||
std::u32string decompose_full(unsigned u)
|
||||
{
|
||||
const std::pair<int,int> d = decompose(u);
|
||||
if(d.first<0)
|
||||
{
|
||||
return std::u32string( 1, char32_t(u) );
|
||||
}else{
|
||||
if(d.second<0)
|
||||
{
|
||||
return decompose_full(d.first);
|
||||
}
|
||||
}
|
||||
return decompose_full(d.first) + decompose_full(d.second);
|
||||
}
|
||||
|
||||
|
||||
// according to Unicode Standard, clause D108:
|
||||
bool isReorderablePair(unsigned a, unsigned b)
|
||||
{
|
||||
const unsigned cca = canonicalClass(a);
|
||||
const unsigned ccb = canonicalClass(b);
|
||||
|
||||
return (cca > ccb) && (ccb>0);
|
||||
}
|
||||
|
||||
// Unicode standard requires bubble sort, for stability reasons?
|
||||
void canonicalOrdering(std::u32string& us)
|
||||
{
|
||||
if(us.size()<2)
|
||||
return;
|
||||
|
||||
for(unsigned n=us.size(); n>1; --n)
|
||||
for(unsigned i=0; i<n-1; ++i)
|
||||
{
|
||||
char32_t& a = us[i];
|
||||
char32_t& b = us[i+1];
|
||||
if( isReorderablePair(a,b) )
|
||||
{
|
||||
std::swap(a,b);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
} // end of anonymous namespace
|
||||
|
||||
|
||||
|
||||
|
||||
std::ostream& operator<<(std::ostream& o, IsNFC is_nfc)
|
||||
{
|
||||
switch(is_nfc)
|
||||
|
@ -244,6 +300,23 @@ void assert_utf8(const std::string& s)
|
|||
}
|
||||
|
||||
|
||||
// creates a NFD string from s
|
||||
std::u32string fromUtf8_decompose(const std::string& s)
|
||||
{
|
||||
std::u32string u32s;
|
||||
u32s.reserve(s.size()*1.25);
|
||||
const char* begin = s.c_str();
|
||||
const char* end = s.c_str() + s.size();
|
||||
for(; begin<end; ++begin)
|
||||
{
|
||||
unsigned u = parseUtf8(begin, end);
|
||||
u32s += decompose_full(u);
|
||||
}
|
||||
canonicalOrdering(u32s); // works inplace.
|
||||
return u32s;
|
||||
}
|
||||
|
||||
|
||||
IsNFC isNFC_quick_check(const std::string& s)
|
||||
{
|
||||
const char* begin = s.data();
|
||||
|
@ -292,7 +365,7 @@ bool isNFC(const std::string& s)
|
|||
// s is ''moved'' to the return value if possible so no copy is done here.
|
||||
std::string toNFC(std::string s)
|
||||
{
|
||||
if(isNFC(s))
|
||||
if(isNFC_quick_check(s)==IsNFC::Yes)
|
||||
return s;
|
||||
|
||||
// TODO:
|
||||
|
|
|
@ -31,6 +31,9 @@ uint32_t parseUtf8(const char*& c, const char* end);
|
|||
// throws illegal_utf8 exception if s is not valid UTF-8
|
||||
void assert_utf8(const std::string& s);
|
||||
|
||||
// creates an NFD u32string from UTF-8 input string s
|
||||
std::u32string fromUtf8_decompose(const std::string& s);
|
||||
|
||||
// returns No or Maybe if at least one character's NFC_Quickcheck class is "No" or "Maybe"
|
||||
// might throw illegal_utf8 exception
|
||||
IsNFC isNFC_quick_check(const std::string& s);
|
||||
|
|
|
@ -0,0 +1,74 @@
|
|||
#include <gtest/gtest.h>
|
||||
|
||||
#include "nfc.hh" // for illegal_utf8 exception
|
||||
#include <vector>
|
||||
|
||||
|
||||
namespace {
|
||||
|
||||
// One test case: a UTF-8 input string and the code point sequence that
// fromUtf8_decompose() is expected to return for it.
struct TestEntry
{
	std::string    input;   // fed into fromUtf8_decompose()
	std::u32string output;  // expected result
};
|
||||
|
||||
// Renders every element of u32 as "<U+XXXX>" (upper-case hex, zero-padded
// to at least four digits) and returns the concatenation of all pieces.
std::string uplus(const std::u32string& u32)
{
	std::string rendered;
	rendered.reserve(u32.size()*6);

	for(std::u32string::size_type idx = 0; idx != u32.size(); ++idx)
	{
		char piece[16];
		snprintf(piece, 15, "<U+%04X>", unsigned(u32[idx]));
		rendered.append(piece);
	}
	return rendered;
}
|
||||
|
||||
std::ostream& operator<<(std::ostream& o, const TestEntry& tt)
|
||||
{
|
||||
return o << "input=«" << tt.input << "», output=«" << uplus(tt.output) << "» ";
|
||||
}
|
||||
|
||||
|
||||
const char nullo[4] = {0,0,0,0};
|
||||
const char32_t nullo32[4] = {0,0,0,0};
|
||||
|
||||
const std::vector<TestEntry> testValues =
|
||||
{
|
||||
{ "" , U"" }, // always start with the simple case ;-)
|
||||
{ "123" , U"123" }, // some ASCII digits. Still easy.
|
||||
{ "\n\\\b", U"\n\\\b" }, // backslash escapes for ASCII and control chars
|
||||
{ "ä" , U"a\u0308" }, // <U+00E4> small a with diaeresis -> decompose ä
|
||||
{ "\xc4\x85" , U"a\u0328" }, // <U+0105> small a with ogonek -> decompose ą
|
||||
|
||||
{ "a\xcc\x88", U"a\u0308" }, // a + <U+0308> combining diaresis
|
||||
{ "a\xcc\xa8", U"a\u0328" }, // a + <U+0328> combining ogonek
|
||||
{ "a\xcc\xa8\xcc\x88", U"a\u0328\u0308" }, // a + <U+0328> + <U+0308> ( ogonek + diaresis)
|
||||
{ "a\xcc\x88\xcc\xa8", U"a\u0328\u0308" }, // a + <U+0308> + <U+0328> ( diaeresis + ogonek) -> canonicalOrdering reorders the accents!
|
||||
|
||||
// ogonek sorts before diaeresis or breve-below:
|
||||
{ "ä\xcc\xa8ü\xcc\xa8ḫ\xcc\xa8", U"a\u0328\u0308u\u0328\u0308h\u0328\u032e"}, // ä + ogonek, ü + ogonek, h-breve-below + ogonek
|
||||
|
||||
{ "a\xcc\x85\xcc\xbc", U"a\u033c\u0305" }, // a + <U+0305> + <U+033C> ( overline + seagull_below) -> canonicalOrdering reorders the accents!
|
||||
{ "a\xcc\xbc\xcc\x85", U"a\u033c\u0305" }, // a + <U+033C> + <U+0305> ( seagull_below + overline)
|
||||
|
||||
{ std::string(nullo, nullo+1), std::u32string(nullo32, nullo32+1) }, // Yeah, 1 NUL byte
|
||||
{ std::string(nullo, nullo+4), std::u32string(nullo32, nullo32+4) }, // Yeah, 4 NUL bytes
|
||||
|
||||
{ "EOF", U"EOF" }
|
||||
};
|
||||
|
||||
}
|
||||
|
||||
// Fixture for the value-parametrized decomposition tests;
// the parameter type is TestEntry.
class DecomposeTest : public ::testing::TestWithParam<TestEntry>
{
	// nothing to set up or tear down so far
};
|
||||
|
||||
// Instantiates DecomposeTest with the entries of testValues as parameters.
INSTANTIATE_TEST_CASE_P(
	DecomposeTestInstance,
	DecomposeTest,
	testing::ValuesIn(testValues)
);
|
||||
|
||||
// fromUtf8_decompose() applied to the entry's input must yield the entry's
// expected output.
TEST_P( DecomposeTest, Meh )
{
	const auto entry = GetParam();
	const std::u32string actual = fromUtf8_decompose(entry.input);
	EXPECT_EQ( entry.output, actual );
}
|
Loading…
Reference in New Issue