From 8205b7b9deac4b0edca9b2a383aaa31d86a33411 Mon Sep 17 00:00:00 2001 From: roker Date: Fri, 1 Oct 2021 14:39:58 +0200 Subject: [PATCH] bundle UTF handling functions as static member functions of a UTF class template to circumvent prohibition of partial function specialization. *sigh* --- src/nfc.cc | 121 +++++++++++++++++++++++------------------ src/nfc.hh | 45 ++++++++++++--- test/unittest_nfc.cc | 8 +-- test/unittest_nfc16.cc | 8 +-- 4 files changed, 113 insertions(+), 69 deletions(-) diff --git a/src/nfc.cc b/src/nfc.cc index 85bcbae..c7885f5 100644 --- a/src/nfc.cc +++ b/src/nfc.cc @@ -352,52 +352,50 @@ uint32_t parseUtf16(const char16_t*& c, const char16_t* end) throw unexpected_end(-1); } -template -uint32_t parseUtf(const CharT*& c, const CharT* end); template<> -inline -uint32_t parseUtf(const char*& c, const char* end) +uint32_t UTF::parse(const char*& c, const char* end) { return parseUtf8(c,end); } template<> -inline -uint32_t parseUtf(const char16_t*& c, const char16_t* end) +uint32_t UTF::parse(const char16_t*& c, const char16_t* end) { return parseUtf16(c,end); } - template<> -void toUtf(const char32_t c, std::string& ret) +template +void UTF::generate(const char32_t c, OutIter& out) { if(c<=0x7F) { - ret += char(c); + *out++ = char(c); }else if(c<=0x7FF) { - ret += char( 0xC0 + (c>>6) ); - ret += char( 0x80 + (c & 63)); + *out++ = char( 0xC0 + (c>>6) ); + *out++ = char( 0x80 + (c & 63)); }else if(c<=0xFFFF) { - ret += char( 0xE0 + (c>>12) ); - ret += char( 0x80 + ((c>>6) & 63)); - ret += char( 0x80 + (c & 63)); + *out++ = char( 0xE0 + (c>>12) ); + *out++ = char( 0x80 + ((c>>6) & 63)); + *out++ = char( 0x80 + (c & 63)); }else if(c<=0x10FFFF) { - ret += char( 0xF0 + (c>>18) ); - ret += char( 0x80 + ((c>>12) & 63)); - ret += char( 0x80 + ((c>>6) & 63)); - ret += char( 0x80 + (c & 63)); + *out++ = char( 0xF0 + (c>>18) ); + *out++ = char( 0x80 + ((c>>12) & 63)); + *out++ = char( 0x80 + ((c>>6) & 63)); + *out++ = char( 0x80 + (c & 63)); }else{ throw too_big(0, c); } } + template<> -void toUtf(const char32_t c, std::u16string& ret) +template +void UTF::generate(const char32_t c, OutIter& out) { if(c <= 0xFFFF) { @@ -405,7 +403,7 @@ void toUtf(const char32_t c, std::u16string& ret) { throw unexpected_surrogate(c); }else{ - ret += char16_t(c); + *out++ = char16_t(c); } }else{ // surrogate pair if(c>0x10FFFF) @@ -413,19 +411,20 @@ void toUtf(const char32_t c, std::u16string& ret) throw too_big(0, c); }else{ const uint32_t c_reduced = c - 0x10000; - ret += char16_t(0xD800 + (c_reduced >> 10)); // High Surrogate - ret += char16_t(0xDC00 + (c_reduced & 0x3FF)); // Low Surrogate + *out++ = char16_t(0xD800 + (c_reduced >> 10)); // High Surrogate + *out++ = char16_t(0xDC00 + (c_reduced & 0x3FF)); // Low Surrogate } } } template -std::basic_string toUtf(const std::u32string& u32) +std::basic_string UTF::generate(const std::u32string& u32) { std::basic_string ret; + auto out = std::back_inserter(ret); for(char32_t c : u32) { - toUtf(c, ret); + generate(c, out); } return ret; } @@ -454,7 +453,7 @@ void assert_utf8(string_view s) { while(begin -std::u32string fromUtf_decompose(basic_string_view s) +std::u32string UTF::fromUtf_decompose(basic_string_view s) { std::u32string u32s; u32s.reserve( static_cast(s.size()*1.25) ); @@ -475,7 +474,7 @@ std::u32string fromUtf_decompose(basic_string_view s) const CharT* end = s.data() + s.size(); for(; begin -IsNFC isNFC_quick_check(basic_string_view s) +IsNFC UTF::isNFC_quick_check(basic_string_view s) { const CharT* begin = s.data(); const CharT* const end = s.data() + s.size(); @@ -561,7 +560,7 @@ IsNFC isNFC_quick_check(basic_string_view s) unsigned last_cc = 0; while(begin cc) ) { @@ -582,7 +581,7 @@ IsNFC isNFC_quick_check(basic_string_view s) template -bool isNFC(basic_string_view s) +bool UTF::isNFC(basic_string_view s) { switch( isNFC_quick_check(s) ) { @@ -598,19 +597,13 @@ bool isNFC(basic_string_view s) } -template bool isNFC(string_view); -template bool isNFC(u16string_view); - -// should be unecessary, but... well... -template std::string toNFC(string_view); -template std::u16string toNFC(u16string_view); - -bool isUtf8(const char* begin, const char* end) +template<> +bool UTF::isUtf(const char* begin, const char* end) try{ for(; begin -std::basic_string toNFC(basic_string_view s) +std::basic_string UTF::toNFC(basic_string_view s) { if(isNFC_quick_check(s)==IsNFC::Yes) return std::basic_string{s}; - return toUtf( createNFC( fromUtf_decompose(s) )); + return generate( createNFC( fromUtf_decompose(s) )); +} + + +template<> +size_t UTF::utf_length(u32string_view s) +{ + size_t len = 0; + for(const char32_t c : s) + { + if(c <= 0x7f) + { + len += 1; + }else if(c<=0x7ff) + { + len += 2; + }else if(c<=0xffff) + { + len += 3; + }else if(c<=0x10ffff) + { + len += 4; + }else{ + throw too_big(0, c); + } + } + + return len; } @@ -634,19 +654,17 @@ std::basic_string toNFC(basic_string_view s) // and unecessary temporary std::string etc. char* strdup_NFC(string_view s) { - if(isNFC_quick_check(s)==IsNFC::Yes) + if(UTF8::isNFC_quick_check(s)==IsNFC::Yes) return ::new_string(s.data(), s.size()); // implement the hard way more efficient -/********** FIXME: need more re-work, so I'll do the dumb way first - - const std::u32string& u32 = createNFC( fromUtf_decompose(s) ); - const size_t out_len = utf8len(u32); + const std::u32string& u32 = createNFC( UTF8::fromUtf_decompose(s) ); + const size_t out_len = UTF8::utf_length(u32); char* ret = ::new_string(nullptr, out_len ); char* iter{ret}; for(const char32_t c : u32) { - toUtf(c, iter); + UTF8::generate(c, iter); } if(iter > ret+out_len) // should never happen. ;) @@ -655,14 +673,13 @@ char* strdup_NFC(string_view s) } return ret; -********************/ - - // Correct but inefficient: - const std::string ret = toNFC(s); - return ::new_string(ret.data(), 0); } +template class UTF; +template class UTF; + + // used only to initialize the NFC Compose mapping: std::map< std::pair, unsigned> generate_nfc_compose() { diff --git a/src/nfc.hh b/src/nfc.hh index d485e37..5e8eb0c 100644 --- a/src/nfc.hh +++ b/src/nfc.hh @@ -38,30 +38,56 @@ class UTF public: /// parses a sequence of input code units into one Unicode code point and updates the input iterator c. /// \todo change to iterator templates? + static uint32_t parse(const CharT*& c, const CharT* end); - /// generates a UTF sequence from a given Unicode code point + /// generates a UTF sequence from a given Unicode code point. template + static void generate(const char32_t c, OutIter& out); + + + /// return No or Maybe, if at least one character with NFC_Quickcheck class is "No" or "Maybe" + /// might throw illegal_utf exception + static + IsNFC isNFC_quick_check(basic_string_view s); + + /// runs first quick check and a deep test if quick check returns "Maybe". + static + bool isNFC(basic_string_view s); + + /// returns true if the sequence is valid UTF-8 + bool isUtf(const CharT* begin, const CharT* end); + + /// converts a C++ string (in UTF-8/-16) into NFC form + static + std::basic_string toNFC(basic_string_view s); + + /// calculates the number of "code units" in the target Unicode Transfer Format. + static + size_t utf_length(u32string_view s); + + /// generates a whole u32string at once + static + std::basic_string generate(const std::u32string& s); + + /// creates an NFD u32string from UTF-8/UTF-16 input string s + static + std::u32string fromUtf_decompose(basic_string_view s); }; -// scans the char sequences and parses UTF-8 sequences. Detect UTF-8 errors and throws exceptions. -uint32_t parseUtf8(const char*& c, const char* end); +using UTF8 = UTF; +using UTF16 = UTF; -// converts 'c' into a UTF-8/UTF-16 sequence and adds that to 'ret' -template -void toUtf(const char32_t c, std::basic_string& ret); // throws illegal_utf8 exception if s is not valid UTF-8 void assert_utf8(string_view s); -// creates an NFD u32string from UTF-8/UTF-16 input string s -template -std::u32string fromUtf_decompose(basic_string_view s); // convert NFD to NFC std::u32string createNFC(std::u32string nfd_string); +/* // return No or Maybe, if at least one character with NFC_Quickcheck class is "No" or "Maybe" // might throw illegal_utf exception template @@ -78,6 +104,7 @@ bool isUtf8(const char* begin, const char* end); // s is ''moved'' to the return value if possible so no copy is done here. template std::basic_string toNFC(basic_string_view s); +*/ // creates a UTF-8-encoded NFC string from s std::string toNFC_8(u16string_view s); diff --git a/test/unittest_nfc.cc b/test/unittest_nfc.cc index ee1927a..b27eb5d 100644 --- a/test/unittest_nfc.cc +++ b/test/unittest_nfc.cc @@ -64,13 +64,13 @@ INSTANTIATE_TEST_SUITE_P(NfcTestInstance, NfcTest, testing::ValuesIn(testValues) TEST_P( NfcTest, Meh ) { const auto& v = GetParam(); - EXPECT_EQ( v.quick, isNFC_quick_check(v.input) ); + EXPECT_EQ( v.quick, UTF8::isNFC_quick_check(v.input) ); - EXPECT_EQ( v.is_nfc, isNFC(v.input) ); - EXPECT_EQ( v.nfc , toNFC(v.input) ); + EXPECT_EQ( v.is_nfc, UTF8::isNFC(v.input) ); + EXPECT_EQ( v.nfc , UTF8::toNFC(v.input) ); if(v.is_nfc) { - EXPECT_EQ( v.input, toNFC(v.input) ); + EXPECT_EQ( v.input, UTF8::toNFC(v.input) ); } } diff --git a/test/unittest_nfc16.cc b/test/unittest_nfc16.cc index dfc9121..cd5e995 100644 --- a/test/unittest_nfc16.cc +++ b/test/unittest_nfc16.cc @@ -71,13 +71,13 @@ INSTANTIATE_TEST_SUITE_P(Nfc16TestInstance, Nfc16Test, testing::ValuesIn(testVal TEST_P( Nfc16Test, Meh ) { const auto& v = GetParam(); - EXPECT_EQ( v.quick, isNFC_quick_check(v.input) ); + EXPECT_EQ( v.quick, UTF16::isNFC_quick_check(v.input) ); - EXPECT_EQ( v.is_nfc, isNFC(v.input) ); - EXPECT_EQ( v.nfc , toNFC(v.input) ); + EXPECT_EQ( v.is_nfc, UTF16::isNFC(v.input) ); + EXPECT_EQ( v.nfc , UTF16::toNFC(v.input) ); if(v.is_nfc) { - EXPECT_EQ( v.input, toNFC(v.input) ); + EXPECT_EQ( v.input, UTF16::toNFC(v.input) ); } }