|
|
- // This file is under GNU General Public License 3.0
- // see LICENSE.txt
-
- // converts a C++ string into NFC form
-
- #include "nfc.hh"
- #include <cstdint>
- #include <set>
- #include <ostream>
- #include <algorithm>
-
- #include "nfc_sets.hh"
-
- namespace
- {
// Renders a Unicode code point as "<U+XXXX>": uppercase hex, zero-padded
// to at least four digits.
std::string u2h(unsigned u)
{
    char text[16] = {};
    // The limit stops one byte short of the array, so the zero-initialized
    // last byte always terminates the string.
    snprintf(text, sizeof(text) - 1, "<U+%04X>", u);
    return std::string(text);
}
-
// Renders a single octet as "0xNN": uppercase hex, always two digits.
std::string o2h(uint8_t octet)
{
    char text[16] = {};
    // The limit stops one byte short of the array, so the zero-initialized
    // last byte always terminates the string.
    snprintf(text, sizeof(text) - 1, "0x%02hhX", octet);
    return std::string(text);
}
-
// Renders a 16-bit code unit as "0xNNNN": uppercase hex, zero-padded to
// four digits.
std::string hex16(char16_t u)
{
    char text[16] = {};
    // The limit stops one byte short of the array, so the zero-initialized
    // last byte always terminates the string.
    snprintf(text, sizeof(text) - 1, "0x%04X", u);
    return std::string(text);
}
-
-
// Abstract base of all file-local UTF parsing/encoding errors.
// Stores the offending code unit twice: truncated to 8 bits (`octet`, used
// by the UTF-8 related messages) and as the full 16-bit value (`value`,
// used by the UTF-16 related messages).
class utf_exception
{
public:
    utf_exception(uint16_t u)
    : octet(static_cast<uint8_t>(u))   // keeps only the low 8 bits
    , value(u)
    {}

    virtual ~utf_exception() = default;

    // Human-readable description of what went wrong.
    virtual std::string reason() const = 0;

    uint8_t  octet;
    uint16_t value;
};
-
-
- class cont_without_start : public utf_exception
- {
- public:
- cont_without_start(uint8_t u) : utf_exception(u) {}
- std::string reason() const override { return "Continuation octet " + o2h(octet) + " without start octet"; }
- };
-
-
- class overlong_sequence : public utf_exception
- {
- public:
- overlong_sequence(uint8_t octet, unsigned u) : utf_exception(octet), unicode(u) {}
- std::string reason() const override { return "Overlong sequence for " + u2h(unicode); }
- unsigned unicode;
- };
-
-
- class unexpected_end : public utf_exception
- {
- public:
- unexpected_end(uint8_t u) : utf_exception(u) {}
- std::string reason() const override { return "Unexpected end of string"; }
- };
-
- class surrogate : public utf_exception
- {
- public:
- surrogate(uint8_t u, unsigned s) : utf_exception(u), surr(s) {}
- std::string reason() const override { return "UTF-8-encoded UTF-16 surrogate " + u2h(surr) + " detected"; }
- private:
- unsigned surr;
- };
-
- class no_unicode : public utf_exception
- {
- public:
- explicit no_unicode(uint8_t _octet) : utf_exception(_octet) {}
- std::string reason() const override { return "Octet " + o2h(octet) + " is illegal in UTF-8"; }
- };
-
- class too_big : public utf_exception
- {
- public:
- explicit too_big(uint8_t _octet, unsigned u) : utf_exception(_octet), unicode(u) {}
- std::string reason() const override { return "Value " + u2h(unicode) + " is too big for Unicode"; }
- unsigned unicode;
- };
-
-
- class unexpected_surrogate : public utf_exception
- {
- public:
- explicit unexpected_surrogate(char16_t c) : utf_exception(c) {}
- std::string reason() const override { return "Unexpected surogate " + hex16(value); }
- };
-
-
- class missing_low_surrogate : public utf_exception
- {
- public:
- explicit missing_low_surrogate(char16_t c, char16_t _surr) : utf_exception(c), surr(_surr) {}
- std::string reason() const override { return "Non-low surrogate value " + hex16(value) + " is unexpected after high surogate " + hex16(surr); }
- private:
- char16_t surr;
- };
-
-
- std::string escape(pEp::string_view s)
- {
- std::string ret; ret.reserve(s.size() + 16 );
- for(char c : s)
- {
- const uint8_t u = c;
- if(u>=32 && u<=126)
- {
- ret += c;
- }else{
- char buf[16];
- snprintf(buf,15, "«%02x»", u );
- ret += buf;
- }
- }
- return ret;
- }
-
- std::string escape(pEp::u16string_view s)
- {
- std::string ret; ret.reserve(s.size() + 16 );
- for(char16_t c : s)
- {
- if(c>=32 && c<=126)
- {
- ret += char(c);
- }else{
- char buf[16];
- snprintf(buf,15, "«%04x»", c );
- ret += buf;
- }
- }
- return ret;
- }
-
- // returns the "CanonicalCombinincClass" of the given Unicode codpoint u
- unsigned canonicalClass(unsigned u)
- {
- const auto q = NFC_CombiningClass.find(u);
- if(q==NFC_CombiningClass.end())
- {
- return 0; // not found in map.
- }else{
- return q->second;
- }
- }
-
- std::pair<int,int> decompose(unsigned u)
- {
- const auto q = NFC_Decompose.find(u);
- if(q==NFC_Decompose.end())
- {
- return std::make_pair(-1, -1);
- }else{
- return q->second;
- }
- }
-
- std::u32string decompose_full(unsigned u)
- {
- const std::pair<int,int> d = decompose(u);
- if(d.first<0)
- {
- return std::u32string( 1, char32_t(u) );
- }else{
- if(d.second<0)
- {
- return decompose_full(d.first);
- }
- }
- return decompose_full(d.first) + decompose_full(d.second);
- }
-
-
- // according to Unicode Standard, clause D108:
- bool isReorderablePair(unsigned a, unsigned b)
- {
- const unsigned cca = canonicalClass(a);
- const unsigned ccb = canonicalClass(b);
-
- return (cca > ccb) && (ccb>0);
- }
-
- // Unicode standard requires bubble sort, for stability reasons?
- void canonicalOrdering(std::u32string& us)
- {
- if(us.size()<2)
- return;
-
- for(unsigned n=us.size(); n>1; --n)
- for(unsigned i=0; i<n-1; ++i)
- {
- char32_t& a = us[i];
- char32_t& b = us[i+1];
- if( isReorderablePair(a,b) )
- {
- std::swap(a,b);
- }
- }
- }
-
- } // end of anonymous namespace
-
-
- namespace pEp {
-
- std::string escape_utf16(u16string_view s)
- {
- return escape(s);
- }
-
-
- std::ostream& operator<<(std::ostream& o, IsNFC is_nfc)
- {
- switch(is_nfc)
- {
- case IsNFC::No : return o << "No";
- case IsNFC::Maybe : return o << "Maybe";
- case IsNFC::Yes : return o << "Yes";
- }
- throw std::logic_error("Unknown value of IsNFC");
- }
-
-
- uint32_t parseUtf8(const char*& c, const char* end)
- {
- while(c<end)
- {
- const uint8_t u = uint8_t(*c);
-
- if (u<=0x7f)
- {
- return u;
- } else if (u<=0xBF)
- {
- throw cont_without_start(u);
- } else if (u<=0xC1) // 0xC0, 0xC1 would form "overlong sequences" and are therefore always illegal in UTF-8
- {
- throw no_unicode(u);
- } else if (u<=0xDF) // 2 octet sequence
- {
- ++c;
- if(c==end) throw unexpected_end(u);
- const uint8_t uu = uint8_t(*c);
- if((uu & 0xC0) != 0x80)
- {
- throw unexpected_end(uu);
- }
- return ((u & 0x1F) << 6) + (uu & 0x3F);
- } else if (u<=0xEF) // 3 octet sequence
- {
- ++c;
- if(c==end) throw unexpected_end(u);
- const uint8_t uu = uint8_t(*c);
- if((uu & 0xC0) != 0x80)
- {
- throw unexpected_end(uu);
- }
- ++c;
- if(c==end) throw unexpected_end(uu);
- const uint8_t uuu = uint8_t(*c);
- if((uuu & 0xC0) != 0x80)
- {
- throw unexpected_end(uuu);
- }
-
- const uint32_t ret = ((u & 0xF) << 12) + ((uu & 0x3F)<<6) + (uuu & 0x3F);
- if(ret<0x800) throw overlong_sequence(u, ret);
- if(ret>=0xD800 && ret<=0xDFFF) throw surrogate(u, ret);
- return ret;
- } else if (u<=0xF4) // 4 octet sequence
- {
- ++c;
- if(c==end) throw unexpected_end(u);
- const uint8_t uu = uint8_t(*c);
- if((uu & 0xC0) != 0x80)
- {
- throw unexpected_end(uu);
- }
- ++c;
- if(c==end) throw unexpected_end(uu);
- const uint8_t uuu = uint8_t(*c);
- if((uuu & 0xC0) != 0x80)
- {
- throw unexpected_end(uuu);
- }
- ++c;
- if(c==end) throw unexpected_end(uuu);
- const uint8_t uuuu = uint8_t(*c);
- if((uuuu & 0xC0) != 0x80)
- {
- throw unexpected_end(uuuu);
- }
-
- const uint32_t ret = ((u & 0xF) << 18) + ((uu & 0x3F)<<12) + ((uuu & 0x3F)<<6) + (uuuu & 0x3F);
- if(ret<0x10000) throw overlong_sequence(u, ret);
- if(ret>0x10FFFF) throw too_big(u, ret);
- return ret;
- } else
- {
- throw no_unicode(u);
- }
- }
-
- throw unexpected_end(-1);
- }
-
-
- uint32_t parseUtf16(const char16_t*& c, const char16_t* end)
- {
- while(c<end)
- {
- const char16_t u = *c;
- if(u<0xD800 || u>=0xE000)
- {
- return u;
- }else{
- if(u>=0xDC00)
- {
- throw unexpected_surrogate(u);
- }
- ++c;
- if(c==end) throw unexpected_end(u);
- const uint16_t low = *c;
- if(low < 0xDC00 || low > 0xDFFF)
- {
- throw missing_low_surrogate(low, u);
- }
- return (u-0xD800) * 1024 + (low-0xDC00) + 0x10000;
- }
- }
- throw unexpected_end(-1);
- }
-
- template<class CharT>
- uint32_t parseUtf(const CharT*& c, const CharT* end);
-
- template<>
- inline
- uint32_t parseUtf<char>(const char*& c, const char* end)
- {
- return parseUtf8(c,end);
- }
-
- template<>
- inline
- uint32_t parseUtf<char16_t>(const char16_t*& c, const char16_t* end)
- {
- return parseUtf16(c,end);
- }
-
-
- template<>
- void toUtf<char>(const char32_t c, std::string& ret)
- {
- if(c<=0x7F)
- {
- ret += char(c);
- }else if(c<=0x7FF)
- {
- ret += char( 0xC0 + (c>>6) );
- ret += char( 0x80 + (c & 63));
- }else if(c<=0xFFFF)
- {
- ret += char( 0xE0 + (c>>12) );
- ret += char( 0x80 + ((c>>6) & 63));
- ret += char( 0x80 + (c & 63));
- }else if(c<=0x10FFFF)
- {
- ret += char( 0xF0 + (c>>18) );
- ret += char( 0x80 + ((c>>12) & 63));
- ret += char( 0x80 + ((c>>6) & 63));
- ret += char( 0x80 + (c & 63));
- }else{
- throw too_big(0, c);
- }
- }
-
- template<>
- void toUtf<char16_t>(const char32_t c, std::u16string& ret)
- {
- if(c <= 0xFFFF)
- {
- if(c>=0xD800 && c<=0xDFFF)
- {
- throw unexpected_surrogate(c);
- }else{
- ret += char16_t(c);
- }
- }else{ // surrogate pair
- if(c>0x10FFFF)
- {
- throw too_big(0, c);
- }else{
- const uint32_t c_reduced = c - 0x10000;
- ret += char16_t(0xD800 + (c_reduced >> 10)); // High Surrogate
- ret += char16_t(0xDC00 + (c_reduced & 0x3FF)); // Low Surrogate
- }
- }
- }
-
// Encodes a whole UTF-32 string as UTF-8 (CharT = char) or UTF-16
// (CharT = char16_t) by appending each code point in turn.
template<class CharT>
std::basic_string<CharT> toUtf(const std::u32string& u32)
{
    std::basic_string<CharT> encoded;
    for(auto it = u32.begin(); it != u32.end(); ++it)
    {
        toUtf<CharT>(*it, encoded);
    }
    return encoded;
}
-
-
-
- illegal_utf::illegal_utf( string_view s, unsigned position, const std::string& reason)
- : std::runtime_error( "Illegal UTF-8 string \"" + escape(s) + "\" at position " + std::to_string(position) + ": " + reason )
- {}
-
- illegal_utf::illegal_utf( u16string_view s, unsigned position, const std::string& reason)
- : std::runtime_error( "Illegal UTF-16 string \"" + escape(s) + "\" at position " + std::to_string(position) + ": " + reason )
- {}
-
-
- illegal_utf::illegal_utf( const std::string& msg )
- : std::runtime_error( msg )
- {}
-
-
- void assert_utf8(string_view s)
- {
- const char* begin = s.data();
- const char* const end = s.data() + s.size();
- try
- {
- while(begin<end)
- {
- parseUtf8(begin, end); // ignore the output
- ++begin;
- }
- }
- catch(const utf_exception& e)
- {
- throw illegal_utf(s, begin - s.data(), e.reason());
- }
- }
-
-
- // creates a NFD string from s
- template<class CharT>
- std::u32string fromUtf_decompose(basic_string_view<CharT> s)
- {
- std::u32string u32s;
- u32s.reserve( static_cast<std::size_t>(s.size()*1.25) );
- const CharT* begin = s.data();
- const CharT* end = s.data() + s.size();
- for(; begin<end; ++begin)
- {
- unsigned u = parseUtf(begin, end);
- u32s += decompose_full(u);
- }
- canonicalOrdering(u32s); // works inplace.
- return u32s;
- }
-
-
// Tells whether the character at C is blocked from the starter at L
// (Unicode Standard, clause D115): there is a character B strictly between
// them that is a starter or has a combining class >= that of C.
//
// combine() overwrites characters it has already merged into the starter
// with the placeholder char32_t(-1). Such slots no longer hold a character
// and must not block anything; without skipping them the placeholder would
// look like a starter (it has no combining class entry, hence class 0) and
// every mark after the first successful composition would count as blocked.
template<class Iter>
bool blocked(Iter L, Iter C)
{
    const unsigned class_c = canonicalClass(*C);

    Iter B = L; ++B;
    for(;B!=C;++B)
    {
        if(static_cast<char32_t>(*B) == static_cast<char32_t>(-1))
            continue;   // already consumed by an earlier composition

        const unsigned class_b = canonicalClass(*B);
        if(class_b==0 || class_b>=class_c)
            return true;
    }
    return false;
}
-
-
- template<class Iter>
- void combine(std::u32string& nfc, Iter starter, Iter next_starter)
- {
- Iter c = starter; ++c;
- for(;c!=next_starter; ++c)
- {
- if(!blocked(starter, c))
- {
- const unsigned starter_u = *starter;
- const unsigned c_u = *c;
-
- auto q = NFC_Compose.find( std::make_pair(starter_u,c_u) );
- if(q!=NFC_Compose.end())
- {
- *starter = q->second;
- *c = -1;
- }
- }
- }
-
- // now add the remaining/changed characters to the NFC string:
- for(Iter c = starter; c!=next_starter; ++c)
- {
- if( int(*c) >= 0)
- {
- nfc += *c;
- }
- }
- }
-
- // the nfd string is changed during composing process. So it works on a copy or call with std::move().
- std::u32string createNFC(std::u32string nfd)
- {
- if(nfd.size()<=1)
- return nfd;
-
- std::u32string nfc;
- nfc.reserve(nfd.size());
- auto starter = nfd.begin();
- while( starter != nfd.end() )
- {
- if( canonicalClass(*starter)!=0 )
- {
- nfc += *starter;
- ++starter;
- }else{
- auto next_starter = std::find_if(starter+1, nfd.end(), [](char32_t c){return canonicalClass(c)==0;} );
- combine(nfc, starter, next_starter);
- starter = next_starter;
- }
- }
- return nfc;
- }
-
-
- template<class CharT>
- IsNFC isNFC_quick_check(basic_string_view<CharT> s)
- {
- const CharT* begin = s.data();
- const CharT* const end = s.data() + s.size();
- try
- {
- unsigned last_cc = 0;
- while(begin<end)
- {
- const uint32_t u = parseUtf(begin, end);
- const unsigned cc = canonicalClass(u);
- if( (cc!=0) && (last_cc > cc) )
- {
- return IsNFC::No;
- }
- if(NFC_No.count(u)) return IsNFC::No;
- if(NFC_Maybe.count(u)) return IsNFC::Maybe;
- ++begin;
- last_cc = cc;
- }
- }
- catch(const utf_exception& e)
- {
- throw illegal_utf(s, begin - s.data(), e.reason());
- }
- return IsNFC::Yes;
- }
-
-
- template<class CharT>
- bool isNFC(basic_string_view<CharT> s)
- {
- switch( isNFC_quick_check(s) )
- {
- case IsNFC::Yes : return true;
- case IsNFC::No : return false;
- case IsNFC::Maybe:
- {
- return s == toNFC(s); // very expensive!
- }
- }
-
- throw -1; // could never happen, but compiler is too dumb to see this.
- }
-
-
// Explicit instantiations of isNFC for both supported code unit types
// (char is parsed as UTF-8, char16_t as UTF-16).
template bool isNFC<char>(string_view);
template bool isNFC<char16_t>(u16string_view);

// should be unnecessary, but... well...
// NOTE(review): toNFC is only defined further down in this file, after
// these instantiation requests — confirm all supported compilers accept this.
template std::string toNFC<char>(string_view);
template std::u16string toNFC<char16_t>(u16string_view);
-
-
- bool isUtf8(const char* begin, const char* end)
- try{
- for(; begin<end; ++begin)
- {
- (void)parseUtf8(begin, end);
- }
- return true;
- }catch(const illegal_utf&)
- {
- return false;
- }
-
-
- // s is ''moved'' to the return value if possible so no copy is done here.
- template<class CharT>
- std::basic_string<CharT> toNFC(basic_string_view<CharT> s)
- {
- if(isNFC_quick_check(s)==IsNFC::Yes)
- return std::basic_string<CharT>{s};
-
- return toUtf<CharT>( createNFC( fromUtf_decompose(s) ));
- }
-
-
- // used only to initialize the NFC Compose mapping:
- std::map< std::pair<unsigned, unsigned>, unsigned> generate_nfc_compose()
- {
- std::map< std::pair<unsigned, unsigned>, unsigned> m;
- for(const auto& decomp : NFC_Decompose)
- {
- if(decomp.second.second >= 0) // skip singleton decompositions
- {
- m[ decomp.second ] = decomp.first;
- }
- }
-
- return m;
- }
-
- } // end of namespace pEp
|