176 lines
3.5 KiB
C++
176 lines
3.5 KiB
C++
// converts a C++ string into NFC form
|
|
|
|
#include "nfc.hh"

#include <cstdint>
#include <cstdio>
#include <string>
|
|
|
|
|
|
namespace
{

// File-local exception types thrown by getUni(). Each one carries the octet
// that triggered the error.

// A continuation octet (0x80..0xBF) was found where a sequence has to start.
class cont_without_start
{
public:
    explicit cont_without_start(uint8_t u) : octet(u) {}
    uint8_t octet;
};

// The sequence encodes a code point that would fit into a shorter sequence.
class overlong_sequence
{
public:
    explicit overlong_sequence(uint8_t u) : octet(u) {}
    uint8_t octet;
};

// The input ended, or a non-continuation octet appeared, in the middle of a
// multi-octet sequence.
class unexpected_end
{
public:
    explicit unexpected_end(uint8_t u) : octet(u) {}
    uint8_t octet;
};

// The octet(s) cannot encode a Unicode scalar value: lead octets 0xF5..0xFF,
// values above U+10FFFF and the surrogate range U+D800..U+DFFF.
class no_unicode
{
public:
    explicit no_unicode(uint8_t u) : octet(u) {}
    uint8_t octet;
};


// Returns a copy of s in which every octet outside the printable ASCII range
// 32..126 is replaced by its two-digit lowercase hex value wrapped in « ».
std::string escape(const std::string& s)
{
    std::string ret;
    ret.reserve(s.size() + 16);
    for(const char c : s)
    {
        const uint8_t u = static_cast<uint8_t>(c);
        if(u >= 32 && u <= 126)
        {
            ret += c;
        }else{
            char buf[16];
            // %x expects an unsigned int; sizeof keeps the limit tied to the buffer.
            std::snprintf(buf, sizeof(buf), "«%02x»", static_cast<unsigned>(u));
            ret += buf;
        }
    }
    return ret;
}


// Advances c by one and returns the continuation octet found there.
// 'previous' is the octet read before it; it is reported when the input ends.
// Throws unexpected_end if c reaches end or the octet is not of the form 10xxxxxx.
uint8_t readContinuation(const char*& c, const char* end, uint8_t previous)
{
    ++c;
    if(c == end) throw unexpected_end(previous);

    const uint8_t octet = static_cast<uint8_t>(*c);
    if((octet & 0xC0) != 0x80)
    {
        throw unexpected_end(octet);
    }
    return octet;
}


// Decodes the UTF-8 sequence starting at c and returns its code point.
//
// c   : in/out cursor. On success it is left on the LAST octet of the decoded
//       sequence (not one past it), so the caller has to increment it before
//       decoding the next code point. When an exception is thrown, c stays
//       where decoding stopped.
// end : one past the last readable octet.
//
// Throws unexpected_end, cont_without_start, overlong_sequence or no_unicode
// for malformed input. An empty range (c >= end) throws unexpected_end(0xFF).
uint32_t getUni(const char*& c, const char* end)
{
    if(c >= end) throw unexpected_end(0xFF);

    const uint8_t u = static_cast<uint8_t>(*c);

    // Plain comparisons instead of GNU "case a ... b" ranges keep this standard C++.
    if(u <= 0x7F) return u;                          // single octet (ASCII)
    if(u <= 0xBF) throw cont_without_start(u);       // 0x80..0xBF
    if(u <= 0xC1) throw overlong_sequence(u);        // 0xC0..0xC1 can only encode < 0x80

    if(u <= 0xDF) // 2 octet sequence
    {
        const uint8_t uu = readContinuation(c, end, u);
        return (uint32_t(u & 0x1F) << 6) + (uu & 0x3F);
    }

    if(u <= 0xEF) // 3 octet sequence
    {
        const uint8_t uu  = readContinuation(c, end, u);
        const uint8_t uuu = readContinuation(c, end, uu);

        const uint32_t ret = (uint32_t(u & 0x0F) << 12) + (uint32_t(uu & 0x3F) << 6) + (uuu & 0x3F);
        if(ret < 0x800) throw overlong_sequence(u);
        // UTF-16 surrogates are not Unicode scalar values and must not appear in UTF-8.
        if(ret >= 0xD800 && ret <= 0xDFFF) throw no_unicode(u);
        return ret;
    }

    if(u <= 0xF4) // 4 octet sequence
    {
        const uint8_t uu   = readContinuation(c, end, u);
        const uint8_t uuu  = readContinuation(c, end, uu);
        const uint8_t uuuu = readContinuation(c, end, uuu);

        const uint32_t ret = (uint32_t(u & 0x07) << 18) + (uint32_t(uu & 0x3F) << 12)
                           + (uint32_t(uuu & 0x3F) << 6) + (uuuu & 0x3F);
        if(ret < 0x10000)  throw overlong_sequence(u);
        if(ret > 0x10FFFF) throw no_unicode(u);
        return ret;
    }

    // 0xF5..0xFF never occur in UTF-8.
    throw no_unicode(u);
}

} // end of anonymous namespace
|
|
|
|
|
|
// Builds the error message from the offending string (passed through escape())
// and the position that is reported for the error.
illegal_utf8::illegal_utf8( const std::string& s, unsigned position)
: std::runtime_error(
      std::string("Illegal UTF-8 string \"")
          .append(escape(s))
          .append("\" at position ")
          .append(std::to_string(position))
          .append(".")
  )
{
}
|
|
|
|
|
|
illegal_utf8::illegal_utf8( const std::string& msg )
|
|
: std::runtime_error( msg )
|
|
{}
|
|
|
|
|
|
bool isNFC(const std::string& s)
|
|
{
|
|
const char* begin = s.data();
|
|
const char* const end = s.data() + s.size();
|
|
while(begin<end)
|
|
{
|
|
const uint32_t u = getUni(begin, end);
|
|
if(u>=0x300 && u<0x30A) return false; // That's bullshit. Use a better algorithm!
|
|
++begin;
|
|
}
|
|
return true;
|
|
}
|
|
|
|
// s is ''moved'' to the return value if possible so no copy is done here.
|
|
std::string toNFC(std::string s)
|
|
{
|
|
if(isNFC(s))
|
|
return std::move(s);
|
|
|
|
std::string ret;
|
|
|
|
// TODO:
|
|
|
|
return ret;
|
|
}
|