|
|
|
@ -29,66 +29,93 @@ namespace
|
|
|
|
|
return buf;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// Formats a 16-bit code unit as "0x" followed by four upper-case hex digits,
// e.g. u'A' yields "0x0041".
std::string hex16(char16_t u)
{
    // "0x" + 4 hex digits + terminating NUL needs 7 bytes; 16 leaves headroom.
    char buf[16] = {0};
    // Bound the write by the real buffer size, and hand %X the unsigned int it
    // expects instead of relying on the default promotion of char16_t.
    snprintf(buf, sizeof(buf), "0x%04X", static_cast<unsigned>(u));
    return buf;
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
// Abstract base class for UTF decoding errors.
// Stores the offending code unit and lets every concrete error describe
// itself through reason().
class utf_exception
{
public:
    // u is the offending code unit. `value` keeps all 16 bits, `octet` keeps
    // only the low 8 bits (which is the whole unit for 8-bit input).
    utf_exception(uint16_t u) : octet(static_cast<uint8_t>(u)), value(u) {}
    virtual ~utf_exception() = default;

    // Human-readable description of what went wrong.
    virtual std::string reason() const = 0;

    uint8_t  octet;
    uint16_t value;
};
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class cont_without_start : public utf8_exception
|
|
|
|
|
class cont_without_start : public utf_exception
|
|
|
|
|
{
|
|
|
|
|
public:
|
|
|
|
|
cont_without_start(uint8_t u) : utf8_exception(u) {}
|
|
|
|
|
cont_without_start(uint8_t u) : utf_exception(u) {}
|
|
|
|
|
std::string reason() const override { return "Continuation octet " + o2h(octet) + " without start octet"; }
|
|
|
|
|
};
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class overlong_sequence : public utf8_exception
|
|
|
|
|
class overlong_sequence : public utf_exception
|
|
|
|
|
{
|
|
|
|
|
public:
|
|
|
|
|
overlong_sequence(uint8_t octet, unsigned u) : utf8_exception(octet), unicode(u) {}
|
|
|
|
|
overlong_sequence(uint8_t octet, unsigned u) : utf_exception(octet), unicode(u) {}
|
|
|
|
|
std::string reason() const override { return "Overlong sequence for " + u2h(unicode); }
|
|
|
|
|
unsigned unicode;
|
|
|
|
|
};
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class unexpected_end : public utf8_exception
|
|
|
|
|
class unexpected_end : public utf_exception
|
|
|
|
|
{
|
|
|
|
|
public:
|
|
|
|
|
unexpected_end(uint8_t u) : utf8_exception(u) {}
|
|
|
|
|
unexpected_end(uint8_t u) : utf_exception(u) {}
|
|
|
|
|
std::string reason() const override { return "Unexpected end of string"; }
|
|
|
|
|
};
|
|
|
|
|
|
|
|
|
|
class surrogate : public utf8_exception
|
|
|
|
|
class surrogate : public utf_exception
|
|
|
|
|
{
|
|
|
|
|
public:
|
|
|
|
|
surrogate(uint8_t u, unsigned s) : utf8_exception(u), surr(s) {}
|
|
|
|
|
surrogate(uint8_t u, unsigned s) : utf_exception(u), surr(s) {}
|
|
|
|
|
std::string reason() const override { return "UTF-8-encoded UTF-16 surrogate " + u2h(surr) + " detected"; }
|
|
|
|
|
private:
|
|
|
|
|
unsigned surr;
|
|
|
|
|
};
|
|
|
|
|
|
|
|
|
|
class no_unicode : public utf8_exception
|
|
|
|
|
class no_unicode : public utf_exception
|
|
|
|
|
{
|
|
|
|
|
public:
|
|
|
|
|
explicit no_unicode(uint8_t _octet) : utf8_exception(_octet) {}
|
|
|
|
|
explicit no_unicode(uint8_t _octet) : utf_exception(_octet) {}
|
|
|
|
|
std::string reason() const override { return "Octet " + o2h(octet) + " is illegal in UTF-8"; }
|
|
|
|
|
};
|
|
|
|
|
|
|
|
|
|
class too_big : public utf8_exception
|
|
|
|
|
class too_big : public utf_exception
|
|
|
|
|
{
|
|
|
|
|
public:
|
|
|
|
|
explicit too_big(uint8_t _octet, unsigned u) : utf8_exception(_octet), unicode(u) {}
|
|
|
|
|
explicit too_big(uint8_t _octet, unsigned u) : utf_exception(_octet), unicode(u) {}
|
|
|
|
|
std::string reason() const override { return "Value " + u2h(unicode) + " is too big for Unicode"; }
|
|
|
|
|
unsigned unicode;
|
|
|
|
|
};
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class unexpected_low_surrogate : public utf_exception
|
|
|
|
|
{
|
|
|
|
|
public:
|
|
|
|
|
explicit unexpected_low_surrogate(char16_t c) : utf_exception(c) {}
|
|
|
|
|
std::string reason() const override { return "Unexpected low surogate " + hex16(value); }
|
|
|
|
|
};
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class missing_low_surrogate : public utf_exception
|
|
|
|
|
{
|
|
|
|
|
public:
|
|
|
|
|
explicit missing_low_surrogate(char16_t c, char16_t _surr) : utf_exception(c), surr(_surr) {}
|
|
|
|
|
std::string reason() const override { return "Non-low surrogate value " + hex16(value) + " is unexpected after high surogate " + hex16(surr); }
|
|
|
|
|
private:
|
|
|
|
|
char16_t surr;
|
|
|
|
|
};
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
std::string escape(pEp::string_view s)
|
|
|
|
|
{
|
|
|
|
|
std::string ret; ret.reserve(s.size() + 16 );
|
|
|
|
@ -274,6 +301,33 @@ uint32_t parseUtf8(const char*& c, const char* end)
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
uint32_t parseUtf16(const char16_t*& c, const char16_t* end)
|
|
|
|
|
{
|
|
|
|
|
while(c<end)
|
|
|
|
|
{
|
|
|
|
|
const char16_t u = *c;
|
|
|
|
|
if(u<0xD800 || u>=0xE000)
|
|
|
|
|
{
|
|
|
|
|
return u;
|
|
|
|
|
}else{
|
|
|
|
|
if(u>=0xDC00)
|
|
|
|
|
{
|
|
|
|
|
throw unexpected_low_surrogate(u);
|
|
|
|
|
}
|
|
|
|
|
++c;
|
|
|
|
|
if(c==end) throw unexpected_end(u);
|
|
|
|
|
const uint16_t low = *c;
|
|
|
|
|
if(low < 0xDC00 || low > 0xDFFF)
|
|
|
|
|
{
|
|
|
|
|
throw missing_low_surrogate(low, u);
|
|
|
|
|
}
|
|
|
|
|
return (u-0xD800) * 1024 + (low-0xDC00) + 0x10000;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
throw unexpected_end(-1);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
void toUtf8(const char32_t c, std::string& ret)
|
|
|
|
|
{
|
|
|
|
|
if(c<=0x7F)
|
|
|
|
@ -333,7 +387,7 @@ void assert_utf8(string_view s)
|
|
|
|
|
++begin;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
catch(const utf8_exception& e)
|
|
|
|
|
catch(const utf_exception& e)
|
|
|
|
|
{
|
|
|
|
|
throw illegal_utf8(s, begin - s.data(), e.reason());
|
|
|
|
|
}
|
|
|
|
@ -446,7 +500,7 @@ IsNFC isNFC_quick_check(string_view s)
|
|
|
|
|
last_cc = cc;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
catch(const utf8_exception& e)
|
|
|
|
|
catch(const utf_exception& e)
|
|
|
|
|
{
|
|
|
|
|
throw illegal_utf8(s, begin - s.data(), e.reason());
|
|
|
|
|
}
|
|
|
|
|