implement parseUtf16().

master
roker 2 years ago
parent b9a0dde181
commit 1a9c3fc1a8

@ -29,66 +29,93 @@ namespace
return buf;
}
// Renders a 16-bit code unit as "0x" followed by at least four
// uppercase, zero-padded hexadecimal digits (e.g. 0xD800).
std::string hex16(char16_t u)
{
    // 16 zero-initialised bytes; at most 15 are handed to snprintf, so the
    // final byte always stays a terminating NUL.
    char text[16] = {0};
    snprintf(text, 15, "0x%04X", u);
    return std::string(text);
}
// Abstract base of the decoding errors defined below. It stores the offending
// code unit and requires every subclass to provide a human-readable reason().
// The constructor argument is kept twice: `octet` receives it converted to
// 8 bits, `value` keeps all 16 bits.
class utf8_exception
class utf_exception
{
public:
utf8_exception(uint8_t u) : octet(u) {}
virtual ~utf8_exception() = default;
utf_exception(uint16_t u) : octet(u), value(u) {}
virtual ~utf_exception() = default;
// Returns the textual description of the error; implemented by each subclass.
virtual std::string reason() const = 0;
uint8_t octet;
uint16_t value;
};
// Error for a continuation octet that appears without a start octet.
// reason() embeds the stored octet, formatted by o2h().
class cont_without_start : public utf8_exception
class cont_without_start : public utf_exception
{
public:
cont_without_start(uint8_t u) : utf8_exception(u) {}
cont_without_start(uint8_t u) : utf_exception(u) {}
std::string reason() const override { return "Continuation octet " + o2h(octet) + " without start octet"; }
};
// Error for an overlong sequence. Besides the octet handed to the base class,
// it keeps the value passed as `u` in `unicode`, which reason() prints via u2h().
class overlong_sequence : public utf8_exception
class overlong_sequence : public utf_exception
{
public:
overlong_sequence(uint8_t octet, unsigned u) : utf8_exception(octet), unicode(u) {}
overlong_sequence(uint8_t octet, unsigned u) : utf_exception(octet), unicode(u) {}
std::string reason() const override { return "Overlong sequence for " + u2h(unicode); }
unsigned unicode;
};
// Error for input that ends unexpectedly. reason() returns a fixed text;
// the unit stored in the base class is not part of the message.
class unexpected_end : public utf8_exception
class unexpected_end : public utf_exception
{
public:
unexpected_end(uint8_t u) : utf8_exception(u) {}
unexpected_end(uint8_t u) : utf_exception(u) {}
std::string reason() const override { return "Unexpected end of string"; }
};
// Error for a UTF-16 surrogate value found in UTF-8-encoded input.
// `surr` holds the value passed as `s`; reason() prints it via u2h().
class surrogate : public utf8_exception
class surrogate : public utf_exception
{
public:
surrogate(uint8_t u, unsigned s) : utf8_exception(u), surr(s) {}
surrogate(uint8_t u, unsigned s) : utf_exception(u), surr(s) {}
std::string reason() const override { return "UTF-8-encoded UTF-16 surrogate " + u2h(surr) + " detected"; }
private:
unsigned surr;
};
// Error for an octet that is illegal in UTF-8.
// reason() embeds the stored octet, formatted by o2h().
class no_unicode : public utf8_exception
class no_unicode : public utf_exception
{
public:
explicit no_unicode(uint8_t _octet) : utf8_exception(_octet) {}
explicit no_unicode(uint8_t _octet) : utf_exception(_octet) {}
std::string reason() const override { return "Octet " + o2h(octet) + " is illegal in UTF-8"; }
};
// Error for a value that is too big for Unicode. `unicode` keeps the value
// passed as `u`, which reason() prints via u2h().
class too_big : public utf8_exception
class too_big : public utf_exception
{
public:
explicit too_big(uint8_t _octet, unsigned u) : utf8_exception(_octet), unicode(u) {}
explicit too_big(uint8_t _octet, unsigned u) : utf_exception(_octet), unicode(u) {}
std::string reason() const override { return "Value " + u2h(unicode) + " is too big for Unicode"; }
unsigned unicode;
};
// Error for a UTF-16 low surrogate that appears where a high surrogate or a
// non-surrogate code unit was expected. The offending code unit is kept in
// the base class's `value` and printed by reason() via hex16().
class unexpected_low_surrogate : public utf_exception
{
public:
    explicit unexpected_low_surrogate(char16_t c) : utf_exception(c) {}
    // Message spelling fixed: "surogate" -> "surrogate".
    std::string reason() const override { return "Unexpected low surrogate " + hex16(value); }
};
// Error for a UTF-16 high surrogate that is followed by a code unit which is
// not a low surrogate.
//   c     - the code unit found after the high surrogate (kept in the base
//           class's `value`)
//   _surr - the preceding high surrogate (kept in `surr`)
class missing_low_surrogate : public utf_exception
{
public:
    explicit missing_low_surrogate(char16_t c, char16_t _surr) : utf_exception(c), surr(_surr) {}
    // Message spelling fixed: "surogate" -> "surrogate".
    std::string reason() const override { return "Non-low surrogate value " + hex16(value) + " is unexpected after high surrogate " + hex16(surr); }
private:
    char16_t surr;
};
std::string escape(pEp::string_view s)
{
std::string ret; ret.reserve(s.size() + 16 );
@ -274,6 +301,33 @@ uint32_t parseUtf8(const char*& c, const char* end)
}
uint32_t parseUtf16(const char16_t*& c, const char16_t* end)
{
while(c<end)
{
const char16_t u = *c;
if(u<0xD800 || u>=0xE000)
{
return u;
}else{
if(u>=0xDC00)
{
throw unexpected_low_surrogate(u);
}
++c;
if(c==end) throw unexpected_end(u);
const uint16_t low = *c;
if(low < 0xDC00 || low > 0xDFFF)
{
throw missing_low_surrogate(low, u);
}
return (u-0xD800) * 1024 + (low-0xDC00) + 0x10000;
}
}
throw unexpected_end(-1);
}
void toUtf8(const char32_t c, std::string& ret)
{
if(c<=0x7F)
@ -333,7 +387,7 @@ void assert_utf8(string_view s)
++begin;
}
}
catch(const utf8_exception& e)
catch(const utf_exception& e)
{
throw illegal_utf8(s, begin - s.data(), e.reason());
}
@ -446,7 +500,7 @@ IsNFC isNFC_quick_check(string_view s)
last_cc = cc;
}
}
catch(const utf8_exception& e)
catch(const utf_exception& e)
{
throw illegal_utf8(s, begin - s.data(), e.reason());
}

Loading…
Cancel
Save