A C++ wrapper for the basic C datatypes defined by the pEpEngine.
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

563 lines
12 KiB

  1. // This file is under GNU General Public License 3.0
  2. // see LICENSE.txt
  3. // converts a C++ string into NFC form
  4. #include "nfc.hh"
  5. #include <cstdint>
  6. #include <set>
  7. #include <ostream>
  8. #include <algorithm>
  9. #include "nfc_sets.hh"
  10. namespace
  11. {
  // Renders a code point as the text "<U+XXXX>": upper-case hexadecimal,
  // zero-padded to at least four digits.
  std::string u2h(unsigned u)
  {
      char text[16] = {0};
      snprintf(text, 15, "<U+%04X>", u);
      return std::string(text);
  }
  // Renders a single octet as the text "0xHH" (two upper-case hex digits).
  std::string o2h(uint8_t octet)
  {
      char text[16] = {0};
      snprintf(text, 15, "0x%02hhX", octet);
      return std::string(text);
  }
  // Renders a 16-bit code unit as the text "0xHHHH" (four upper-case hex digits).
  std::string hex16(char16_t u)
  {
      char text[16] = {0};
      // the cast only spells out the promotion that the variadic call performs anyway
      snprintf(text, 15, "0x%04X", static_cast<unsigned>(u));
      return std::string(text);
  }
  // Abstract base of all parse errors thrown inside this file.
  // It stores the offending input unit twice: truncated to 8 bit (`octet`)
  // and as the full 16-bit value (`value`). Derived classes supply the
  // human-readable explanation via reason().
  class utf_exception
  {
  public:
      utf_exception(uint16_t u)
      : octet( static_cast<uint8_t>(u) )  // keeps only the low 8 bits
      , value( u )
      {}

      virtual ~utf_exception() = default;

      // human-readable description of what went wrong
      virtual std::string reason() const = 0;

      uint8_t  octet;
      uint16_t value;
  };
  42. class cont_without_start : public utf_exception
  43. {
  44. public:
  45. cont_without_start(uint8_t u) : utf_exception(u) {}
  46. std::string reason() const override { return "Continuation octet " + o2h(octet) + " without start octet"; }
  47. };
  48. class overlong_sequence : public utf_exception
  49. {
  50. public:
  51. overlong_sequence(uint8_t octet, unsigned u) : utf_exception(octet), unicode(u) {}
  52. std::string reason() const override { return "Overlong sequence for " + u2h(unicode); }
  53. unsigned unicode;
  54. };
  55. class unexpected_end : public utf_exception
  56. {
  57. public:
  58. unexpected_end(uint8_t u) : utf_exception(u) {}
  59. std::string reason() const override { return "Unexpected end of string"; }
  60. };
  61. class surrogate : public utf_exception
  62. {
  63. public:
  64. surrogate(uint8_t u, unsigned s) : utf_exception(u), surr(s) {}
  65. std::string reason() const override { return "UTF-8-encoded UTF-16 surrogate " + u2h(surr) + " detected"; }
  66. private:
  67. unsigned surr;
  68. };
  69. class no_unicode : public utf_exception
  70. {
  71. public:
  72. explicit no_unicode(uint8_t _octet) : utf_exception(_octet) {}
  73. std::string reason() const override { return "Octet " + o2h(octet) + " is illegal in UTF-8"; }
  74. };
  75. class too_big : public utf_exception
  76. {
  77. public:
  78. explicit too_big(uint8_t _octet, unsigned u) : utf_exception(_octet), unicode(u) {}
  79. std::string reason() const override { return "Value " + u2h(unicode) + " is too big for Unicode"; }
  80. unsigned unicode;
  81. };
  82. class unexpected_low_surrogate : public utf_exception
  83. {
  84. public:
  85. explicit unexpected_low_surrogate(char16_t c) : utf_exception(c) {}
  86. std::string reason() const override { return "Unexpected low surogate " + hex16(value); }
  87. };
  88. class missing_low_surrogate : public utf_exception
  89. {
  90. public:
  91. explicit missing_low_surrogate(char16_t c, char16_t _surr) : utf_exception(c), surr(_surr) {}
  92. std::string reason() const override { return "Non-low surrogate value " + hex16(value) + " is unexpected after high surogate " + hex16(surr); }
  93. private:
  94. char16_t surr;
  95. };
  96. std::string escape(pEp::string_view s)
  97. {
  98. std::string ret; ret.reserve(s.size() + 16 );
  99. for(char c : s)
  100. {
  101. const uint8_t u = c;
  102. if(u>=32 && u<=126)
  103. {
  104. ret += c;
  105. }else{
  106. char buf[16];
  107. snprintf(buf,15, "«%02x»", u );
  108. ret += buf;
  109. }
  110. }
  111. return ret;
  112. }
  113. // returns the "CanonicalCombinincClass" of the given Unicode codpoint u
  114. unsigned canonicalClass(unsigned u)
  115. {
  116. const auto q = NFC_CombiningClass.find(u);
  117. if(q==NFC_CombiningClass.end())
  118. {
  119. return 0; // not found in map.
  120. }else{
  121. return q->second;
  122. }
  123. }
  124. std::pair<int,int> decompose(unsigned u)
  125. {
  126. const auto q = NFC_Decompose.find(u);
  127. if(q==NFC_Decompose.end())
  128. {
  129. return std::make_pair(-1, -1);
  130. }else{
  131. return q->second;
  132. }
  133. }
  134. std::u32string decompose_full(unsigned u)
  135. {
  136. const std::pair<int,int> d = decompose(u);
  137. if(d.first<0)
  138. {
  139. return std::u32string( 1, char32_t(u) );
  140. }else{
  141. if(d.second<0)
  142. {
  143. return decompose_full(d.first);
  144. }
  145. }
  146. return decompose_full(d.first) + decompose_full(d.second);
  147. }
  148. // according to Unicode Standard, clause D108:
  149. bool isReorderablePair(unsigned a, unsigned b)
  150. {
  151. const unsigned cca = canonicalClass(a);
  152. const unsigned ccb = canonicalClass(b);
  153. return (cca > ccb) && (ccb>0);
  154. }
  155. // Unicode standard requires bubble sort, for stability reasons?
  156. void canonicalOrdering(std::u32string& us)
  157. {
  158. if(us.size()<2)
  159. return;
  160. for(unsigned n=us.size(); n>1; --n)
  161. for(unsigned i=0; i<n-1; ++i)
  162. {
  163. char32_t& a = us[i];
  164. char32_t& b = us[i+1];
  165. if( isReorderablePair(a,b) )
  166. {
  167. std::swap(a,b);
  168. }
  169. }
  170. }
  171. } // end of anonymous namespace
  172. namespace pEp {
  173. std::ostream& operator<<(std::ostream& o, IsNFC is_nfc)
  174. {
  175. switch(is_nfc)
  176. {
  177. case IsNFC::No : return o << "No";
  178. case IsNFC::Maybe : return o << "Maybe";
  179. case IsNFC::Yes : return o << "Yes";
  180. }
  181. throw std::logic_error("Unknown value of IsNFC");
  182. }
  183. uint32_t parseUtf8(const char*& c, const char* end)
  184. {
  185. while(c<end)
  186. {
  187. const uint8_t u = uint8_t(*c);
  188. if (u<=0x7f)
  189. {
  190. return u;
  191. } else if (u<=0xBF)
  192. {
  193. throw cont_without_start(u);
  194. } else if (u<=0xC1) // 0xC0, 0xC1 would form "overlong sequences" and are therefore always illegal in UTF-8
  195. {
  196. throw no_unicode(u);
  197. } else if (u<=0xDF) // 2 octet sequence
  198. {
  199. ++c;
  200. if(c==end) throw unexpected_end(u);
  201. const uint8_t uu = uint8_t(*c);
  202. if((uu & 0xC0) != 0x80)
  203. {
  204. throw unexpected_end(uu);
  205. }
  206. return ((u & 0x1F) << 6) + (uu & 0x3F);
  207. } else if (u<=0xEF) // 3 octet sequence
  208. {
  209. ++c;
  210. if(c==end) throw unexpected_end(u);
  211. const uint8_t uu = uint8_t(*c);
  212. if((uu & 0xC0) != 0x80)
  213. {
  214. throw unexpected_end(uu);
  215. }
  216. ++c;
  217. if(c==end) throw unexpected_end(uu);
  218. const uint8_t uuu = uint8_t(*c);
  219. if((uuu & 0xC0) != 0x80)
  220. {
  221. throw unexpected_end(uuu);
  222. }
  223. const uint32_t ret = ((u & 0xF) << 12) + ((uu & 0x3F)<<6) + (uuu & 0x3F);
  224. if(ret<0x800) throw overlong_sequence(u, ret);
  225. if(ret>=0xD800 && ret<=0xDFFF) throw surrogate(u, ret);
  226. return ret;
  227. } else if (u<=0xF4) // 4 octet sequence
  228. {
  229. ++c;
  230. if(c==end) throw unexpected_end(u);
  231. const uint8_t uu = uint8_t(*c);
  232. if((uu & 0xC0) != 0x80)
  233. {
  234. throw unexpected_end(uu);
  235. }
  236. ++c;
  237. if(c==end) throw unexpected_end(uu);
  238. const uint8_t uuu = uint8_t(*c);
  239. if((uuu & 0xC0) != 0x80)
  240. {
  241. throw unexpected_end(uuu);
  242. }
  243. ++c;
  244. if(c==end) throw unexpected_end(uuu);
  245. const uint8_t uuuu = uint8_t(*c);
  246. if((uuuu & 0xC0) != 0x80)
  247. {
  248. throw unexpected_end(uuuu);
  249. }
  250. const uint32_t ret = ((u & 0xF) << 18) + ((uu & 0x3F)<<12) + ((uuu & 0x3F)<<6) + (uuuu & 0x3F);
  251. if(ret<0x10000) throw overlong_sequence(u, ret);
  252. if(ret>0x10FFFF) throw too_big(u, ret);
  253. return ret;
  254. } else
  255. {
  256. throw no_unicode(u);
  257. }
  258. }
  259. throw unexpected_end(-1);
  260. }
  261. uint32_t parseUtf16(const char16_t*& c, const char16_t* end)
  262. {
  263. while(c<end)
  264. {
  265. const char16_t u = *c;
  266. if(u<0xD800 || u>=0xE000)
  267. {
  268. return u;
  269. }else{
  270. if(u>=0xDC00)
  271. {
  272. throw unexpected_low_surrogate(u);
  273. }
  274. ++c;
  275. if(c==end) throw unexpected_end(u);
  276. const uint16_t low = *c;
  277. if(low < 0xDC00 || low > 0xDFFF)
  278. {
  279. throw missing_low_surrogate(low, u);
  280. }
  281. return (u-0xD800) * 1024 + (low-0xDC00) + 0x10000;
  282. }
  283. }
  284. throw unexpected_end(-1);
  285. }
  286. void toUtf8(const char32_t c, std::string& ret)
  287. {
  288. if(c<=0x7F)
  289. {
  290. ret += char(c);
  291. }else if(c<=0x7FF)
  292. {
  293. ret += char( 0xC0 + (c>>6) );
  294. ret += char( 0x80 + (c & 63));
  295. }else if(c<=0xFFFF)
  296. {
  297. ret += char( 0xE0 + (c>>12) );
  298. ret += char( 0x80 + ((c>>6) & 63));
  299. ret += char( 0x80 + (c & 63));
  300. }else if(c<=0x10FFFF)
  301. {
  302. ret += char( 0xF0 + (c>>18) );
  303. ret += char( 0x80 + ((c>>12) & 63));
  304. ret += char( 0x80 + ((c>>6) & 63));
  305. ret += char( 0x80 + (c & 63));
  306. }else{
  307. throw too_big(0, c);
  308. }
  309. }
  310. std::string toUtf8(const std::u32string& u32)
  311. {
  312. std::string ret;
  313. for(char32_t c : u32)
  314. {
  315. toUtf8(c, ret);
  316. }
  317. return ret;
  318. }
  319. illegal_utf8::illegal_utf8( string_view s, unsigned position, const std::string& reason)
  320. : std::runtime_error( "Illegal UTF-8 string \"" + escape(s) + "\" at position " + std::to_string(position) + ": " + reason )
  321. {}
  322. illegal_utf8::illegal_utf8( const std::string& msg )
  323. : std::runtime_error( msg )
  324. {}
  325. void assert_utf8(string_view s)
  326. {
  327. const char* begin = s.data();
  328. const char* const end = s.data() + s.size();
  329. try
  330. {
  331. while(begin<end)
  332. {
  333. parseUtf8(begin, end); // ignore the output
  334. ++begin;
  335. }
  336. }
  337. catch(const utf_exception& e)
  338. {
  339. throw illegal_utf8(s, begin - s.data(), e.reason());
  340. }
  341. }
  342. // creates a NFD string from s
  343. std::u32string fromUtf8_decompose(string_view s)
  344. {
  345. std::u32string u32s;
  346. u32s.reserve( static_cast<std::size_t>(s.size()*1.25) );
  347. const char* begin = s.data();
  348. const char* end = s.data() + s.size();
  349. for(; begin<end; ++begin)
  350. {
  351. unsigned u = parseUtf8(begin, end);
  352. u32s += decompose_full(u);
  353. }
  354. canonicalOrdering(u32s); // works inplace.
  355. return u32s;
  356. }
  // Tells whether the character at C is blocked from the starter at L, i.e.
  // whether some character B strictly between L and C is a starter
  // (combining class 0) or has the same combining class as C.
  //
  // combine() overwrites characters it has already merged into the starter
  // with a negative marker value. Such entries are no longer part of the
  // text and must be skipped here: the marker has no combining class entry,
  // so it would be taken for a starter and block every following mark
  // (e.g. U+0061 U+0323 U+0302 would never reach U+1EAD).
  template<class Iter>
  bool blocked(Iter L, Iter C)
  {
      const unsigned class_c = canonicalClass(*C);
      Iter B = L; ++B;
      for(;B!=C;++B)
      {
          if( int(*B) < 0 )
              continue; // consumed by an earlier composition

          const unsigned class_b = canonicalClass(*B);
          if(class_b==0 || class_b==class_c)
              return true;
      }
      return false;
  }
  368. template<class Iter>
  369. void combine(std::u32string& nfc, Iter starter, Iter next_starter)
  370. {
  371. Iter c = starter; ++c;
  372. for(;c!=next_starter; ++c)
  373. {
  374. if(!blocked(starter, c))
  375. {
  376. const unsigned starter_u = *starter;
  377. const unsigned c_u = *c;
  378. auto q = NFC_Compose.find( std::make_pair(starter_u,c_u) );
  379. if(q!=NFC_Compose.end())
  380. {
  381. *starter = q->second;
  382. *c = -1;
  383. }
  384. }
  385. }
  386. // now add the remaining/changed characters to the NFC string:
  387. for(Iter c = starter; c!=next_starter; ++c)
  388. {
  389. if( int(*c) >= 0)
  390. {
  391. nfc += *c;
  392. }
  393. }
  394. }
  395. // the nfd string is changed during composing process. So it works on a copy or call with std::move().
  396. std::u32string createNFC(std::u32string nfd)
  397. {
  398. if(nfd.size()<=1)
  399. return nfd;
  400. std::u32string nfc;
  401. nfc.reserve(nfd.size());
  402. auto starter = nfd.begin();
  403. while( starter != nfd.end() )
  404. {
  405. if( canonicalClass(*starter)!=0 )
  406. {
  407. nfc += *starter;
  408. ++starter;
  409. }else{
  410. auto next_starter = std::find_if(starter+1, nfd.end(), [](char32_t c){return canonicalClass(c)==0;} );
  411. combine(nfc, starter, next_starter);
  412. starter = next_starter;
  413. }
  414. }
  415. return nfc;
  416. }
  417. IsNFC isNFC_quick_check(string_view s)
  418. {
  419. const char* begin = s.data();
  420. const char* const end = s.data() + s.size();
  421. try
  422. {
  423. unsigned last_cc = 0;
  424. while(begin<end)
  425. {
  426. const uint32_t u = parseUtf8(begin, end);
  427. const unsigned cc = canonicalClass(u);
  428. if( (cc!=0) && (last_cc > cc) )
  429. {
  430. return IsNFC::No;
  431. }
  432. if(NFC_No.count(u)) return IsNFC::No;
  433. if(NFC_Maybe.count(u)) return IsNFC::Maybe;
  434. ++begin;
  435. last_cc = cc;
  436. }
  437. }
  438. catch(const utf_exception& e)
  439. {
  440. throw illegal_utf8(s, begin - s.data(), e.reason());
  441. }
  442. return IsNFC::Yes;
  443. }
  444. bool isNFC(string_view s)
  445. {
  446. switch( isNFC_quick_check(s) )
  447. {
  448. case IsNFC::Yes : return true;
  449. case IsNFC::No : return false;
  450. case IsNFC::Maybe:
  451. {
  452. return s == toNFC(s); // very expensive!
  453. }
  454. }
  455. throw -1; // could never happen, but compiler is too dumb to see this.
  456. }
  457. bool isUtf8(const char* begin, const char* end)
  458. try{
  459. for(; begin<end; ++begin)
  460. {
  461. (void)parseUtf8(begin, end);
  462. }
  463. return true;
  464. }catch(const illegal_utf8&)
  465. {
  466. return false;
  467. }
  468. // s is ''moved'' to the return value if possible so no copy is done here.
  469. std::string toNFC(string_view s)
  470. {
  471. if(isNFC_quick_check(s)==IsNFC::Yes)
  472. return std::string{s};
  473. return toUtf8( createNFC( fromUtf8_decompose(s) ));
  474. }
  475. // used only to initialize the NFC Compose mapping:
  476. std::map< std::pair<unsigned, unsigned>, unsigned> generate_nfc_compose()
  477. {
  478. std::map< std::pair<unsigned, unsigned>, unsigned> m;
  479. for(const auto& decomp : NFC_Decompose)
  480. {
  481. if(decomp.second.second >= 0) // skip singleton decompositions
  482. {
  483. m[ decomp.second ] = decomp.first;
  484. }
  485. }
  486. return m;
  487. }
  488. } // end of namespace pEp