A C++ wrapper for the basic C datatypes defined by the pEpEngine.
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

646 lines
13 KiB

  1. // This file is under GNU General Public License 3.0
  2. // see LICENSE.txt
  3. // converts a C++ string into NFC form
  4. #include "nfc.hh"
  5. #include <cstdint>
  6. #include <set>
  7. #include <ostream>
  8. #include <algorithm>
  9. #include "nfc_sets.hh"
  10. namespace
  11. {
  // Formats a Unicode code point as "<U+XXXX>": upper-case hex, zero-padded to at least 4 digits.
  std::string u2h(unsigned u)
  {
      char text[16] = {0};
      // one byte less than the buffer size is handed to snprintf
      snprintf(text, sizeof(text) - 1, "<U+%04X>", u);
      return std::string(text);
  }
  // Formats a single octet as "0xHH": upper-case hex, always two digits.
  std::string o2h(uint8_t octet)
  {
      char text[16] = {0};
      // one byte less than the buffer size is handed to snprintf
      snprintf(text, sizeof(text) - 1, "0x%02hhX", octet);
      return std::string(text);
  }
  // Formats a 16-bit code unit as "0xHHHH": upper-case hex, zero-padded to four digits.
  std::string hex16(char16_t u)
  {
      char text[16] = {0};
      // one byte less than the buffer size is handed to snprintf
      snprintf(text, sizeof(text) - 1, "0x%04X", u);
      return std::string(text);
  }
  // Abstract base class of all UTF decoding/encoding errors raised in this file.
  // The constructor argument is stored twice: truncated to 8 bits (octet)
  // and unmodified as 16-bit quantity (value).
  class utf_exception
  {
  public:
      utf_exception(uint16_t u)
      : octet(static_cast<uint8_t>(u))
      , value(u)
      {}

      virtual ~utf_exception() = default;

      // human-readable description of the error
      virtual std::string reason() const = 0;

      uint8_t  octet;  // low 8 bits of the constructor argument
      uint16_t value;  // the constructor argument as given
  };
  42. class cont_without_start : public utf_exception
  43. {
  44. public:
  45. cont_without_start(uint8_t u) : utf_exception(u) {}
  46. std::string reason() const override { return "Continuation octet " + o2h(octet) + " without start octet"; }
  47. };
  48. class overlong_sequence : public utf_exception
  49. {
  50. public:
  51. overlong_sequence(uint8_t octet, unsigned u) : utf_exception(octet), unicode(u) {}
  52. std::string reason() const override { return "Overlong sequence for " + u2h(unicode); }
  53. unsigned unicode;
  54. };
  55. class unexpected_end : public utf_exception
  56. {
  57. public:
  58. unexpected_end(uint8_t u) : utf_exception(u) {}
  59. std::string reason() const override { return "Unexpected end of string"; }
  60. };
  61. class surrogate : public utf_exception
  62. {
  63. public:
  64. surrogate(uint8_t u, unsigned s) : utf_exception(u), surr(s) {}
  65. std::string reason() const override { return "UTF-8-encoded UTF-16 surrogate " + u2h(surr) + " detected"; }
  66. private:
  67. unsigned surr;
  68. };
  69. class no_unicode : public utf_exception
  70. {
  71. public:
  72. explicit no_unicode(uint8_t _octet) : utf_exception(_octet) {}
  73. std::string reason() const override { return "Octet " + o2h(octet) + " is illegal in UTF-8"; }
  74. };
  75. class too_big : public utf_exception
  76. {
  77. public:
  78. explicit too_big(uint8_t _octet, unsigned u) : utf_exception(_octet), unicode(u) {}
  79. std::string reason() const override { return "Value " + u2h(unicode) + " is too big for Unicode"; }
  80. unsigned unicode;
  81. };
  82. class unexpected_surrogate : public utf_exception
  83. {
  84. public:
  85. explicit unexpected_surrogate(char16_t c) : utf_exception(c) {}
  86. std::string reason() const override { return "Unexpected surogate " + hex16(value); }
  87. };
  88. class missing_low_surrogate : public utf_exception
  89. {
  90. public:
  91. explicit missing_low_surrogate(char16_t c, char16_t _surr) : utf_exception(c), surr(_surr) {}
  92. std::string reason() const override { return "Non-low surrogate value " + hex16(value) + " is unexpected after high surogate " + hex16(surr); }
  93. private:
  94. char16_t surr;
  95. };
  96. std::string escape(pEp::string_view s)
  97. {
  98. std::string ret; ret.reserve(s.size() + 16 );
  99. for(char c : s)
  100. {
  101. const uint8_t u = c;
  102. if(u>=32 && u<=126)
  103. {
  104. ret += c;
  105. }else{
  106. char buf[16];
  107. snprintf(buf,15, "«%02x»", u );
  108. ret += buf;
  109. }
  110. }
  111. return ret;
  112. }
  113. std::string escape(pEp::u16string_view s)
  114. {
  115. std::string ret; ret.reserve(s.size() + 16 );
  116. for(char16_t c : s)
  117. {
  118. if(c>=32 && c<=126)
  119. {
  120. ret += char(c);
  121. }else{
  122. char buf[16];
  123. snprintf(buf,15, "«%04x»", c );
  124. ret += buf;
  125. }
  126. }
  127. return ret;
  128. }
  129. // returns the "CanonicalCombinincClass" of the given Unicode codpoint u
  130. unsigned canonicalClass(unsigned u)
  131. {
  132. const auto q = NFC_CombiningClass.find(u);
  133. if(q==NFC_CombiningClass.end())
  134. {
  135. return 0; // not found in map.
  136. }else{
  137. return q->second;
  138. }
  139. }
  140. std::pair<int,int> decompose(unsigned u)
  141. {
  142. const auto q = NFC_Decompose.find(u);
  143. if(q==NFC_Decompose.end())
  144. {
  145. return std::make_pair(-1, -1);
  146. }else{
  147. return q->second;
  148. }
  149. }
  150. std::u32string decompose_full(unsigned u)
  151. {
  152. const std::pair<int,int> d = decompose(u);
  153. if(d.first<0)
  154. {
  155. return std::u32string( 1, char32_t(u) );
  156. }else{
  157. if(d.second<0)
  158. {
  159. return decompose_full(d.first);
  160. }
  161. }
  162. return decompose_full(d.first) + decompose_full(d.second);
  163. }
  164. // according to Unicode Standard, clause D108:
  165. bool isReorderablePair(unsigned a, unsigned b)
  166. {
  167. const unsigned cca = canonicalClass(a);
  168. const unsigned ccb = canonicalClass(b);
  169. return (cca > ccb) && (ccb>0);
  170. }
  171. // Unicode standard requires bubble sort, for stability reasons?
  172. void canonicalOrdering(std::u32string& us)
  173. {
  174. if(us.size()<2)
  175. return;
  176. for(unsigned n=us.size(); n>1; --n)
  177. for(unsigned i=0; i<n-1; ++i)
  178. {
  179. char32_t& a = us[i];
  180. char32_t& b = us[i+1];
  181. if( isReorderablePair(a,b) )
  182. {
  183. std::swap(a,b);
  184. }
  185. }
  186. }
  187. } // end of anonymous namespace
  188. namespace pEp {
  189. std::string escape_utf16(u16string_view s)
  190. {
  191. return escape(s);
  192. }
  193. std::ostream& operator<<(std::ostream& o, IsNFC is_nfc)
  194. {
  195. switch(is_nfc)
  196. {
  197. case IsNFC::No : return o << "No";
  198. case IsNFC::Maybe : return o << "Maybe";
  199. case IsNFC::Yes : return o << "Yes";
  200. }
  201. throw std::logic_error("Unknown value of IsNFC");
  202. }
  203. uint32_t parseUtf8(const char*& c, const char* end)
  204. {
  205. while(c<end)
  206. {
  207. const uint8_t u = uint8_t(*c);
  208. if (u<=0x7f)
  209. {
  210. return u;
  211. } else if (u<=0xBF)
  212. {
  213. throw cont_without_start(u);
  214. } else if (u<=0xC1) // 0xC0, 0xC1 would form "overlong sequences" and are therefore always illegal in UTF-8
  215. {
  216. throw no_unicode(u);
  217. } else if (u<=0xDF) // 2 octet sequence
  218. {
  219. ++c;
  220. if(c==end) throw unexpected_end(u);
  221. const uint8_t uu = uint8_t(*c);
  222. if((uu & 0xC0) != 0x80)
  223. {
  224. throw unexpected_end(uu);
  225. }
  226. return ((u & 0x1F) << 6) + (uu & 0x3F);
  227. } else if (u<=0xEF) // 3 octet sequence
  228. {
  229. ++c;
  230. if(c==end) throw unexpected_end(u);
  231. const uint8_t uu = uint8_t(*c);
  232. if((uu & 0xC0) != 0x80)
  233. {
  234. throw unexpected_end(uu);
  235. }
  236. ++c;
  237. if(c==end) throw unexpected_end(uu);
  238. const uint8_t uuu = uint8_t(*c);
  239. if((uuu & 0xC0) != 0x80)
  240. {
  241. throw unexpected_end(uuu);
  242. }
  243. const uint32_t ret = ((u & 0xF) << 12) + ((uu & 0x3F)<<6) + (uuu & 0x3F);
  244. if(ret<0x800) throw overlong_sequence(u, ret);
  245. if(ret>=0xD800 && ret<=0xDFFF) throw surrogate(u, ret);
  246. return ret;
  247. } else if (u<=0xF4) // 4 octet sequence
  248. {
  249. ++c;
  250. if(c==end) throw unexpected_end(u);
  251. const uint8_t uu = uint8_t(*c);
  252. if((uu & 0xC0) != 0x80)
  253. {
  254. throw unexpected_end(uu);
  255. }
  256. ++c;
  257. if(c==end) throw unexpected_end(uu);
  258. const uint8_t uuu = uint8_t(*c);
  259. if((uuu & 0xC0) != 0x80)
  260. {
  261. throw unexpected_end(uuu);
  262. }
  263. ++c;
  264. if(c==end) throw unexpected_end(uuu);
  265. const uint8_t uuuu = uint8_t(*c);
  266. if((uuuu & 0xC0) != 0x80)
  267. {
  268. throw unexpected_end(uuuu);
  269. }
  270. const uint32_t ret = ((u & 0xF) << 18) + ((uu & 0x3F)<<12) + ((uuu & 0x3F)<<6) + (uuuu & 0x3F);
  271. if(ret<0x10000) throw overlong_sequence(u, ret);
  272. if(ret>0x10FFFF) throw too_big(u, ret);
  273. return ret;
  274. } else
  275. {
  276. throw no_unicode(u);
  277. }
  278. }
  279. throw unexpected_end(-1);
  280. }
  281. uint32_t parseUtf16(const char16_t*& c, const char16_t* end)
  282. {
  283. while(c<end)
  284. {
  285. const char16_t u = *c;
  286. if(u<0xD800 || u>=0xE000)
  287. {
  288. return u;
  289. }else{
  290. if(u>=0xDC00)
  291. {
  292. throw unexpected_surrogate(u);
  293. }
  294. ++c;
  295. if(c==end) throw unexpected_end(u);
  296. const uint16_t low = *c;
  297. if(low < 0xDC00 || low > 0xDFFF)
  298. {
  299. throw missing_low_surrogate(low, u);
  300. }
  301. return (u-0xD800) * 1024 + (low-0xDC00) + 0x10000;
  302. }
  303. }
  304. throw unexpected_end(-1);
  305. }
  306. template<class CharT>
  307. uint32_t parseUtf(const CharT*& c, const CharT* end);
  308. template<>
  309. inline
  310. uint32_t parseUtf<char>(const char*& c, const char* end)
  311. {
  312. return parseUtf8(c,end);
  313. }
  314. template<>
  315. inline
  316. uint32_t parseUtf<char16_t>(const char16_t*& c, const char16_t* end)
  317. {
  318. return parseUtf16(c,end);
  319. }
  320. template<>
  321. void toUtf<char>(const char32_t c, std::string& ret)
  322. {
  323. if(c<=0x7F)
  324. {
  325. ret += char(c);
  326. }else if(c<=0x7FF)
  327. {
  328. ret += char( 0xC0 + (c>>6) );
  329. ret += char( 0x80 + (c & 63));
  330. }else if(c<=0xFFFF)
  331. {
  332. ret += char( 0xE0 + (c>>12) );
  333. ret += char( 0x80 + ((c>>6) & 63));
  334. ret += char( 0x80 + (c & 63));
  335. }else if(c<=0x10FFFF)
  336. {
  337. ret += char( 0xF0 + (c>>18) );
  338. ret += char( 0x80 + ((c>>12) & 63));
  339. ret += char( 0x80 + ((c>>6) & 63));
  340. ret += char( 0x80 + (c & 63));
  341. }else{
  342. throw too_big(0, c);
  343. }
  344. }
  345. template<>
  346. void toUtf<char16_t>(const char32_t c, std::u16string& ret)
  347. {
  348. if(c <= 0xFFFF)
  349. {
  350. if(c>=0xD800 && c<=0xDFFF)
  351. {
  352. throw unexpected_surrogate(c);
  353. }else{
  354. ret += char16_t(c);
  355. }
  356. }else{ // surrogate pair
  357. if(c>0x10FFFF)
  358. {
  359. throw too_big(0, c);
  360. }else{
  361. const uint32_t c_reduced = c - 0x10000;
  362. ret += char16_t(0xD800 + (c_reduced >> 10)); // High Surrogate
  363. ret += char16_t(0xDC00 + (c_reduced & 0x3FF)); // Low Surrogate
  364. }
  365. }
  366. }
  // Encodes a whole UTF-32 string, code point by code point,
  // via the single-character toUtf<CharT>().
  template<class CharT>
  std::basic_string<CharT> toUtf(const std::u32string& u32)
  {
      std::basic_string<CharT> encoded;
      for(auto it = u32.begin(); it != u32.end(); ++it)
      {
          toUtf<CharT>(*it, encoded);
      }
      return encoded;
  }
  377. illegal_utf::illegal_utf( string_view s, unsigned position, const std::string& reason)
  378. : std::runtime_error( "Illegal UTF-8 string \"" + escape(s) + "\" at position " + std::to_string(position) + ": " + reason )
  379. {}
  380. illegal_utf::illegal_utf( u16string_view s, unsigned position, const std::string& reason)
  381. : std::runtime_error( "Illegal UTF-16 string \"" + escape(s) + "\" at position " + std::to_string(position) + ": " + reason )
  382. {}
  383. illegal_utf::illegal_utf( const std::string& msg )
  384. : std::runtime_error( msg )
  385. {}
  386. void assert_utf8(string_view s)
  387. {
  388. const char* begin = s.data();
  389. const char* const end = s.data() + s.size();
  390. try
  391. {
  392. while(begin<end)
  393. {
  394. parseUtf8(begin, end); // ignore the output
  395. ++begin;
  396. }
  397. }
  398. catch(const utf_exception& e)
  399. {
  400. throw illegal_utf(s, begin - s.data(), e.reason());
  401. }
  402. }
  403. // creates a NFD string from s
  404. template<class CharT>
  405. std::u32string fromUtf_decompose(basic_string_view<CharT> s)
  406. {
  407. std::u32string u32s;
  408. u32s.reserve( static_cast<std::size_t>(s.size()*1.25) );
  409. const CharT* begin = s.data();
  410. const CharT* end = s.data() + s.size();
  411. for(; begin<end; ++begin)
  412. {
  413. unsigned u = parseUtf(begin, end);
  414. u32s += decompose_full(u);
  415. }
  416. canonicalOrdering(u32s); // works inplace.
  417. return u32s;
  418. }
  // Tells whether the character at C is blocked from the starter at L
  // (Unicode Standard, clause D115): C is blocked if some character B between
  // L and C is a starter (ccc(B) == 0) or has ccc(B) >= ccc(C).
  //
  // Elements with the value char32_t(-1) are skipped: combine() overwrites a
  // character with this marker once it has been merged into the starter, so
  // the character is logically deleted. The marker has no entry in the
  // combining class table; without the skip it would count as a starter and
  // block every later combining mark, e.g. <e, U+0302, U+0301> would end as
  // <U+00EA, U+0301> instead of U+1EBF.
  //
  // For canonically ordered input, ">=" gives the same answer as "==".
  template<class Iter>
  bool blocked(Iter L, Iter C)
  {
      const char32_t consumed = char32_t(-1);
      const unsigned class_c = canonicalClass(*C);

      Iter B = L; ++B;
      for(;B!=C;++B)
      {
          if(*B == consumed)
              continue; // already composed into the starter

          const unsigned class_b = canonicalClass(*B);
          if(class_b==0 || class_b>=class_c)
              return true;
      }
      return false;
  }
  430. template<class Iter>
  431. void combine(std::u32string& nfc, Iter starter, Iter next_starter)
  432. {
  433. Iter c = starter; ++c;
  434. for(;c!=next_starter; ++c)
  435. {
  436. if(!blocked(starter, c))
  437. {
  438. const unsigned starter_u = *starter;
  439. const unsigned c_u = *c;
  440. auto q = NFC_Compose.find( std::make_pair(starter_u,c_u) );
  441. if(q!=NFC_Compose.end())
  442. {
  443. *starter = q->second;
  444. *c = -1;
  445. }
  446. }
  447. }
  448. // now add the remaining/changed characters to the NFC string:
  449. for(Iter c = starter; c!=next_starter; ++c)
  450. {
  451. if( int(*c) >= 0)
  452. {
  453. nfc += *c;
  454. }
  455. }
  456. }
  457. // the nfd string is changed during composing process. So it works on a copy or call with std::move().
  458. std::u32string createNFC(std::u32string nfd)
  459. {
  460. if(nfd.size()<=1)
  461. return nfd;
  462. std::u32string nfc;
  463. nfc.reserve(nfd.size());
  464. auto starter = nfd.begin();
  465. while( starter != nfd.end() )
  466. {
  467. if( canonicalClass(*starter)!=0 )
  468. {
  469. nfc += *starter;
  470. ++starter;
  471. }else{
  472. auto next_starter = std::find_if(starter+1, nfd.end(), [](char32_t c){return canonicalClass(c)==0;} );
  473. combine(nfc, starter, next_starter);
  474. starter = next_starter;
  475. }
  476. }
  477. return nfc;
  478. }
  479. template<class CharT>
  480. IsNFC isNFC_quick_check(basic_string_view<CharT> s)
  481. {
  482. const CharT* begin = s.data();
  483. const CharT* const end = s.data() + s.size();
  484. try
  485. {
  486. unsigned last_cc = 0;
  487. while(begin<end)
  488. {
  489. const uint32_t u = parseUtf(begin, end);
  490. const unsigned cc = canonicalClass(u);
  491. if( (cc!=0) && (last_cc > cc) )
  492. {
  493. return IsNFC::No;
  494. }
  495. if(NFC_No.count(u)) return IsNFC::No;
  496. if(NFC_Maybe.count(u)) return IsNFC::Maybe;
  497. ++begin;
  498. last_cc = cc;
  499. }
  500. }
  501. catch(const utf_exception& e)
  502. {
  503. throw illegal_utf(s, begin - s.data(), e.reason());
  504. }
  505. return IsNFC::Yes;
  506. }
  507. template<class CharT>
  508. bool isNFC(basic_string_view<CharT> s)
  509. {
  510. switch( isNFC_quick_check(s) )
  511. {
  512. case IsNFC::Yes : return true;
  513. case IsNFC::No : return false;
  514. case IsNFC::Maybe:
  515. {
  516. return s == toNFC(s); // very expensive!
  517. }
  518. }
  519. throw -1; // could never happen, but compiler is too dumb to see this.
  520. }
  521. template bool isNFC<char>(string_view);
  522. template bool isNFC<char16_t>(u16string_view);
  523. // should be unnecessary, but... well...
  524. template std::string toNFC<char>(string_view);
  525. template std::u16string toNFC<char16_t>(u16string_view);
  526. bool isUtf8(const char* begin, const char* end)
  527. try{
  528. for(; begin<end; ++begin)
  529. {
  530. (void)parseUtf8(begin, end);
  531. }
  532. return true;
  533. }catch(const illegal_utf&)
  534. {
  535. return false;
  536. }
  537. // s is ''moved'' to the return value if possible so no copy is done here.
  538. template<class CharT>
  539. std::basic_string<CharT> toNFC(basic_string_view<CharT> s)
  540. {
  541. if(isNFC_quick_check(s)==IsNFC::Yes)
  542. return std::basic_string<CharT>{s};
  543. return toUtf<CharT>( createNFC( fromUtf_decompose(s) ));
  544. }
  545. // used only to initialize the NFC Compose mapping:
  546. std::map< std::pair<unsigned, unsigned>, unsigned> generate_nfc_compose()
  547. {
  548. std::map< std::pair<unsigned, unsigned>, unsigned> m;
  549. for(const auto& decomp : NFC_Decompose)
  550. {
  551. if(decomp.second.second >= 0) // skip singleton decompositions
  552. {
  553. m[ decomp.second ] = decomp.first;
  554. }
  555. }
  556. return m;
  557. }
  558. } // end of namespace pEp