A C++ wrapper for the basic C datatypes defined by the pEpEngine.
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

765 lines
16 KiB

  1. // This file is under GNU General Public License 3.0
  2. // see LICENSE.txt
  3. // converts a C++ string into NFC form
#include "nfc.hh"

#include <algorithm>
#include <cstdint>
#include <cstdlib>
#include <new>
#include <ostream>
#include <set>

#include <pEp/pEp_string.h>

#include "nfc_sets.hh"
  11. namespace
  12. {
  13. // unicode to hex string
  14. std::string u2h(unsigned u)
  15. {
  16. char buf[16] = {0};
  17. snprintf(buf, 15, "<U+%04X>", u );
  18. return buf;
  19. }
  20. // octet to hex string
  21. std::string o2h(uint8_t octet)
  22. {
  23. char buf[16] = {0};
  24. snprintf(buf, 15, "0x%02hhX", octet);
  25. return buf;
  26. }
  27. // hex string of a 16-bit value
  28. std::string hex16(char16_t u)
  29. {
  30. char buf[16] = {0};
  31. snprintf(buf, 15, "0x%04X", u);
  32. return buf;
  33. }
  34. class utf_exception
  35. {
  36. public:
  37. utf_exception(uint16_t u) : octet(u), value(u) {}
  38. virtual ~utf_exception() = default;
  39. virtual std::string reason() const = 0;
  40. uint8_t octet;
  41. uint16_t value;
  42. };
  43. class cont_without_start : public utf_exception
  44. {
  45. public:
  46. cont_without_start(uint8_t u) : utf_exception(u) {}
  47. std::string reason() const override { return "Continuation octet " + o2h(octet) + " without start octet"; }
  48. };
  49. class overlong_sequence : public utf_exception
  50. {
  51. public:
  52. overlong_sequence(uint8_t octet, unsigned u) : utf_exception(octet), unicode(u) {}
  53. std::string reason() const override { return "Overlong sequence for " + u2h(unicode); }
  54. unsigned unicode;
  55. };
  56. class unexpected_end : public utf_exception
  57. {
  58. public:
  59. unexpected_end(uint8_t u) : utf_exception(u) {}
  60. std::string reason() const override { return "Unexpected end of string"; }
  61. };
  62. class surrogate : public utf_exception
  63. {
  64. public:
  65. surrogate(uint8_t u, unsigned s) : utf_exception(u), surr(s) {}
  66. std::string reason() const override { return "UTF-8-encoded UTF-16 surrogate " + u2h(surr) + " detected"; }
  67. private:
  68. unsigned surr;
  69. };
  70. class no_unicode : public utf_exception
  71. {
  72. public:
  73. explicit no_unicode(uint8_t _octet) : utf_exception(_octet) {}
  74. std::string reason() const override { return "Octet " + o2h(octet) + " is illegal in UTF-8"; }
  75. };
  76. class too_big : public utf_exception
  77. {
  78. public:
  79. explicit too_big(uint8_t _octet, unsigned u) : utf_exception(_octet), unicode(u) {}
  80. std::string reason() const override { return "Value " + u2h(unicode) + " is too big for Unicode"; }
  81. unsigned unicode;
  82. };
  83. class unexpected_surrogate : public utf_exception
  84. {
  85. public:
  86. explicit unexpected_surrogate(char16_t c) : utf_exception(c) {}
  87. std::string reason() const override { return "Unexpected surogate " + hex16(value); }
  88. };
  89. class missing_low_surrogate : public utf_exception
  90. {
  91. public:
  92. explicit missing_low_surrogate(char16_t c, char16_t _surr) : utf_exception(c), surr(_surr) {}
  93. std::string reason() const override { return "Non-low surrogate value " + hex16(value) + " is unexpected after high surogate " + hex16(surr); }
  94. private:
  95. char16_t surr;
  96. };
  97. std::string escape(std::string_view s)
  98. {
  99. std::string ret; ret.reserve(s.size() + 16 );
  100. for(char c : s)
  101. {
  102. const uint8_t u = c;
  103. if(u>=32 && u<=126)
  104. {
  105. ret += c;
  106. }else{
  107. char buf[16];
  108. snprintf(buf,15, "«%02x»", u );
  109. ret += buf;
  110. }
  111. }
  112. return ret;
  113. }
  114. std::string escape(std::u16string_view s)
  115. {
  116. std::string ret; ret.reserve(s.size() + 16 );
  117. for(char16_t c : s)
  118. {
  119. if(c>=32 && c<=126)
  120. {
  121. ret += char(c);
  122. }else{
  123. char buf[16];
  124. snprintf(buf,15, "«%04x»", c );
  125. ret += buf;
  126. }
  127. }
  128. return ret;
  129. }
  130. // returns the "CanonicalCombinincClass" of the given Unicode codpoint u
  131. unsigned canonicalClass(unsigned u)
  132. {
  133. const auto q = NFC_CombiningClass.find(u);
  134. if(q==NFC_CombiningClass.end())
  135. {
  136. return 0; // not found in map.
  137. }else{
  138. return q->second;
  139. }
  140. }
  141. std::pair<int,int> decompose(unsigned u)
  142. {
  143. const auto q = NFC_Decompose.find(u);
  144. if(q==NFC_Decompose.end())
  145. {
  146. return std::make_pair(-1, -1);
  147. }else{
  148. return q->second;
  149. }
  150. }
  151. std::u32string decompose_full(unsigned u)
  152. {
  153. const std::pair<int,int> d = decompose(u);
  154. if(d.first<0)
  155. {
  156. return std::u32string( 1, char32_t(u) );
  157. }else{
  158. if(d.second<0)
  159. {
  160. return decompose_full(d.first);
  161. }
  162. }
  163. return decompose_full(d.first) + decompose_full(d.second);
  164. }
  165. // according to Unicode Standard, clause D108:
  166. bool isReorderablePair(unsigned a, unsigned b)
  167. {
  168. const unsigned cca = canonicalClass(a);
  169. const unsigned ccb = canonicalClass(b);
  170. return (cca > ccb) && (ccb>0);
  171. }
  172. // Unicode standard requires bubble sort, for stability reasons?
  173. void canonicalOrdering(std::u32string& us)
  174. {
  175. if(us.size()<2)
  176. return;
  177. for(unsigned n=us.size(); n>1; --n)
  178. for(unsigned i=0; i<n-1; ++i)
  179. {
  180. char32_t& a = us[i];
  181. char32_t& b = us[i+1];
  182. if( isReorderablePair(a,b) )
  183. {
  184. std::swap(a,b);
  185. }
  186. }
  187. }
  188. } // end of anonymous namespace
  189. namespace pEp {
  190. std::string escape_utf16(std::u16string_view s)
  191. {
  192. return escape(s);
  193. }
  194. std::ostream& operator<<(std::ostream& o, IsNFC is_nfc)
  195. {
  196. switch(is_nfc)
  197. {
  198. case IsNFC::No : return o << "No";
  199. case IsNFC::Maybe : return o << "Maybe";
  200. case IsNFC::Yes : return o << "Yes";
  201. }
  202. throw std::logic_error("Unknown value of IsNFC");
  203. }
  204. uint32_t parseUtf8(const char*& c, const char* end)
  205. {
  206. while(c<end)
  207. {
  208. const uint8_t u = uint8_t(*c);
  209. if (u<=0x7f)
  210. {
  211. return u;
  212. } else if (u<=0xBF)
  213. {
  214. throw cont_without_start(u);
  215. } else if (u<=0xC1) // 0xC0, 0xC1 would form "overlong sequences" and are therefore always illegal in UTF-8
  216. {
  217. throw no_unicode(u);
  218. } else if (u<=0xDF) // 2 octet sequence
  219. {
  220. ++c;
  221. if(c==end) throw unexpected_end(u);
  222. const uint8_t uu = uint8_t(*c);
  223. if((uu & 0xC0) != 0x80)
  224. {
  225. throw unexpected_end(uu);
  226. }
  227. return ((u & 0x1F) << 6) + (uu & 0x3F);
  228. } else if (u<=0xEF) // 3 octet sequence
  229. {
  230. ++c;
  231. if(c==end) throw unexpected_end(u);
  232. const uint8_t uu = uint8_t(*c);
  233. if((uu & 0xC0) != 0x80)
  234. {
  235. throw unexpected_end(uu);
  236. }
  237. ++c;
  238. if(c==end) throw unexpected_end(uu);
  239. const uint8_t uuu = uint8_t(*c);
  240. if((uuu & 0xC0) != 0x80)
  241. {
  242. throw unexpected_end(uuu);
  243. }
  244. const uint32_t ret = ((u & 0xF) << 12) + ((uu & 0x3F)<<6) + (uuu & 0x3F);
  245. if(ret<0x800) throw overlong_sequence(u, ret);
  246. if(ret>=0xD800 && ret<=0xDFFF) throw surrogate(u, ret);
  247. return ret;
  248. } else if (u<=0xF4) // 4 octet sequence
  249. {
  250. ++c;
  251. if(c==end) throw unexpected_end(u);
  252. const uint8_t uu = uint8_t(*c);
  253. if((uu & 0xC0) != 0x80)
  254. {
  255. throw unexpected_end(uu);
  256. }
  257. ++c;
  258. if(c==end) throw unexpected_end(uu);
  259. const uint8_t uuu = uint8_t(*c);
  260. if((uuu & 0xC0) != 0x80)
  261. {
  262. throw unexpected_end(uuu);
  263. }
  264. ++c;
  265. if(c==end) throw unexpected_end(uuu);
  266. const uint8_t uuuu = uint8_t(*c);
  267. if((uuuu & 0xC0) != 0x80)
  268. {
  269. throw unexpected_end(uuuu);
  270. }
  271. const uint32_t ret = ((u & 0xF) << 18) + ((uu & 0x3F)<<12) + ((uuu & 0x3F)<<6) + (uuuu & 0x3F);
  272. if(ret<0x10000) throw overlong_sequence(u, ret);
  273. if(ret>0x10FFFF) throw too_big(u, ret);
  274. return ret;
  275. } else
  276. {
  277. throw no_unicode(u);
  278. }
  279. }
  280. throw unexpected_end(-1);
  281. }
  282. uint32_t parseUtf16(const char16_t*& c, const char16_t* end)
  283. {
  284. while(c<end)
  285. {
  286. const char16_t u = *c;
  287. if(u<0xD800 || u>=0xE000)
  288. {
  289. return u;
  290. }else{
  291. if(u>=0xDC00)
  292. {
  293. throw unexpected_surrogate(u);
  294. }
  295. ++c;
  296. if(c==end) throw unexpected_end(u);
  297. const uint16_t low = *c;
  298. if(low < 0xDC00 || low > 0xDFFF)
  299. {
  300. throw missing_low_surrogate(low, u);
  301. }
  302. return (u-0xD800) * 1024 + (low-0xDC00) + 0x10000;
  303. }
  304. }
  305. throw unexpected_end(-1);
  306. }
  307. template<>
  308. uint32_t UTF<char>::parse(const char*& c, const char* end)
  309. {
  310. return parseUtf8(c,end);
  311. }
  312. template<>
  313. uint32_t UTF<char16_t>::parse(const char16_t*& c, const char16_t* end)
  314. {
  315. return parseUtf16(c,end);
  316. }
  317. template<>
  318. template<class OutIter>
  319. void UTF<char>::generate(const char32_t c, OutIter& out)
  320. {
  321. if(c<=0x7F)
  322. {
  323. *out++ = char(c);
  324. }else if(c<=0x7FF)
  325. {
  326. *out++ = char( 0xC0 + (c>>6) );
  327. *out++ = char( 0x80 + (c & 63));
  328. }else if(c<=0xFFFF)
  329. {
  330. if(c>=0xD800 && c<=0xDFFF)
  331. {
  332. throw unexpected_surrogate(c);
  333. }
  334. *out++ = char( 0xE0 + (c>>12) );
  335. *out++ = char( 0x80 + ((c>>6) & 63));
  336. *out++ = char( 0x80 + (c & 63));
  337. }else if(c<=0x10FFFF)
  338. {
  339. *out++ = char( 0xF0 + (c>>18) );
  340. *out++ = char( 0x80 + ((c>>12) & 63));
  341. *out++ = char( 0x80 + ((c>>6) & 63));
  342. *out++ = char( 0x80 + (c & 63));
  343. }else{
  344. throw too_big(0, c);
  345. }
  346. }
  347. template<>
  348. template<class OutIter>
  349. void UTF<char16_t>::generate(const char32_t c, OutIter& out)
  350. {
  351. if(c <= 0xFFFF)
  352. {
  353. if(c>=0xD800 && c<=0xDFFF)
  354. {
  355. throw unexpected_surrogate(c);
  356. }else{
  357. *out++ = char16_t(c);
  358. }
  359. }else{ // surrogate pair
  360. if(c>0x10FFFF)
  361. {
  362. throw too_big(0, c);
  363. }else{
  364. const uint32_t c_reduced = c - 0x10000;
  365. *out++ = char16_t(0xD800 + (c_reduced >> 10)); // High Surrogate
  366. *out++ = char16_t(0xDC00 + (c_reduced & 0x3FF)); // Low Surrogate
  367. }
  368. }
  369. }
  370. template<class CharT>
  371. std::basic_string<CharT> UTF<CharT>::generate(const std::u32string& u32)
  372. {
  373. std::basic_string<CharT> ret;
  374. auto out = std::back_inserter(ret);
  375. for(char32_t c : u32)
  376. {
  377. generate(c, out);
  378. }
  379. return ret;
  380. }
  381. illegal_utf::illegal_utf( std::string_view s, unsigned position, const std::string& reason)
  382. : std::runtime_error( "Illegal UTF-8 string \"" + escape(s) + "\" at position " + std::to_string(position) + ": " + reason )
  383. {}
  384. illegal_utf::illegal_utf( std::u16string_view s, unsigned position, const std::string& reason)
  385. : std::runtime_error( "Illegal UTF-16 string \"" + escape(s) + "\" at position " + std::to_string(position) + ": " + reason )
  386. {}
  387. illegal_utf::illegal_utf( const std::string& msg )
  388. : std::runtime_error( msg )
  389. {}
  390. void assert_utf8(std::string_view s)
  391. {
  392. const char* begin = s.data();
  393. const char* const end = s.data() + s.size();
  394. try
  395. {
  396. while(begin<end)
  397. {
  398. UTF8::parse(begin, end); // ignore the output
  399. ++begin;
  400. }
  401. }
  402. catch(const utf_exception& e)
  403. {
  404. throw illegal_utf(s, begin - s.data(), e.reason());
  405. }
  406. }
  407. // creates a NFD string from s
  408. template<class CharT>
  409. std::u32string UTF<CharT>::fromUtf_decompose(std::basic_string_view<CharT> s)
  410. {
  411. std::u32string u32s;
  412. u32s.reserve( static_cast<std::size_t>(s.size()*1.25) );
  413. const CharT* begin = s.data();
  414. const CharT* end = s.data() + s.size();
  415. for(; begin<end; ++begin)
  416. {
  417. unsigned u = parse(begin, end);
  418. u32s += decompose_full(u);
  419. }
  420. canonicalOrdering(u32s); // works inplace.
  421. return u32s;
  422. }
  423. template<class Iter>
  424. bool blocked(Iter L, Iter C)
  425. {
  426. Iter B = L; ++B;
  427. for(;B!=C;++B)
  428. {
  429. if(canonicalClass(*B)==0 || canonicalClass(*B)==canonicalClass(*C))
  430. return true;
  431. }
  432. return false;
  433. }
  434. template<class Iter>
  435. void combine(std::u32string& nfc, Iter starter, Iter next_starter)
  436. {
  437. Iter c = starter; ++c;
  438. for(;c!=next_starter; ++c)
  439. {
  440. if(!blocked(starter, c))
  441. {
  442. const unsigned starter_u = *starter;
  443. const unsigned c_u = *c;
  444. auto q = NFC_Compose.find( std::make_pair(starter_u,c_u) );
  445. if(q!=NFC_Compose.end())
  446. {
  447. *starter = q->second;
  448. *c = -1;
  449. }
  450. }
  451. }
  452. // now add the remaining/changed characters to the NFC string:
  453. for(Iter c = starter; c!=next_starter; ++c)
  454. {
  455. if( int(*c) >= 0)
  456. {
  457. nfc += *c;
  458. }
  459. }
  460. }
  461. // the nfd string is changed during composing process. So it works on a copy or call with std::move().
  462. std::u32string createNFC(std::u32string nfd)
  463. {
  464. if(nfd.size()<=1)
  465. return nfd;
  466. std::u32string nfc;
  467. nfc.reserve(nfd.size());
  468. auto starter = nfd.begin();
  469. while( starter != nfd.end() )
  470. {
  471. if( canonicalClass(*starter)!=0 )
  472. {
  473. nfc += *starter;
  474. ++starter;
  475. }else{
  476. auto next_starter = std::find_if(starter+1, nfd.end(), [](char32_t c){return canonicalClass(c)==0;} );
  477. combine(nfc, starter, next_starter);
  478. starter = next_starter;
  479. }
  480. }
  481. return nfc;
  482. }
  483. template<class CharT>
  484. IsNFC UTF<CharT>::isNFC_quick_check(std::basic_string_view<CharT> s)
  485. {
  486. const CharT* begin = s.data();
  487. const CharT* const end = s.data() + s.size();
  488. try
  489. {
  490. unsigned last_cc = 0;
  491. while(begin<end)
  492. {
  493. const uint32_t u = parse(begin, end);
  494. const unsigned cc = canonicalClass(u);
  495. if( (cc!=0) && (last_cc > cc) )
  496. {
  497. return IsNFC::No;
  498. }
  499. if(NFC_No.count(u)) return IsNFC::No;
  500. if(NFC_Maybe.count(u)) return IsNFC::Maybe;
  501. ++begin;
  502. last_cc = cc;
  503. }
  504. }
  505. catch(const utf_exception& e)
  506. {
  507. throw illegal_utf(s, begin - s.data(), e.reason());
  508. }
  509. return IsNFC::Yes;
  510. }
  511. template<class CharT>
  512. bool UTF<CharT>::isNFC(std::basic_string_view<CharT> s)
  513. {
  514. switch( isNFC_quick_check(s) )
  515. {
  516. case IsNFC::Yes : return true;
  517. case IsNFC::No : return false;
  518. case IsNFC::Maybe:
  519. {
  520. return s == toNFC(s); // very expensive!
  521. }
  522. }
  523. throw -1; // could never happen, but compiler is too dumb to see this.
  524. }
  525. template<>
  526. bool UTF<char>::isUtf(const char* begin, const char* end)
  527. try{
  528. for(; begin<end; ++begin)
  529. {
  530. (void)parse(begin, end);
  531. }
  532. return true;
  533. }catch(const illegal_utf&)
  534. {
  535. return false;
  536. }
  537. // s is ''moved'' to the return value if possible so no copy is done here.
  538. template<class CharT>
  539. std::basic_string<CharT> UTF<CharT>::toNFC(std::basic_string_view<CharT> s)
  540. {
  541. if(isNFC_quick_check(s)==IsNFC::Yes)
  542. return std::basic_string<CharT>{s};
  543. return generate( createNFC( fromUtf_decompose(s) ));
  544. }
  545. template<>
  546. size_t UTF<char>::utf_length(std::u32string_view s)
  547. {
  548. size_t len = 0;
  549. for(const char32_t c : s)
  550. {
  551. if(c <= 0x7f)
  552. {
  553. len += 1;
  554. }else if(c<=0x7ff)
  555. {
  556. len += 2;
  557. }else if(c<=0xffff)
  558. {
  559. if(c>=0xD800 && c<=0xDFFF)
  560. {
  561. throw unexpected_surrogate(c);
  562. }
  563. len += 3;
  564. }else if(c<=0x10ffff)
  565. {
  566. len += 4;
  567. }else{
  568. throw too_big(0, c);
  569. }
  570. }
  571. return len;
  572. }
  573. template<>
  574. size_t UTF<char16_t>::utf_length(std::u32string_view s)
  575. {
  576. size_t len = 0;
  577. for(const char32_t c : s)
  578. {
  579. if(c <= 0xffff)
  580. {
  581. if(c>=0xD800 && c<=0xDFFF)
  582. {
  583. throw unexpected_surrogate(c);
  584. }
  585. len += 1;
  586. }else if(c<=0x10ffff)
  587. {
  588. len += 2;
  589. }else{
  590. throw too_big(0, c);
  591. }
  592. }
  593. return len;
  594. }
  595. // convenience function to avoid ::strdup(pEp::toNFC<char>(text).c_str());
  596. // and unecessary temporary std::string etc.
  597. char* strdup_NFC(std::string_view s)
  598. {
  599. if(UTF8::isNFC_quick_check(s)==IsNFC::Yes)
  600. return ::new_string(s.data(), s.size());
  601. // implement the hard way more efficient
  602. const std::u32string& u32 = createNFC( UTF8::fromUtf_decompose(s) );
  603. const size_t out_len = UTF8::utf_length(u32);
  604. char* ret = ::new_string(nullptr, out_len );
  605. char* iter{ret};
  606. for(const char32_t c : u32)
  607. {
  608. UTF8::generate(c, iter);
  609. }
  610. if(iter > ret+out_len) // should never happen. ;)
  611. {
  612. throw std::logic_error("internal error: strdup_NFC() exceeded output string size");
  613. }
  614. return ret;
  615. }
  616. pEp_identity *identity_dup_NFC(const ::pEp_identity* value)
  617. {
  618. ::pEp_identity* result = (::pEp_identity*) malloc(sizeof(::pEp_identity));
  619. if (!result)
  620. throw std::bad_alloc();
  621. memcpy(result, value, sizeof(::pEp_identity));
  622. result->address = pEp::strdup_NFC(value->address);
  623. result->fpr = pEp::strdup_NFC(value->fpr);
  624. result->user_id = pEp::strdup_NFC(value->user_id);
  625. result->username = pEp::strdup_NFC(value->username);
  626. return result;
  627. }
  628. ::identity_list* identity_list_dup_NFC(const ::identity_list* value)
  629. {
  630. ::identity_list* result = ::new_identity_list(nullptr);
  631. if (!result)
  632. throw std::bad_alloc();
  633. const ::identity_list* il = value;
  634. ::identity_list* ir = result;
  635. for (; il && il->ident; il = il->next) {
  636. ir = ::identity_list_add(ir, identity_dup_NFC(il->ident));
  637. if (!ir)
  638. throw std::bad_alloc();
  639. }
  640. return result;
  641. }
// Explicit instantiations of UTF for the two code unit types used in this
// file (char for UTF-8, char16_t for UTF-16).
template class UTF<char>;
template class UTF<char16_t>;
  644. // used only to initialize the NFC Compose mapping:
  645. std::map< std::pair<unsigned, unsigned>, unsigned> generate_nfc_compose()
  646. {
  647. std::map< std::pair<unsigned, unsigned>, unsigned> m;
  648. for(const auto& decomp : NFC_Decompose)
  649. {
  650. if(decomp.second.second >= 0) // skip singleton decompositions
  651. {
  652. m[ decomp.second ] = decomp.first;
  653. }
  654. }
  655. return m;
  656. }
  657. } // end of namespace pEp