A C++ wrapper for the basic C datatypes defined by the pEpEngine.
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

505 lines
10 KiB

  1. // This file is under GNU General Public License 3.0
  2. // see LICENSE.txt
  3. // converts a C++ string into NFC form
  4. #include "nfc.hh"
  5. #include <cstdint>
  6. #include <set>
  7. #include <ostream>
  8. #include <algorithm>
  9. #include "nfc_sets.hh"
  10. namespace
  11. {
  // Formats the value u as "<U+XXXX>": uppercase hexadecimal, zero-padded
  // to at least four digits.
  std::string u2h(unsigned u)
  {
      char text[16] = {};
      snprintf(text, sizeof(text) - 1, "<U+%04X>", u);
      return std::string(text);
  }
  // Formats a single octet as "0xNN" with two uppercase hexadecimal digits.
  std::string o2h(uint8_t octet)
  {
      char text[16] = {};
      snprintf(text, sizeof(text) - 1, "0x%02hhX", octet);
      return std::string(text);
  }
  // Abstract base of the UTF-8 error types thrown in this file.
  // It stores the octet the error was reported for; each subclass provides
  // the descriptive text through reason().
  class utf8_exception
  {
  public:
      utf8_exception(uint8_t u)
      : octet{u}
      {}

      virtual ~utf8_exception() = default;

      // Human-readable description of the error.
      virtual std::string reason() const = 0;

      // The octet handed to the constructor.
      uint8_t octet;
  };
  34. class cont_without_start : public utf8_exception
  35. {
  36. public:
  37. cont_without_start(uint8_t u) : utf8_exception(u) {}
  38. std::string reason() const override { return "Continuation octet " + o2h(octet) + " without start octet"; }
  39. };
  40. class overlong_sequence : public utf8_exception
  41. {
  42. public:
  43. overlong_sequence(uint8_t octet, unsigned u) : utf8_exception(octet), unicode(u) {}
  44. std::string reason() const override { return "Overlong sequence for " + u2h(unicode); }
  45. unsigned unicode;
  46. };
  47. class unexpected_end : public utf8_exception
  48. {
  49. public:
  50. unexpected_end(uint8_t u) : utf8_exception(u) {}
  51. std::string reason() const override { return "Unexpected end of string"; }
  52. };
  53. class surrogate : public utf8_exception
  54. {
  55. public:
  56. surrogate(uint8_t u, unsigned s) : utf8_exception(u), surr(s) {}
  57. std::string reason() const override { return "UTF-8-encoded UTF-16 surrogate " + u2h(surr) + " detected"; }
  58. private:
  59. unsigned surr;
  60. };
  61. class no_unicode : public utf8_exception
  62. {
  63. public:
  64. explicit no_unicode(uint8_t _octet) : utf8_exception(_octet) {}
  65. std::string reason() const override { return "Octet " + o2h(octet) + " is illegal in UTF-8"; }
  66. };
  67. class too_big : public utf8_exception
  68. {
  69. public:
  70. explicit too_big(uint8_t _octet, unsigned u) : utf8_exception(_octet), unicode(u) {}
  71. std::string reason() const override { return "Value " + u2h(unicode) + " is too big for Unicode"; }
  72. unsigned unicode;
  73. };
  74. std::string escape(sv s)
  75. {
  76. std::string ret; ret.reserve(s.size() + 16 );
  77. for(char c : s)
  78. {
  79. const uint8_t u = c;
  80. if(u>=32 && u<=126)
  81. {
  82. ret += c;
  83. }else{
  84. char buf[16];
  85. snprintf(buf,15, "«%02x»", u );
  86. ret += buf;
  87. }
  88. }
  89. return ret;
  90. }
  91. // returns the "CanonicalCombinincClass" of the given Unicode codpoint u
  92. unsigned canonicalClass(unsigned u)
  93. {
  94. const auto q = NFC_CombiningClass.find(u);
  95. if(q==NFC_CombiningClass.end())
  96. {
  97. return 0; // not found in map.
  98. }else{
  99. return q->second;
  100. }
  101. }
  102. std::pair<int,int> decompose(unsigned u)
  103. {
  104. const auto q = NFC_Decompose.find(u);
  105. if(q==NFC_Decompose.end())
  106. {
  107. return std::make_pair(-1, -1);
  108. }else{
  109. return q->second;
  110. }
  111. }
  112. std::u32string decompose_full(unsigned u)
  113. {
  114. const std::pair<int,int> d = decompose(u);
  115. if(d.first<0)
  116. {
  117. return std::u32string( 1, char32_t(u) );
  118. }else{
  119. if(d.second<0)
  120. {
  121. return decompose_full(d.first);
  122. }
  123. }
  124. return decompose_full(d.first) + decompose_full(d.second);
  125. }
  126. // according to Unicode Standard, clause D108:
  127. bool isReorderablePair(unsigned a, unsigned b)
  128. {
  129. const unsigned cca = canonicalClass(a);
  130. const unsigned ccb = canonicalClass(b);
  131. return (cca > ccb) && (ccb>0);
  132. }
  133. // Unicode standard requires bubble sort, for stability reasons?
  134. void canonicalOrdering(std::u32string& us)
  135. {
  136. if(us.size()<2)
  137. return;
  138. for(unsigned n=us.size(); n>1; --n)
  139. for(unsigned i=0; i<n-1; ++i)
  140. {
  141. char32_t& a = us[i];
  142. char32_t& b = us[i+1];
  143. if( isReorderablePair(a,b) )
  144. {
  145. std::swap(a,b);
  146. }
  147. }
  148. }
  149. } // end of anonymous namespace
  150. std::ostream& operator<<(std::ostream& o, IsNFC is_nfc)
  151. {
  152. switch(is_nfc)
  153. {
  154. case IsNFC::No : return o << "No";
  155. case IsNFC::Maybe : return o << "Maybe";
  156. case IsNFC::Yes : return o << "Yes";
  157. }
  158. throw std::logic_error("Unknown value of IsNFC");
  159. }
  160. uint32_t parseUtf8(const char*& c, const char* end)
  161. {
  162. while(c<end)
  163. {
  164. const uint8_t u = uint8_t(*c);
  165. if (u<=0x7f)
  166. {
  167. return u;
  168. } else if (u<=0xBF)
  169. {
  170. throw cont_without_start(u);
  171. } else if (u<=0xC1) // 0xC0, 0xC1 would form "overlong sequences" and are therefore always illegal in UTF-8
  172. {
  173. throw no_unicode(u);
  174. } else if (u<=0xDF) // 2 octet sequence
  175. {
  176. ++c;
  177. if(c==end) throw unexpected_end(u);
  178. const uint8_t uu = uint8_t(*c);
  179. if((uu & 0xC0) != 0x80)
  180. {
  181. throw unexpected_end(uu);
  182. }
  183. return ((u & 0x1F) << 6) + (uu & 0x3F);
  184. } else if (u<=0xEF) // 3 octet sequence
  185. {
  186. ++c;
  187. if(c==end) throw unexpected_end(u);
  188. const uint8_t uu = uint8_t(*c);
  189. if((uu & 0xC0) != 0x80)
  190. {
  191. throw unexpected_end(uu);
  192. }
  193. ++c;
  194. if(c==end) throw unexpected_end(uu);
  195. const uint8_t uuu = uint8_t(*c);
  196. if((uuu & 0xC0) != 0x80)
  197. {
  198. throw unexpected_end(uuu);
  199. }
  200. const uint32_t ret = ((u & 0xF) << 12) + ((uu & 0x3F)<<6) + (uuu & 0x3F);
  201. if(ret<0x800) throw overlong_sequence(u, ret);
  202. if(ret>=0xD800 && ret<=0xDFFF) throw surrogate(u, ret);
  203. return ret;
  204. } else if (u<=0xF4) // 4 octet sequence
  205. {
  206. ++c;
  207. if(c==end) throw unexpected_end(u);
  208. const uint8_t uu = uint8_t(*c);
  209. if((uu & 0xC0) != 0x80)
  210. {
  211. throw unexpected_end(uu);
  212. }
  213. ++c;
  214. if(c==end) throw unexpected_end(uu);
  215. const uint8_t uuu = uint8_t(*c);
  216. if((uuu & 0xC0) != 0x80)
  217. {
  218. throw unexpected_end(uuu);
  219. }
  220. ++c;
  221. if(c==end) throw unexpected_end(uuu);
  222. const uint8_t uuuu = uint8_t(*c);
  223. if((uuuu & 0xC0) != 0x80)
  224. {
  225. throw unexpected_end(uuuu);
  226. }
  227. const uint32_t ret = ((u & 0xF) << 18) + ((uu & 0x3F)<<12) + ((uuu & 0x3F)<<6) + (uuuu & 0x3F);
  228. if(ret<0x10000) throw overlong_sequence(u, ret);
  229. if(ret>0x10FFFF) throw too_big(u, ret);
  230. return ret;
  231. } else
  232. {
  233. throw no_unicode(u);
  234. }
  235. }
  236. throw unexpected_end(-1);
  237. }
  238. void toUtf8(const char32_t c, std::string& ret)
  239. {
  240. if(c<=0x7F)
  241. {
  242. ret += char(c);
  243. }else if(c<=0x7FF)
  244. {
  245. ret += char( 0xC0 + (c>>6) );
  246. ret += char( 0x80 + (c & 63));
  247. }else if(c<=0xFFFF)
  248. {
  249. ret += char( 0xE0 + (c>>12) );
  250. ret += char( 0x80 + ((c>>6) & 63));
  251. ret += char( 0x80 + (c & 63));
  252. }else if(c<=0x10FFFF)
  253. {
  254. ret += char( 0xF0 + (c>>18) );
  255. ret += char( 0x80 + ((c>>12) & 63));
  256. ret += char( 0x80 + ((c>>6) & 63));
  257. ret += char( 0x80 + (c & 63));
  258. }else{
  259. throw too_big(0, c);
  260. }
  261. }
  262. std::string toUtf8(const std::u32string& u32)
  263. {
  264. std::string ret;
  265. for(char32_t c : u32)
  266. {
  267. toUtf8(c, ret);
  268. }
  269. return ret;
  270. }
  271. illegal_utf8::illegal_utf8( sv s, unsigned position, const std::string& reason)
  272. : std::runtime_error( "Illegal UTF-8 string \"" + escape(s) + "\" at position " + std::to_string(position) + ": " + reason )
  273. {}
  274. illegal_utf8::illegal_utf8( const std::string& msg )
  275. : std::runtime_error( msg )
  276. {}
  277. void assert_utf8(sv s)
  278. {
  279. const char* begin = s.data();
  280. const char* const end = s.data() + s.size();
  281. try
  282. {
  283. while(begin<end)
  284. {
  285. parseUtf8(begin, end); // ignore the output
  286. ++begin;
  287. }
  288. }
  289. catch(const utf8_exception& e)
  290. {
  291. throw illegal_utf8(s, begin - s.data(), e.reason());
  292. }
  293. }
  294. // creates a NFD string from s
  295. std::u32string fromUtf8_decompose(sv s)
  296. {
  297. std::u32string u32s;
  298. u32s.reserve( static_cast<std::size_t>(s.size()*1.25) );
  299. const char* begin = s.data();
  300. const char* end = s.data() + s.size();
  301. for(; begin<end; ++begin)
  302. {
  303. unsigned u = parseUtf8(begin, end);
  304. u32s += decompose_full(u);
  305. }
  306. canonicalOrdering(u32s); // works inplace.
  307. return u32s;
  308. }
  // Tells whether the character at C is blocked from the starter at L
  // (Unicode Standard, definition D115): some character B strictly between
  // L and C is a starter, or has a combining class that is the same as or
  // higher than the class of C.
  //
  // Slots that combine() has already merged into the starter are marked with
  // -1. They no longer exist in the text and must be skipped; otherwise the
  // missing combining-class entry of the marker reads as class 0 and every
  // later mark would be reported as blocked, so only the first mark after a
  // starter could ever be composed.
  template<class Iter>
  bool blocked(Iter L, Iter C)
  {
      Iter B = L; ++B;
      for(;B!=C;++B)
      {
          if( int(*B) < 0 )
              continue; // consumed by an earlier composition

          const unsigned class_b = canonicalClass(*B);
          if(class_b==0 || class_b>=canonicalClass(*C))
              return true;
      }
      return false;
  }
  320. template<class Iter>
  321. void combine(std::u32string& nfc, Iter starter, Iter next_starter)
  322. {
  323. Iter c = starter; ++c;
  324. for(;c!=next_starter; ++c)
  325. {
  326. if(!blocked(starter, c))
  327. {
  328. const unsigned starter_u = *starter;
  329. const unsigned c_u = *c;
  330. auto q = NFC_Compose.find( std::make_pair(starter_u,c_u) );
  331. if(q!=NFC_Compose.end())
  332. {
  333. *starter = q->second;
  334. *c = -1;
  335. }
  336. }
  337. }
  338. // now add the remaining/changed characters to the NFC string:
  339. for(Iter c = starter; c!=next_starter; ++c)
  340. {
  341. if( int(*c) >= 0)
  342. {
  343. nfc += *c;
  344. }
  345. }
  346. }
  347. // the nfd string is changed during composing process. So it works on a copy or call with std::move().
  348. std::u32string createNFC(std::u32string nfd)
  349. {
  350. if(nfd.size()<=1)
  351. return nfd;
  352. std::u32string nfc;
  353. nfc.reserve(nfd.size());
  354. auto starter = nfd.begin();
  355. while( starter != nfd.end() )
  356. {
  357. if( canonicalClass(*starter)!=0 )
  358. {
  359. nfc += *starter;
  360. ++starter;
  361. }else{
  362. auto next_starter = std::find_if(starter+1, nfd.end(), [](char32_t c){return canonicalClass(c)==0;} );
  363. combine(nfc, starter, next_starter);
  364. starter = next_starter;
  365. }
  366. }
  367. return nfc;
  368. }
  369. IsNFC isNFC_quick_check(sv s)
  370. {
  371. const char* begin = s.data();
  372. const char* const end = s.data() + s.size();
  373. try
  374. {
  375. unsigned last_cc = 0;
  376. while(begin<end)
  377. {
  378. const uint32_t u = parseUtf8(begin, end);
  379. const unsigned cc = canonicalClass(u);
  380. if( (cc!=0) && (last_cc > cc) )
  381. {
  382. return IsNFC::No;
  383. }
  384. if(NFC_No.count(u)) return IsNFC::No;
  385. if(NFC_Maybe.count(u)) return IsNFC::Maybe;
  386. ++begin;
  387. last_cc = cc;
  388. }
  389. }
  390. catch(const utf8_exception& e)
  391. {
  392. throw illegal_utf8(s, begin - s.data(), e.reason());
  393. }
  394. return IsNFC::Yes;
  395. }
  396. bool isNFC(sv s)
  397. {
  398. switch( isNFC_quick_check(s) )
  399. {
  400. case IsNFC::Yes : return true;
  401. case IsNFC::No : return false;
  402. case IsNFC::Maybe:
  403. {
  404. return s == toNFC(s); // very expensive!
  405. }
  406. }
  407. throw -1; // could never happen, but compiler is too dumb to see this.
  408. }
  409. bool isUtf8(const char* begin, const char* end)
  410. try{
  411. for(; begin<end; ++begin)
  412. {
  413. (void)parseUtf8(begin, end);
  414. }
  415. return true;
  416. }catch(const illegal_utf8&)
  417. {
  418. return false;
  419. }
  420. // s is ''moved'' to the return value if possible so no copy is done here.
  421. std::string toNFC(sv s)
  422. {
  423. if(isNFC_quick_check(s)==IsNFC::Yes)
  424. return std::string{s};
  425. return toUtf8( createNFC( fromUtf8_decompose(s) ));
  426. }
  427. // used only to initialize the NFC Compose mapping:
  428. std::map< std::pair<unsigned, unsigned>, unsigned> generate_nfc_compose()
  429. {
  430. std::map< std::pair<unsigned, unsigned>, unsigned> m;
  431. for(const auto& decomp : NFC_Decompose)
  432. {
  433. if(decomp.second.second >= 0) // skip singleton decompositions
  434. {
  435. m[ decomp.second ] = decomp.first;
  436. }
  437. }
  438. return m;
  439. }