diff --git a/src/Makefile b/src/Makefile
index 911e533..e7db948 100644
--- a/src/Makefile
+++ b/src/Makefile
@@ -18,7 +18,7 @@ unittests: unittest_mime.o unittest_nfc.o unittest_timestamp.o \
 	unittest_stringcase.o unittest_toutf8.o unittest_address.o \
 	unittest_rule.o unittest_subject.o \
 	gtest-all.o gtest_main.o libpEpMIME.a
-	${CXX} -L${HOME}/local/lib/ -o $@ $^ -lpEpAdapter -lpEpEngine -lpthread
+	${CXX} -L${HOME}/local/lib/ -o $@ $^ -lpEpAdapter -lpEpEngine -lpthread -liconv
 
 gtest-all.o: $(GTEST_DIR)/src/gtest-all.cc
 	$(CXX) $(CPPFLAGS) -I$(GTEST_DIR) $(CXXFLAGS) -isystem $(GTEST_DIR)/include -o $@ -c $<
diff --git a/src/to_utf8.cc b/src/to_utf8.cc
index f7de5c0..f179d30 100644
--- a/src/to_utf8.cc
+++ b/src/to_utf8.cc
@@ -3,6 +3,8 @@
 #include "nfc.hh" // for toUtf8() of single codepoints. :-)
 #include <stdexcept>
 #include <boost/algorithm/string/case_conv.hpp>
+#include <iconv.h>
+#include <errno.h>
 
 namespace
 {
@@ -58,23 +60,73 @@ namespace
 	}
 	return ret;
 }
-}
 
-std::string to_utf8( const std::string& charset, const std::string& s)
+
+static const size_t IconvBufSize = 64;
+
+std::string to_utf8_iconv(const std::string& charset, const std::string& s)
 {
-	switch( case_hash(charset) )
+	iconv_t ict = iconv_open("UTF-8", charset.c_str());
+	if(ict == (iconv_t)-1)
 	{
-		case "UTF-8"_case :
-		case "UTF8"_case : return s;
-		case "ISO-8859-1"_case: return from_latin1(s);
+		if(errno==EINVAL)
+		{
+			throw std::runtime_error("Cannot convert from charset \"" + charset + "\" to UTF-8.");
+		}else{
+			throw std::runtime_error(std::string("Internal error: ") + strerror(errno) );
+		}
 	}
-	// try again with to_upper;
+
+	// be exception-safe from here on:
+	auto ict_wrapper = std::unique_ptr<std::remove_pointer<iconv_t>::type, decltype(&iconv_close)>( ict, &iconv_close);
+
+	std::string ret;
+	ret.reserve(s.size());
+
+	char buffer[ IconvBufSize ];
+	char* in_p = const_cast<char*>(s.c_str()); // iconv sucks.
+	size_t in_len = s.size();
+
+	while(in_len)
+	{
+		char* out_p = buffer;
+		size_t out_len = IconvBufSize;
+		const size_t r = iconv(ict, &in_p, &in_len, &out_p, &out_len);
+		if(r==static_cast<size_t>(-1))
+		{
+			if(errno == E2BIG)
+			{
+				// ignore
+			}else{
+				// skip octet
+				++in_p;
+				--in_len;
+			}
+		}
+		ret.append(buffer, buffer + IconvBufSize - out_len);
+	}
+
+	return ret;
+}
+
+
+} // end of anonymous namespace
+
+
+std::string to_utf8( const std::string& charset, const std::string& s)
+{
 	std::string charset_upper{charset};
 	boost::algorithm::to_upper(charset_upper);
-	if(charset_upper == charset)
+
+	switch( lcase_hash(charset_upper) )
 	{
-		throw std::runtime_error("Unknown charset \"" + charset + "\"");
+		case "UTF-8"_lcase :
+		case "UTF8"_lcase : return s;
+		case "CP1252"_lcase:
+		case "CP_1252"_lcase:
+		case "ISO-8859-1"_lcase: return from_latin1(s);
 	}
 
-	return to_utf8( charset_upper, s);
+	// all other charsets: let's do that by libiconv. :-/
+	return to_utf8_iconv(charset, s);
 }
diff --git a/src/unittest_toutf8.cc b/src/unittest_toutf8.cc
index de263d7..ac50264 100644
--- a/src/unittest_toutf8.cc
+++ b/src/unittest_toutf8.cc
@@ -54,3 +54,15 @@ TEST( ToUtf8, Latin1 )
 	EXPECT_EQ( to_utf8("ISO-8859-1", "\x84\xdc" "bergr\xf6\xdf" "en\xe4" "nderung\x93: 10\x80!"), "„Übergrößenänderung“: 10€!" );
 	EXPECT_EQ( to_utf8("UTF-8", "„Übergrößenänderung“: 10€!"), "„Übergrößenänderung“: 10€!" );
 }
+
+TEST( ToUtf8, Latin9 )
+{
+	EXPECT_EQ( to_utf8("ISO-8859-15", "\xdc" "bergr\xf6\xdf" "en\xe4nderung: 10\xa4!"), "Übergrößenänderung: 10\xe2\x82\xac!" );
+	std::string latin9, utf8;
+	for(unsigned u=0; u<277; ++u)
+	{
+		latin9 += "\xb0\xa4\xbe"; // degree, euro, capital Y with diaeresis: °€Ÿ
+		utf8 += "\xc2\xb0" "\xe2\x82\xac" "\xc5\xb8";
+		EXPECT_EQ( to_utf8("ISO-8859-15", latin9), utf8 );
+	}
+}