Browse Source

Add support for other charsets via iconv(); add tests for conversion of ISO-8859-15 strings.

afl-fuzzing
Roker 3 years ago
parent
commit
62a5079244
3 changed files with 75 additions and 11 deletions
  1. +1
    -1
      src/Makefile
  2. +62
    -10
      src/to_utf8.cc
  3. +12
    -0
      src/unittest_toutf8.cc

+ 1
- 1
src/Makefile View File

@ -18,7 +18,7 @@ unittests: unittest_mime.o unittest_nfc.o unittest_timestamp.o \
unittest_stringcase.o unittest_toutf8.o unittest_address.o \
unittest_rule.o unittest_subject.o \
gtest-all.o gtest_main.o libpEpMIME.a
${CXX} -L${HOME}/local/lib/ -o $@ $^ -lpEpAdapter -lpEpEngine -lpthread
${CXX} -L${HOME}/local/lib/ -o $@ $^ -lpEpAdapter -lpEpEngine -lpthread -liconv
gtest-all.o: $(GTEST_DIR)/src/gtest-all.cc
$(CXX) $(CPPFLAGS) -I$(GTEST_DIR) $(CXXFLAGS) -isystem $(GTEST_DIR)/include -o $@ -c $<


+ 62
- 10
src/to_utf8.cc View File

@ -3,6 +3,8 @@
#include "nfc.hh" // for toUtf8() of single codepoints. :-)
#include <algorithm>
#include <boost/algorithm/string/case_conv.hpp>
#include <iconv.h>
#include <memory>
namespace
{
@ -58,23 +60,73 @@ namespace
}
return ret;
}
}
std::string to_utf8( const std::string& charset, const std::string& s)
// NOTE(review): this span is a rendered diff hunk whose +/- markers were stripped, so pre-commit
// and post-commit lines are interleaved. Lines flagged below as "presumably removed" do not
// look like part of the new function -- confirm against the repository before compiling.

// Number of chars in the on-stack output buffer that is handed to each iconv() call.
static const size_t IconvBufSize = 64;
// Converts `s` from the character set named `charset` to UTF-8 by calling iconv().
//
// charset: name of the source encoding; passed unchanged to iconv_open().
// s:       the octets to convert.
// Returns the converted octets, collected chunk by chunk from a fixed-size buffer.
// Throws std::runtime_error if iconv_open() fails; the message depends on whether errno is EINVAL.
// Input that iconv() refuses for any reason other than E2BIG is not reported:
// one input octet is skipped and the conversion continues.
std::string to_utf8_iconv(const std::string& charset, const std::string& s)
{
// NOTE(review): presumably a removed pre-commit line (start of the old dispatch switch) -- confirm.
switch( case_hash(charset) )
iconv_t ict = iconv_open("UTF-8", charset.c_str());
// iconv_open() signals failure by returning (iconv_t)-1 and setting errno.
if(ict == (iconv_t)-1)
{
// NOTE(review): the next three `case` lines are presumably removed pre-commit lines -- confirm.
case "UTF-8"_case :
case "UTF8"_case : return s;
case "ISO-8859-1"_case: return from_latin1(s);
// Per iconv_open(3), EINVAL means the requested conversion is not supported.
// NOTE(review): errno / strerror() need <cerrno> / <cstring>; neither is among the includes
// visible in this hunk -- confirm they are pulled in elsewhere.
if(errno==EINVAL)
{
throw std::runtime_error("Cannot convert from charset \"" + charset + "\" to UTF-8.");
}else{
// NOTE(review): building the std::string may run before strerror(errno) is evaluated and
// could in principle change errno -- consider saving errno right after iconv_open().
throw std::runtime_error(std::string("Internal error: ") + strerror(errno) );
}
}
// NOTE(review): the comment on the next line is presumably a removed pre-commit line -- confirm.
// try again with to_upper;
// be exception-safe from here on:
// The unique_ptr's deleter is iconv_close, so the descriptor is closed when this scope is left.
auto ict_wrapper = std::unique_ptr<void, decltype(&iconv_close)>( ict, &iconv_close);
std::string ret;
ret.reserve(s.size());
char buffer[ IconvBufSize ];
// iconv() takes a non-const char** for its input, hence the const_cast.
char* in_p = const_cast<char*>(s.c_str());
size_t in_len = s.size();
// iconv() is handed pointers to in_p / in_len; per iconv(3) it advances them as input is consumed.
while(in_len)
{
// Each round starts with the whole buffer available for output again.
char* out_p = buffer;
size_t out_len = IconvBufSize;
const size_t r = iconv(ict, &in_p, &in_len, &out_p, &out_len);
if(r==static_cast<size_t>(-1))
{
if(errno == E2BIG)
{
// Per iconv(3), E2BIG means the output buffer is full. Nothing to do here:
// what was converted so far is appended below and the loop goes on.
}else{
// Any other error: drop one input octet and carry on with the rest.
++in_p;
--in_len;
}
}
// IconvBufSize - out_len is the number of chars iconv() wrote in this round.
ret.append(buffer, buffer + IconvBufSize - out_len);
}
return ret;
}
} // end of anonymous namespace
// Converts `s` from the character set named `charset` to UTF-8.
//
// An upper-cased copy of `charset` is used to pick a built-in conversion: the UTF-8 names
// return `s` unchanged, the CP1252 / ISO-8859-1 names go through from_latin1().
// Every other name is forwarded to to_utf8_iconv().
//
// NOTE(review): this span is a rendered diff hunk whose +/- markers were stripped, so pre-commit
// and post-commit lines are interleaved. Lines flagged below as "presumably removed" do not
// look like part of the new function -- confirm against the repository before compiling.
std::string to_utf8( const std::string& charset, const std::string& s)
{
std::string charset_upper{charset};
boost::algorithm::to_upper(charset_upper);
// NOTE(review): presumably a removed pre-commit line -- confirm.
if(charset_upper == charset)
switch( lcase_hash(charset_upper) )
{
// NOTE(review): presumably a removed pre-commit line -- confirm.
throw std::runtime_error("Unknown charset \"" + charset + "\"");
case "UTF-8"_lcase :
case "UTF8"_lcase : return s;
// NOTE(review): CP1252 and ISO-8859-1 share one converter here although the two charsets
// differ in 0x80-0x9F -- confirm that from_latin1() implements the intended mapping.
case "CP1252"_lcase:
case "CP_1252"_lcase:
case "ISO-8859-1"_lcase: return from_latin1(s);
}
// NOTE(review): presumably a removed pre-commit line -- confirm.
return to_utf8( charset_upper, s);
// all other charsets: let's do that by libiconv. :-/
// Note that the caller's original spelling of the name is passed on, not the upper-cased copy.
return to_utf8_iconv(charset, s);
}

+ 12
- 0
src/unittest_toutf8.cc View File

@ -54,3 +54,15 @@ TEST( ToUtf8, Latin1 )
EXPECT_EQ( to_utf8("ISO-8859-1", "\x84\xdc" "bergr\xf6\xdf" "en\xe4" "nderung\x93: 10\x80!"), "„Übergrößenänderung“: 10€!" );
EXPECT_EQ( to_utf8("UTF-8", "„Übergrößenänderung“: 10€!"), "„Übergrößenänderung“: 10€!" );
}
// Conversion of ISO-8859-15 (Latin-9) input to UTF-8: one fixed sample string, then a
// steadily growing input that is checked after every extension.
TEST( ToUtf8, Latin9 )
{
    EXPECT_EQ( to_utf8("ISO-8859-15", "\xdc" "bergr\xf6\xdf" "en\xe4nderung: 10\xa4!"), "Übergrößenänderung: 10\xe2\x82\xac!" );

    // One unit per round: degree sign, euro sign, capital Y with diaeresis (°€Ÿ).
    const std::string latin9_unit = "\xb0\xa4\xbe";
    const std::string utf8_unit   = "\xc2\xb0" "\xe2\x82\xac" "\xc5\xb8";

    std::string input;
    std::string expected;
    for(unsigned rounds_left = 277; rounds_left != 0; --rounds_left)
    {
        input.append(latin9_unit);
        expected.append(utf8_unit);
        // Checked on every round so that many different input lengths are covered.
        EXPECT_EQ( to_utf8("ISO-8859-15", input), expected );
    }
}

Loading…
Cancel
Save