From dd86be33352f991733184f6487cb32eefa76e4e0 Mon Sep 17 00:00:00 2001
From: Mat Sutcliffe
Date: Fri, 26 Jul 2019 21:10:11 +0100
Subject: [PATCH] String simplification (accent removal) using Unicode decomposition

---
Review notes: simplifyByDecomposition() normalizes the whole string to NFKD
before dropping mark characters, so multi-level decompositions and input that
is already decomposed are simplified as well. The blob ids on the "index"
lines are those of the originally submitted patch.

 src/blackmisc/stringutils.cpp                 | 17 +++++++++++++++++
 src/blackmisc/stringutils.h                   |  4 ++++
 .../teststringutils/teststringutils.cpp       | 13 +++++++++++++
 3 files changed, 34 insertions(+)

diff --git a/src/blackmisc/stringutils.cpp b/src/blackmisc/stringutils.cpp
index edeab2b69..0224ce4ed 100644
--- a/src/blackmisc/stringutils.cpp
+++ b/src/blackmisc/stringutils.cpp
@@ -243,6 +243,23 @@ namespace BlackMisc
         return output;
     }
 
+    QString simplifyByDecomposition(const QString &s)
+    {
+        // Normalize the whole string to NFKD rather than decomposing QChar by QChar:
+        // QChar::decomposition() is a single-level mapping (U+1EA5 -> U+00E2 U+0301
+        // would leave the still accented U+00E2), and a per-character decomposition
+        // never touches combining marks which are already present in the input.
+        const QString decomposed = s.normalized(QString::NormalizationForm_KD);
+
+        QString result;
+        result.reserve(decomposed.size());
+        for (const QChar c : decomposed)
+        {
+            if (!c.isMark()) { result.push_back(c); }
+        }
+        return result;
+    }
+
     bool caseInsensitiveStringCompare(const QString &c1, const QString &c2)
     {
         return c1.length() == c2.length() && c1.startsWith(c2, Qt::CaseInsensitive);
diff --git a/src/blackmisc/stringutils.h b/src/blackmisc/stringutils.h
index be0f02405..dbdbec302 100644
--- a/src/blackmisc/stringutils.h
+++ b/src/blackmisc/stringutils.h
@@ -256,6 +256,10 @@ namespace BlackMisc
     //! Remove accents / diacritic marks from a string
     BLACKMISC_EXPORT QString simplifyAccents(const QString &candidate);
 
+    //! Remove accents / diacritic marks from a string by doing a Unicode (NFKD) decomposition and removing mark characters
+    //! \remark all mark characters are removed, also combining marks which are already present in the candidate
+    BLACKMISC_EXPORT QString simplifyByDecomposition(const QString &candidate);
+
     //! Case insensitive string compare
     BLACKMISC_EXPORT bool caseInsensitiveStringCompare(const QString &c1, const QString &c2);
 
diff --git a/tests/blackmisc/teststringutils/teststringutils.cpp b/tests/blackmisc/teststringutils/teststringutils.cpp
index 4c6093eb9..cfa87303a 100644
--- a/tests/blackmisc/teststringutils/teststringutils.cpp
+++ b/tests/blackmisc/teststringutils/teststringutils.cpp
@@ -36,6 +36,7 @@ namespace BlackMiscTest
         void testSplit();
         void testTimestampParsing();
         void testCodecs();
+        void testSimplify();
     };
 
     void CTestStringUtils::testRemove()
@@ -157,6 +158,18 @@ namespace BlackMiscTest
         QVERIFY2(okRu1, "Russian \"test\" equal after round-trip with cp1251");
         QVERIFY2(okRu2, "Russian \"test\" equal after round-trip with utf8");
     }
+
+    void CTestStringUtils::testSimplify()
+    {
+        const QString input = QString::fromUtf8(u8"ŠŽšžŸÀÁÂÃÄÅÇÈÉÊËÌÍÎÏÑÒÓÔÕÖÙÚÛÜÝàáâãäåçèéêëìíîïñòóôõöùúûüýÿ");
+        const QString output = QLatin1String("SZszYAAAAAACEEEEIIIINOOOOOUUUUYaaaaaaceeeeiiiinooooouuuuyy");
+        QCOMPARE(simplifyAccents(input), output);
+        QCOMPARE(simplifyByDecomposition(input), output);
+
+        // multi-level decompositions (U+1EA5, U+01D7) and a combining mark already present in the input (U+0301)
+        QCOMPARE(simplifyByDecomposition(QString::fromUtf8(u8"\u1EA5\u01D7")), QStringLiteral("aU"));
+        QCOMPARE(simplifyByDecomposition(QString::fromUtf8(u8"e\u0301")), QStringLiteral("e"));
+    }
 }
 
 //! main