String simplification (accent removal) using Unicode decomposition

This commit is contained in:
Mat Sutcliffe
2019-07-26 21:10:11 +01:00
parent cb8eb6569e
commit dd86be3335
3 changed files with 32 additions and 0 deletions

View File

@@ -243,6 +243,26 @@ namespace BlackMisc
return output;
}
// Remove accents / diacritic marks by Unicode decomposition.
//
// Every UTF-16 code unit of s that has no decomposition is copied unchanged
// (this includes stand-alone combining marks and surrogate halves). Any other
// character is replaced by the non-mark characters of its decomposition.
//
// QChar::decomposition() yields only the character's own mapping, and the parts
// of that mapping can be decomposable themselves: U+1EA4 maps to U+00C2 U+0301,
// and U+00C2 in turn maps to U+0041 U+0302. The parts are therefore decomposed
// again until nothing decomposable is left, otherwise such characters would
// keep one of their accents.
QString simplifyByDecomposition(const QString &s)
{
    QString result;
    result.reserve(s.size()); // the common case is one output character per input character

    for (const QChar c : s)
    {
        if (c.decompositionTag() == QChar::NoDecomposition)
        {
            result.push_back(c);
            continue;
        }

        // Work list of the parts of c which still have to be examined, in output order.
        QString pending = c.decomposition();
        for (int i = 0; i < pending.size();)
        {
            const QChar dc = pending.at(i);
            if (dc.isMark())
            {
                // accent / diacritic: drop it
                ++i;
            }
            else if (dc.decompositionTag() == QChar::NoDecomposition)
            {
                // fully decomposed base character: keep it
                result.push_back(dc);
                ++i;
            }
            else
            {
                // still decomposable: expand in place and examine the expansion next
                // (Unicode decomposition mappings contain no cycles, so this terminates)
                pending.replace(i, 1, dc.decomposition());
            }
        }
    }
    return result;
}
bool caseInsensitiveStringCompare(const QString &c1, const QString &c2)
{
return c1.length() == c2.length() && c1.startsWith(c2, Qt::CaseInsensitive);

View File

@@ -256,6 +256,9 @@ namespace BlackMisc
//! Remove accents / diacritic marks from a string
//! \param candidate string to be simplified
//! \return simplified copy, the passed string is not modified
BLACKMISC_EXPORT QString simplifyAccents(const QString &candidate);
//! Remove accents / diacritic marks from a string by doing a Unicode decomposition and removing mark characters
//! \details Characters without a Unicode decomposition are copied unchanged, any other character
//!          is replaced by its decomposition with the mark characters (QChar::isMark) left out.
//! \param candidate string to be simplified
//! \return simplified copy, the passed string is not modified
BLACKMISC_EXPORT QString simplifyByDecomposition(const QString &candidate);
//! Case insensitive string compare
//! \return true if both strings have the same length and c1 starts with c2 when case is ignored
BLACKMISC_EXPORT bool caseInsensitiveStringCompare(const QString &c1, const QString &c2);

View File

@@ -36,6 +36,7 @@ namespace BlackMiscTest
void testSplit();
void testTimestampParsing();
void testCodecs();
void testSimplify();
};
void CTestStringUtils::testRemove()
@@ -157,6 +158,14 @@ namespace BlackMiscTest
QVERIFY2(okRu1, "Russian \"test\" equal after round-trip with cp1251");
QVERIFY2(okRu2, "Russian \"test\" equal after round-trip with utf8");
}
//! Both accent removal functions have to turn the accented Latin letters into their plain ASCII base letters
void CTestStringUtils::testSimplify()
{
    // expected plain letters, position by position matching the accented letters below
    const QString output(QLatin1String("SZszYAAAAAACEEEEIIIINOOOOOUUUUYaaaaaaceeeeiiiinooooouuuuyy"));

    // accented letters, decoded explicitly as UTF-8 so the test does not depend on a default codec
    const QString input(QString::fromUtf8(u8"ŠŽšžŸÀÁÂÃÄÅÇÈÉÊËÌÍÎÏÑÒÓÔÕÖÙÚÛÜÝàáâãäåçèéêëìíîïñòóôõöùúûüýÿ"));

    // the two implementations are expected to agree on this input
    QCOMPARE(simplifyAccents(input), output);
    QCOMPARE(simplifyByDecomposition(input), output);
}
}
//! main