20#include <QApplication>
29TLDExtractor::TLDExtractor(QObject* parent)
// Returns the built-in data search path list: a single entry,
// ":/tldextractor/data" (a ":/"-prefixed, Qt-resource-style path).
35QStringList TLDExtractor::defaultDataSearchPaths()
37 return QStringList() << QLatin1String(
":/tldextractor/data");
42 if(s_instance ==
nullptr)
57 return !m_tldHash.isEmpty();
62 if (host.isEmpty() || host.startsWith(QLatin1Char(
'.'))) {
66 QString cleanHost = normalizedHost(host);
68 QString tldPart = cleanHost.mid(cleanHost.lastIndexOf(QLatin1Char(
'.')) + 1);
69 cleanHost = QString::fromUtf8(QUrl::toAce(cleanHost));
73 if (!m_tldHash.contains(tldPart)) {
77 QStringList tldRules = m_tldHash.values(tldPart);
79 if (!tldRules.contains(tldPart)) {
83 int maxLabelCount = 0;
84 bool isExceptionTLD =
false;
85 bool isWildcardTLD =
false;
87 for (QString rule : std::as_const(tldRules)) {
88 const int labelCount = rule.count(QLatin1Char(
'.')) + 1;
90 if (rule.startsWith(QLatin1Char(
'!'))) {
93 rule = QString::fromUtf8(QUrl::toAce(rule));
94 isExceptionTLD =
true;
97 if (cleanHost.endsWith(rule)) {
98 tldPart = rule.mid(rule.indexOf(QLatin1Char(
'.')) + 1);
103 isExceptionTLD =
false;
106 if (rule.startsWith(QLatin1Char(
'*'))) {
109 if (rule.startsWith(QLatin1Char(
'.'))) {
113 isWildcardTLD =
true;
116 isWildcardTLD =
false;
119 Q_UNUSED(isExceptionTLD)
121 rule = QString::fromUtf8(QUrl::toAce(rule));
122 const QString testRule = QLatin1Char(
'.') + rule;
123 const QString testUrl = QLatin1Char(
'.') + cleanHost;
125 if (labelCount > maxLabelCount && testUrl.endsWith(testRule)) {
127 maxLabelCount = labelCount;
130 QString temp = cleanHost;
131 temp.remove(temp.lastIndexOf(tldPart), tldPart.size());
133 if (temp.endsWith(QLatin1Char(
'.'))) {
134 temp.remove(temp.size() - 1, 1);
137 temp = temp.mid(temp.lastIndexOf(QLatin1Char(
'.')) + 1);
139 tldPart = temp.isEmpty() ? rule : (temp + QLatin1Char(
'.') + rule);
144 QString temp = normalizedHost(host);
145 tldPart = temp.section(QLatin1Char(
'.'), temp.count(QLatin1Char(
'.')) - tldPart.count(QLatin1Char(
'.')));
152 const QString tldPart =
TLD(host);
154 return domainHelper(host, tldPart);
// Extracts the label that sits directly to the left of tldPart in host.
//
// host    - host name; it is lower-cased through normalizedHost() before use.
// tldPart - suffix to strip; its last occurrence in the lower-cased host is removed.
//
// Returns the text following the last '.' of what is left once the suffix and
// one trailing '.' have been removed.
157QString TLDExtractor::domainHelper(
const QString &host,
const QString &tldPart)
// Guard: there is nothing to extract when either argument is empty.
159 if (host.isEmpty() || tldPart.isEmpty()) {
163 QString temp = normalizedHost(host);
// Cut the last occurrence of the suffix out of the host.
// NOTE(review): tldPart is searched for verbatim in the lower-cased host —
// confirm callers always pass a suffix in the same form as the normalized host.
164 temp.remove(temp.lastIndexOf(tldPart), tldPart.size());
// Drop the separator dot that preceded the removed suffix.
166 if (temp.endsWith(QLatin1Char(
'.'))) {
167 temp.remove(temp.size() - 1, 1);
// lastIndexOf() gives -1 when no '.' remains, so mid(0) yields the whole remainder.
170 return temp.mid(temp.lastIndexOf(QLatin1Char(
'.')) + 1);
// Builds "<domainPart>.<tldPart>" from a domain label and a suffix.
//
// domainPart - label placed before the dot.
// tldPart    - suffix placed after the dot.
//
// An emptiness check on both parts comes before the join.
173QString TLDExtractor::registrableDomainHelper(
const QString &domainPart,
const QString &tldPart)
// Guard evaluated before joining: either part being empty takes this branch.
175 if (tldPart.isEmpty() || domainPart.isEmpty()) {
// Multi-argument arg() fills %1 and %2 in a single pass.
179 return QStringLiteral(
"%1.%2").arg(domainPart, tldPart);
// Helper that works on the lower-cased host relative to registrablePart.
// Its work is done only when registrablePart is non-empty; it starts from
// normalizedHost(host) and later tests the working string for a trailing '.'.
//
// NOTE(review): the second parameter below reads "®istrablePart", while the body
// uses "registrablePart". This looks like "&registrablePart" whose "&reg" was
// mis-decoded as the (R) sign — confirm against the pristine file and restore
// "const QString &registrablePart".
183QString TLDExtractor::subdomainHelper(
const QString &host,
const QString ®istrablePart)
// Only proceed when there is a registrable part to work against.
185 if (!registrablePart.isEmpty()) {
186 QString
subdomain = normalizedHost(host);
// Trailing-dot check on the working string.
190 if (
subdomain.endsWith(QLatin1Char(
'.'))) {
202 const QString tldPart =
TLD(host);
204 return registrableDomainHelper(domainHelper(host, tldPart), tldPart);
217 hostParts.
host = host;
218 hostParts.
tld =
TLD(host);
219 hostParts.
domain = domainHelper(host, hostParts.
tld);
228 return m_dataSearchPaths;
233 m_dataSearchPaths = searchPaths;
235 m_dataSearchPaths << TLDExtractor::defaultDataSearchPaths();
237 m_dataSearchPaths.removeDuplicates();
// Locates "effective_tld_names.dat" in m_dataSearchPaths and feeds it to parseData().
//
// Each search path is probed for "<path>/effective_tld_names.dat". When no
// candidate exists, an information message box lists a download link and the
// searched paths. The chosen file name is stored in m_dataFileName, and a
// warning is logged when parseData() reports failure.
240void TLDExtractor::loadData()
246 QString dataFileName;
247 bool parsedDataFileExist =
false;
// Probe every configured search path for the rule file.
249 for (
const QString &path : std::as_const(m_dataSearchPaths)) {
250 dataFileName = QFileInfo(path + QLatin1String(
"/effective_tld_names.dat")).absoluteFilePath();
252 if (QFileInfo::exists(dataFileName)) {
253 parsedDataFileExist =
true;
// No candidate found: tell the user where to get the file and where to put it.
259 if (!parsedDataFileExist) {
// NOTE(review): plain-http mxr.mozilla.org link — verify it still resolves.
260 const QString tldDataFileDownloadLink = QLatin1String(
"http://mxr.mozilla.org/mozilla-central/source/netwerk/dns/effective_tld_names.dat?raw=1");
// A null parent is passed, so the box is not tied to any window.
261 QMessageBox::information(
nullptr, tr(
"File not found!"),
262 tr(
"File \'effective_tld_names.dat\' was not found!\n"
263 "You can download it from \'<a href=\"%1\"><b>here</b></a>\' to one of the following paths:\n%2")
264 .arg(tldDataFileDownloadLink, m_dataSearchPaths.join(QStringLiteral(
"\n"))));
// Remember which file was used; a later parseData(m_dataFileName, true) call re-reads it.
269 m_dataFileName = dataFileName;
// Parse with the default loadPrivateDomains argument; failure is only logged.
271 if (!parseData(dataFileName)) {
272 qWarning() <<
"TLDExtractor: There is some parse errors for file:" << dataFileName;
// Reads a suffix-rule file line by line and records each rule in m_tldHash.
//
// dataFile           - path of the rule file; opened read-only in text mode.
// loadPrivateDomains - when false, the "===BEGIN PRIVATE DOMAINS===" marker
//                      is acted upon (see below); when true that marker is ignored.
//
// Returns a bool; loadData() treats false as a parse failure.
//
// Storage scheme: a rule without '.' is stored under itself; any other rule is
// stored under the text after its last '.', with the full rule as the value.
276bool TLDExtractor::parseData(
const QString &dataFile,
bool loadPrivateDomains)
280 QFile file(dataFile);
// Failure to open is checked before any parsing starts.
282 if (!file.open(QFile::ReadOnly | QFile::Text)) {
// Set while the reader is inside the private-domains section that must be skipped.
286 bool seekToEndOfPrivateDomains =
false;
288 while (!file.atEnd()) {
// Decode as UTF-8; simplified() trims the ends and collapses inner whitespace runs.
289 QString line = QString::fromUtf8(file.readLine().constData()).simplified();
// Blank lines are tested for first.
291 if (line.isEmpty()) {
// Rules written with a leading dot get their own branch.
295 if (line.startsWith(QLatin1Char(
'.'))) {
// "//" lines are comments; two of them are section markers.
300 if (line.startsWith(QLatin1String(
"//"))) {
// End marker: stop skipping.
301 if (line.contains(QLatin1String(
"===END PRIVATE DOMAINS==="))) {
302 seekToEndOfPrivateDomains =
false;
// Begin marker matters only when private domains were not requested.
305 if (!loadPrivateDomains && line.contains(QLatin1String(
"===BEGIN PRIVATE DOMAINS==="))) {
// Skipping is enabled only while no rule has been stored yet.
306 if (m_tldHash.isEmpty()) {
307 seekToEndOfPrivateDomains =
true;
// Branch taken for every line while the skip flag is set.
317 if (seekToEndOfPrivateDomains) {
// Keep only the first whitespace-separated token. When there is no space,
// indexOf() is -1 and left(-1) returns the whole line.
322 line = line.left(line.indexOf(QLatin1Char(
' ')));
// Single-label rule: key and value are the rule itself.
324 if (!line.contains(QLatin1Char(
'.'))) {
325 m_tldHash.insert(line, line);
// Multi-label rule: key is the last label, value is the full rule.
328 QString key = line.mid(line.lastIndexOf(QLatin1Char(
'.')) + 1);
330 m_tldHash.insert(key, line);
// Returns host converted to lower case; the argument itself is not modified.
337QString TLDExtractor::normalizedHost(
const QString &host)
const
339 return host.toLower();
345 if (!parseData(m_dataFileName,
true)) {
349 QString testDataFileName;
350 bool testDataFileExist =
false;
352 for (
const QString &path : std::as_const(m_dataSearchPaths)) {
353 testDataFileName = QFileInfo(path + QLatin1String(
"/test_psl.txt")).absoluteFilePath();
355 if (QFileInfo::exists(testDataFileName)) {
356 testDataFileExist =
true;
361 if (!testDataFileExist) {
362 const QString testFileDownloadLink = QLatin1String(
"http://mxr.mozilla.org/mozilla-central/source/netwerk/test/unit/data/test_psl.txt?raw=1");
364 QMessageBox::information(
nullptr, tr(
"File not found!"),
365 tr(
"File \'test_psl.txt\' was not found!\n"
366 "You can download it from \'<a href=\"%1\"><b>here</b></a>\' to one of the following paths:\n%2")
367 .arg(testFileDownloadLink, m_dataSearchPaths.join(QStringLiteral(
"\n"))));
372 QFile file(testDataFileName);
374 if (!file.open(QFile::ReadOnly | QFile::Text)) {
378 QRegExp testRegExp(QStringLiteral(
"checkPublicSuffix\\(('([^']+)'|null), ('([^']+)'|null)\\);"));
379 bool allTestSuccess =
true;
381 while (!file.atEnd()) {
382 QString line = QString::fromUtf8(file.readLine().constData()).simplified();
384 if (line.startsWith(QLatin1String(
"//")) || line.isEmpty()) {
388 testRegExp.indexIn(line);
390 const QString hostName = testRegExp.cap(2);
391 const QString registrableName = testRegExp.cap(4);
393 if (!checkPublicSuffix(hostName, registrableName)) {
394 allTestSuccess =
false;
398 if (allTestSuccess) {
399 qWarning() <<
"TLDExtractor: Test passed successfully.";
402 qWarning() <<
"TLDExtractor: Test finished with some errors!";
408 return allTestSuccess;
411bool TLDExtractor::checkPublicSuffix(
const QString &hostName,
const QString ®istrableName)
414 qWarning() <<
"TLDExtractor Test Error: hostName:" << hostName
415 <<
"Correct registrableName:" << registrableName