Falkon Develop
Cross-platform Qt-based web browser
tldextractor.cpp
Go to the documentation of this file.
1/* ============================================================
2* TLDExtractor, a simple Qt interface to extract TLD part of a host
3* Copyright (C) 2014 Razi Alavizadeh <s.r.alavizadeh@gmail.com>
4*
5* This program is free software: you can redistribute it and/or modify
6* it under the terms of the GNU General Public License as published by
7* the Free Software Foundation, either version 3 of the License, or
8* (at your option) any later version.
9*
10* This program is distributed in the hope that it will be useful,
11* but WITHOUT ANY WARRANTY; without even the implied warranty of
12* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13* GNU General Public License for more details.
14*
15* You should have received a copy of the GNU General Public License
16* along with this program. If not, see <http://www.gnu.org/licenses/>.
17* ============================================================ */
18#include "tldextractor.h"
19
20#include <QApplication>
21#include <QDebug>
22#include <QFileInfo>
23#include <QMessageBox>
24#include <QUrl>
25#include <QRegExp>
26
27TLDExtractor* TLDExtractor::s_instance = nullptr;
28
29TLDExtractor::TLDExtractor(QObject* parent)
30 : QObject(parent)
31{
32 setDataSearchPaths();
33}
34
35QStringList TLDExtractor::defaultDataSearchPaths()
36{
37 return QStringList() << QLatin1String(":/tldextractor/data");
38}
39
41{
42 if(s_instance == nullptr)
43 {
44 s_instance = new TLDExtractor(qApp);
45 }
46
47 return s_instance;
48}
49
51{
52 s_instance = nullptr;
53}
54
56{
57 return !m_tldHash.isEmpty();
58}
59
60QString TLDExtractor::TLD(const QString &host)
61{
62 if (host.isEmpty() || host.startsWith(QLatin1Char('.'))) {
63 return {};
64 }
65
66 QString cleanHost = normalizedHost(host);
67
68 QString tldPart = cleanHost.mid(cleanHost.lastIndexOf(QLatin1Char('.')) + 1);
69 cleanHost = QString::fromUtf8(QUrl::toAce(cleanHost));
70
71 loadData();
72
73 if (!m_tldHash.contains(tldPart)) {
74 return tldPart;
75 }
76
77 QStringList tldRules = m_tldHash.values(tldPart);
78
79 if (!tldRules.contains(tldPart)) {
80 tldRules << tldPart;
81 }
82
83 int maxLabelCount = 0;
84 bool isExceptionTLD = false;
85 bool isWildcardTLD = false;
86
87 for (QString rule : std::as_const(tldRules)) {
88 const int labelCount = rule.count(QLatin1Char('.')) + 1;
89
90 if (rule.startsWith(QLatin1Char('!'))) {
91 rule.remove(0, 1);
92
93 rule = QString::fromUtf8(QUrl::toAce(rule));
94 isExceptionTLD = true;
95
96 // matches with exception TLD
97 if (cleanHost.endsWith(rule)) {
98 tldPart = rule.mid(rule.indexOf(QLatin1Char('.')) + 1);
99 break;
100 }
101 }
102 else {
103 isExceptionTLD = false;
104 }
105
106 if (rule.startsWith(QLatin1Char('*'))) {
107 rule.remove(0, 1);
108
109 if (rule.startsWith(QLatin1Char('.'))) {
110 rule.remove(0, 1);
111 }
112
113 isWildcardTLD = true;
114 }
115 else {
116 isWildcardTLD = false;
117 }
118
119 Q_UNUSED(isExceptionTLD)
120
121 rule = QString::fromUtf8(QUrl::toAce(rule));
122 const QString testRule = QLatin1Char('.') + rule;
123 const QString testUrl = QLatin1Char('.') + cleanHost;
124
125 if (labelCount > maxLabelCount && testUrl.endsWith(testRule)) {
126 tldPart = rule;
127 maxLabelCount = labelCount;
128
129 if (isWildcardTLD) {
130 QString temp = cleanHost;
131 temp.remove(temp.lastIndexOf(tldPart), tldPart.size());
132
133 if (temp.endsWith(QLatin1Char('.'))) {
134 temp.remove(temp.size() - 1, 1);
135 }
136
137 temp = temp.mid(temp.lastIndexOf(QLatin1Char('.')) + 1);
138
139 tldPart = temp.isEmpty() ? rule : (temp + QLatin1Char('.') + rule);
140 }
141 }
142 }
143
144 QString temp = normalizedHost(host);
145 tldPart = temp.section(QLatin1Char('.'), temp.count(QLatin1Char('.')) - tldPart.count(QLatin1Char('.')));
146
147 return tldPart;
148}
149
150QString TLDExtractor::domain(const QString &host)
151{
152 const QString tldPart = TLD(host);
153
154 return domainHelper(host, tldPart);
155}
156
157QString TLDExtractor::domainHelper(const QString &host, const QString &tldPart)
158{
159 if (host.isEmpty() || tldPart.isEmpty()) {
160 return {};
161 }
162
163 QString temp = normalizedHost(host);
164 temp.remove(temp.lastIndexOf(tldPart), tldPart.size());
165
166 if (temp.endsWith(QLatin1Char('.'))) {
167 temp.remove(temp.size() - 1, 1);
168 }
169
170 return temp.mid(temp.lastIndexOf(QLatin1Char('.')) + 1);
171}
172
173QString TLDExtractor::registrableDomainHelper(const QString &domainPart, const QString &tldPart)
174{
175 if (tldPart.isEmpty() || domainPart.isEmpty()) {
176 return {};
177 }
178 else {
179 return QStringLiteral("%1.%2").arg(domainPart, tldPart);
180 }
181}
182
183QString TLDExtractor::subdomainHelper(const QString &host, const QString &registrablePart)
184{
185 if (!registrablePart.isEmpty()) {
186 QString subdomain = normalizedHost(host);
187
188 subdomain.remove(subdomain.lastIndexOf(registrablePart), registrablePart.size());
189
190 if (subdomain.endsWith(QLatin1Char('.'))) {
191 subdomain.remove(subdomain.size() - 1, 1);
192 }
193
194 return subdomain;
195 }
196
197 return {};
198}
199
200QString TLDExtractor::registrableDomain(const QString &host)
201{
202 const QString tldPart = TLD(host);
203
204 return registrableDomainHelper(domainHelper(host, tldPart), tldPart);
205}
206
207QString TLDExtractor::subdomain(const QString &host)
208{
209 return subdomainHelper(host, registrableDomain(host));
210}
211
212// a light function that extract all parts with just one call to TLD()
214{
215 HostParts hostParts;
216
217 hostParts.host = host;
218 hostParts.tld = TLD(host);
219 hostParts.domain = domainHelper(host, hostParts.tld);
220 hostParts.registrableDomain = registrableDomainHelper(hostParts.domain, hostParts.tld);
221 hostParts.subdomain = subdomainHelper(host, hostParts.registrableDomain);
222
223 return hostParts;
224}
225
227{
228 return m_dataSearchPaths;
229}
230
231void TLDExtractor::setDataSearchPaths(const QStringList &searchPaths)
232{
233 m_dataSearchPaths = searchPaths;
234
235 m_dataSearchPaths << TLDExtractor::defaultDataSearchPaths();
236
237 m_dataSearchPaths.removeDuplicates();
238}
239
240void TLDExtractor::loadData()
241{
242 if (isDataLoaded()) {
243 return;
244 }
245
246 QString dataFileName;
247 bool parsedDataFileExist = false;
248
249 for (const QString &path : std::as_const(m_dataSearchPaths)) {
250 dataFileName = QFileInfo(path + QLatin1String("/effective_tld_names.dat")).absoluteFilePath();
251
252 if (QFileInfo::exists(dataFileName)) {
253 parsedDataFileExist = true;
254 break;
255 }
256 }
257
258
259 if (!parsedDataFileExist) {
260 const QString tldDataFileDownloadLink = QLatin1String("http://mxr.mozilla.org/mozilla-central/source/netwerk/dns/effective_tld_names.dat?raw=1");
261 QMessageBox::information(nullptr, tr("File not found!"),
262 tr("File \'effective_tld_names.dat\' was not found!\n"
263 "You can download it from \'<a href=\"%1\"><b>here</b></a>\' to one of the following paths:\n%2")
264 .arg(tldDataFileDownloadLink, m_dataSearchPaths.join(QStringLiteral("\n"))));
265
266 return;
267 }
268
269 m_dataFileName = dataFileName;
270
271 if (!parseData(dataFileName)) {
272 qWarning() << "TLDExtractor: There is some parse errors for file:" << dataFileName;
273 }
274}
275
276bool TLDExtractor::parseData(const QString &dataFile, bool loadPrivateDomains)
277{
278 m_tldHash.clear();
279
280 QFile file(dataFile);
281
282 if (!file.open(QFile::ReadOnly | QFile::Text)) {
283 return false;
284 }
285
286 bool seekToEndOfPrivateDomains = false;
287
288 while (!file.atEnd()) {
289 QString line = QString::fromUtf8(file.readLine().constData()).simplified();
290
291 if (line.isEmpty()) {
292 continue;
293 }
294
295 if (line.startsWith(QLatin1Char('.'))) {
296 line.remove(0, 1);
297 }
298
299
300 if (line.startsWith(QLatin1String("//"))) {
301 if (line.contains(QLatin1String("===END PRIVATE DOMAINS==="))) {
302 seekToEndOfPrivateDomains = false;
303 }
304
305 if (!loadPrivateDomains && line.contains(QLatin1String("===BEGIN PRIVATE DOMAINS==="))) {
306 if (m_tldHash.isEmpty()) {
307 seekToEndOfPrivateDomains = true;
308 }
309 else {
310 break;
311 }
312 }
313
314 continue;
315 }
316
317 if (seekToEndOfPrivateDomains) {
318 continue;
319 }
320
321 // Each line is only read up to the first whitespace
322 line = line.left(line.indexOf(QLatin1Char(' ')));
323
324 if (!line.contains(QLatin1Char('.'))) {
325 m_tldHash.insert(line, line);
326 }
327 else {
328 QString key = line.mid(line.lastIndexOf(QLatin1Char('.')) + 1);
329
330 m_tldHash.insert(key, line);
331 }
332 }
333
334 return isDataLoaded();
335}
336
337QString TLDExtractor::normalizedHost(const QString &host) const
338{
339 return host.toLower();
340}
341
342// methods for testing
344{
345 if (!parseData(m_dataFileName, true)) {
346 return false;
347 }
348
349 QString testDataFileName;
350 bool testDataFileExist = false;
351
352 for (const QString &path : std::as_const(m_dataSearchPaths)) {
353 testDataFileName = QFileInfo(path + QLatin1String("/test_psl.txt")).absoluteFilePath();
354
355 if (QFileInfo::exists(testDataFileName)) {
356 testDataFileExist = true;
357 break;
358 }
359 }
360
361 if (!testDataFileExist) {
362 const QString testFileDownloadLink = QLatin1String("http://mxr.mozilla.org/mozilla-central/source/netwerk/test/unit/data/test_psl.txt?raw=1");
363
364 QMessageBox::information(nullptr, tr("File not found!"),
365 tr("File \'test_psl.txt\' was not found!\n"
366 "You can download it from \'<a href=\"%1\"><b>here</b></a>\' to one of the following paths:\n%2")
367 .arg(testFileDownloadLink, m_dataSearchPaths.join(QStringLiteral("\n"))));
368
369 return false;
370 }
371
372 QFile file(testDataFileName);
373
374 if (!file.open(QFile::ReadOnly | QFile::Text)) {
375 return false;
376 }
377
378 QRegExp testRegExp(QStringLiteral("checkPublicSuffix\\(('([^']+)'|null), ('([^']+)'|null)\\);"));
379 bool allTestSuccess = true;
380
381 while (!file.atEnd()) {
382 QString line = QString::fromUtf8(file.readLine().constData()).simplified();
383
384 if (line.startsWith(QLatin1String("//")) || line.isEmpty()) {
385 continue;
386 }
387
388 testRegExp.indexIn(line);
389
390 const QString hostName = testRegExp.cap(2);
391 const QString registrableName = testRegExp.cap(4);
392
393 if (!checkPublicSuffix(hostName, registrableName)) {
394 allTestSuccess = false;
395 }
396 }
397
398 if (allTestSuccess) {
399 qWarning() << "TLDExtractor: Test passed successfully.";
400 }
401 else {
402 qWarning() << "TLDExtractor: Test finished with some errors!";
403 }
404
405 // reset cache for normal use
406 m_tldHash.clear();
407
408 return allTestSuccess;
409}
410
411bool TLDExtractor::checkPublicSuffix(const QString &hostName, const QString &registrableName)
412{
413 if (registrableDomain(hostName) != registrableName) {
414 qWarning() << "TLDExtractor Test Error: hostName:" << hostName
415 << "Correct registrableName:" << registrableName
416 << "Wrong registrableName:" << registrableDomain(hostName);
417
418 return false;
419 }
420
421 return true;
422}
QString subdomain(const QString &host)
static TLDExtractor * instance()
HostParts splitParts(const QString &host)
QStringList dataSearchPaths() const
QString registrableDomain(const QString &host)
bool isDataLoaded()
QString TLD(const QString &host)
void setDataSearchPaths(const QStringList &searchPaths=TLDExtractor::defaultDataSearchPaths())
QString domain(const QString &host)