
Commit da74f6b

fix: improved robots.txt exclude matching
1 parent 8934952 commit da74f6b

4 files changed, +107 -46 lines

packages/core/src/discovery/robotsTxt.ts (+68 -45)

@@ -10,20 +10,6 @@ export interface RobotsTxtParsed {
   groups: RobotsGroupResolved[]
 }
 
-function isValidRegex(s: string | RegExp) {
-  if (typeof s === 'string') {
-    // make sure it's valid regex
-    try {
-      // eslint-disable-next-line no-new
-      new RegExp(s)
-      return true
-    }
-    catch (e) {
-      return false
-    }
-  }
-  return true
-}
 /**
  * Fetches the robots.txt file.
  * @param site
@@ -46,40 +32,77 @@ export async function fetchRobotsTxt(site: string): Promise<false | string> {
   return robotsTxt.response.data as string
 }
 
-export function mergeRobotsTxtConfig(config: ResolvedUserConfig, { groups, sitemaps }: RobotsTxtParsed): void {
-  const normalisedGroups = groups
-    .filter(group => group.userAgent.includes('*'))
-    .map((group) => {
-      for (const k of ['disallow', 'allow']) {
-        // @ts-expect-error untyped
-        group[k] = (group[k] as string[])
-          // skip any disallows that are root level
-          .filter(path => path !== '/' && path)
-          .map((path) => {
-            // convert robots.txt paths to regex paths
-            if (path.includes('*'))
-              path = path.replace(/\*/g, '.*')
-            else
-              path = `${path}.*`
-            return path
-          })
+interface RobotsTxtRule { pattern: string, allow: boolean }
+
+function matches(pattern: string, path: string): boolean {
+  const pathLength = path.length
+  const patternLength = pattern.length
+  const matchingLengths: number[] = Array.from({ length: pathLength + 1 }).fill(0)
+  let numMatchingLengths = 1
+
+  let p = 0
+  while (p < patternLength) {
+    if (pattern[p] === '$' && p + 1 === patternLength) {
+      return matchingLengths[numMatchingLengths - 1] === pathLength
+    }
+
+    if (pattern[p] === '*') {
+      numMatchingLengths = pathLength - matchingLengths[0] + 1
+      for (let i = 1; i < numMatchingLengths; i++) {
+        matchingLengths[i] = matchingLengths[i - 1] + 1
+      }
+    }
+    else {
+      let numMatches = 0
+      for (let i = 0; i < numMatchingLengths; i++) {
+        const matchLength = matchingLengths[i]
+        if (matchLength < pathLength && path[matchLength] === pattern[p]) {
+          matchingLengths[numMatches++] = matchLength + 1
+        }
+      }
+      if (numMatches === 0) {
+        return false
       }
-      return group
-    })
+      numMatchingLengths = numMatches
+    }
+    p++
+  }
+
+  return true
+}
+export function matchPathToRule(path: string, _rules: RobotsTxtRule[]): RobotsTxtRule | null {
+  let matchedRule: RobotsTxtRule | null = null
+
+  const rules = _rules.filter(Boolean) // filter out empty line such as Disallow:
+  const rulesLength = rules.length
+  let i = 0
+  while (i < rulesLength) {
+    const rule = rules[i]
+    if (!matches(rule.pattern, path)) {
+      i++
+      continue
+    }
 
-  // for diallow we add it to the exclude list
-  config.scanner.exclude = [...new Set([
-    ...(config.scanner.exclude || []),
-    ...normalisedGroups.flatMap(group => group.disallow),
-  ])].filter(isValidRegex)
-  config.scanner.include = config.scanner.include || []
-  const robotsAllows = normalisedGroups.flatMap(group => group.allow).filter(a => a.length)
-  if (!config.scanner.include.length && robotsAllows.length) {
-    config.scanner.include = [...new Set([
-      '/*',
-      ...normalisedGroups.flatMap(group => group.allow),
-    ])].filter(isValidRegex)
+    if (!matchedRule || rule.pattern.length > matchedRule.pattern.length) {
+      matchedRule = rule
+    }
+    else if (
+      rule.pattern.length === matchedRule.pattern.length
+      && rule.allow
+      && !matchedRule.allow
+    ) {
+      matchedRule = rule
+    }
+    i++
   }
+
+  return matchedRule
+}
+
+export function mergeRobotsTxtConfig(config: ResolvedUserConfig, { groups, sitemaps }: RobotsTxtParsed): void {
+  config.scanner._robotsTxtRules = groups.filter((group) => {
+    return group.userAgent.includes('*') || group.userAgent.includes(String(config.lighthouseOptions?.emulatedUserAgent))
+  }).map(group => group._rules)
   if (config.scanner.sitemap !== false && sitemaps.length) {
     // allow overriding the robots.txt sitemaps with your own
     if (!Array.isArray(config.scanner.sitemap) || !config.scanner.sitemap.length)
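The new matcher follows standard robots.txt wildcard semantics: a rule pattern matches as a prefix of the path, "*" matches any sequence of characters, and a trailing "$" anchors the pattern to the end of the path. matchPathToRule then applies the usual precedence: the longest matching pattern wins, and on a tie an allow rule beats a disallow rule. A minimal usage sketch (the rule set below is illustrative, not taken from the commit):

const rules = [
  { pattern: '/admin', allow: false },        // Disallow: /admin
  { pattern: '/admin/login*', allow: true },  // Allow: /admin/login*
  { pattern: '/*.pdf$', allow: false },       // Disallow: /*.pdf$
]

matchPathToRule('/admin/settings', rules)    // { pattern: '/admin', allow: false } -> route skipped
matchPathToRule('/admin/login/reset', rules) // { pattern: '/admin/login*', allow: true } -> longer match wins
matchPathToRule('/files/report.pdf', rules)  // { pattern: '/*.pdf$', allow: false } -> '$' anchors at the path end
matchPathToRule('/blog/post', rules)         // null -> no rule applies, route is scanned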

packages/core/src/puppeteer/worker.ts (+10)

@@ -12,6 +12,7 @@ import fs from 'node:fs'
 import { join } from 'node:path'
 import chalk from 'chalk'
 import { get, sortBy, uniqBy } from 'lodash-es'
+import { matchPathToRule } from '../discovery'
 import { useLogger } from '../logger'
 import { useUnlighthouse } from '../unlighthouse'
 import { createTaskReportFromRoute, formatBytes, ReportArtifacts } from '../util'
@@ -94,6 +95,15 @@ export async function createUnlighthouseWorker(tasks: Record<UnlighthouseTask, T
     if (ignoredRoutes.has(id))
       return
 
+    // do robots.txt test
+    if (resolvedConfig.scanner.robotsTxt) {
+      const rule = matchPathToRule(path, resolvedConfig.scanner._robotsTxtRules)
+      if (rule && !rule.allow) {
+        logger.info(`Skipping route based on robots.txt rule \`${rule.pattern}\``, { path })
+        return
+      }
+    }
+
     if (resolvedConfig.scanner.include || resolvedConfig.scanner.exclude) {
       const filter = createFilter(resolvedConfig.scanner)
       if (!filter(path)) {
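The worker now consults the parsed rules directly before queueing a route, instead of relying on robots.txt paths that were previously rewritten into scanner.exclude regexes. The check is gated on scanner.robotsTxt, so disallowed routes can still be scanned by turning that flag off. A sketch of the opt-out, assuming the usual unlighthouse.config.ts config file:

// unlighthouse.config.ts
export default {
  scanner: {
    // skip the robots.txt check shown above; routes disallowed by robots.txt will still be scanned
    robotsTxt: false,
  },
}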

packages/core/src/types.ts (+5)

@@ -465,6 +465,11 @@ export interface ResolvedUserConfig {
    * @default 'mobile'
    */
   device: 'mobile' | 'desktop' | false
+  /**
+   * Resolved robots.txt groups.
+   * @internal
+   */
+  _robotsTxtRules?: any
 }
 /**
  * Changes the default behaviour of lighthouse.

packages/core/src/util/robotsTxtParser.ts (+24 -1)

@@ -4,6 +4,9 @@ export interface RobotsGroupResolved {
   allow: string[]
   userAgent: string[]
   host?: string
+  // runtime optimization
+  _indexable: boolean
+  _rules: { pattern: string, allow: boolean }[]
 }
 
 /**
@@ -78,7 +81,27 @@ export function parseRobotsTxt(s: string) {
     ...currentGroup,
   })
   return {
-    groups,
+    groups: groups.map(normalizeGroup),
     sitemaps,
   }
 }
+
+function asArray(v: any) {
+  return typeof v === 'undefined' ? [] : (Array.isArray(v) ? v : [v])
+}
+
+function normalizeGroup(group: RobotsGroupResolved): RobotsGroupResolved {
+  const disallow = asArray(group.disallow) // we can have empty disallow
+  const allow = asArray(group.allow).filter(rule => Boolean(rule))
+  return <RobotsGroupResolved> {
+    ...group,
+    userAgent: group.userAgent ? asArray(group.userAgent) : ['*'],
+    disallow,
+    allow,
+    _indexable: !disallow.includes((rule: string) => rule === '/'),
+    _rules: [
+      ...disallow.filter(Boolean).map(r => ({ pattern: r, allow: false })),
+      ...allow.map(r => ({ pattern: r, allow: true })),
+    ],
+  }
+}
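normalizeGroup flattens each group's directives into the _rules array consumed by matchPathToRule, filtering empty patterns out of the rules and defaulting a missing user agent to '*'. A rough input/output sketch for this internal helper (the group below is illustrative, not taken from the commit):

normalizeGroup({
  userAgent: ['*'],
  disallow: ['/admin', ''], // the empty entry models a bare `Disallow:` line
  allow: ['/admin/login'],
} as RobotsGroupResolved)

// returns roughly:
// {
//   userAgent: ['*'],
//   disallow: ['/admin', ''],  // kept as parsed
//   allow: ['/admin/login'],
//   _indexable: true,          // this group does not block the whole site
//   _rules: [
//     { pattern: '/admin', allow: false },  // the empty disallow pattern is dropped from _rules
//     { pattern: '/admin/login', allow: true },
//   ],
// }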
