@@ -10,20 +10,6 @@ export interface RobotsTxtParsed {
   groups: RobotsGroupResolved[]
 }
 
-function isValidRegex(s: string | RegExp) {
-  if (typeof s === 'string') {
-    // make sure it's valid regex
-    try {
-      // eslint-disable-next-line no-new
-      new RegExp(s)
-      return true
-    }
-    catch (e) {
-      return false
-    }
-  }
-  return true
-}
 /**
  * Fetches the robots.txt file.
  * @param site
@@ -46,40 +32,77 @@ export async function fetchRobotsTxt(site: string): Promise<false | string> {
   return robotsTxt.response.data as string
 }
 
-export function mergeRobotsTxtConfig(config: ResolvedUserConfig, { groups, sitemaps }: RobotsTxtParsed): void {
-  const normalisedGroups = groups
-    .filter(group => group.userAgent.includes('*'))
-    .map((group) => {
-      for (const k of ['disallow', 'allow']) {
-        // @ts-expect-error untyped
-        group[k] = (group[k] as string[])
-          // skip any disallows that are root level
-          .filter(path => path !== '/' && path)
-          .map((path) => {
-            // convert robots.txt paths to regex paths
-            if (path.includes('*'))
-              path = path.replace(/\*/g, '.*')
-            else
-              path = `${path}.*`
-            return path
-          })
+interface RobotsTxtRule { pattern: string, allow: boolean }
+
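+// Matches a robots.txt pattern against a path, supporting the '*' wildcard
+// and the '$' end-of-path anchor. matchingLengths tracks every length of
+// `path` prefix the pattern has matched so far, so the scan needs no
+// backtracking.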
+function matches(pattern: string, path: string): boolean {
+  const pathLength = path.length
+  const patternLength = pattern.length
+  const matchingLengths: number[] = Array.from({ length: pathLength + 1 }).fill(0)
+  let numMatchingLengths = 1
+
+  let p = 0
+  while (p < patternLength) {
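+    // a trailing '$' matches only if the pattern has consumed the whole path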
+    if (pattern[p] === '$' && p + 1 === patternLength) {
+      return matchingLengths[numMatchingLengths - 1] === pathLength
+    }
+
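+    // '*' lets every matched prefix extend to any longer prefix of the path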
+    if (pattern[p] === '*') {
+      numMatchingLengths = pathLength - matchingLengths[0] + 1
+      for (let i = 1; i < numMatchingLengths; i++) {
+        matchingLengths[i] = matchingLengths[i - 1] + 1
+      }
+    }
+    else {
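+      // literal character: keep only the prefixes whose next path character matches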
+      let numMatches = 0
+      for (let i = 0; i < numMatchingLengths; i++) {
+        const matchLength = matchingLengths[i]
+        if (matchLength < pathLength && path[matchLength] === pattern[p]) {
+          matchingLengths[numMatches++] = matchLength + 1
+        }
+      }
+      if (numMatches === 0) {
+        return false
       }
-      return group
-    })
+      numMatchingLengths = numMatches
+    }
+    p++
+  }
+
+  return true
+}
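+// Returns the rule that applies to `path`, or null if none match. Per the
+// robots.txt spec, the longest (most specific) matching pattern wins, and on
+// a tie an Allow rule takes precedence over a Disallow rule, e.g. for
+// '/admin/login', `/admin/login` (allow) beats `/admin` (disallow).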
+export function matchPathToRule(path: string, _rules: RobotsTxtRule[]): RobotsTxtRule | null {
+  let matchedRule: RobotsTxtRule | null = null
+
+  const rules = _rules.filter(Boolean) // filter out empty rules such as a bare `Disallow:`
+  const rulesLength = rules.length
+  let i = 0
+  while (i < rulesLength) {
+    const rule = rules[i]
+    if (!matches(rule.pattern, path)) {
+      i++
+      continue
+    }
 
-  // for diallow we add it to the exclude list
-  config.scanner.exclude = [...new Set([
-    ...(config.scanner.exclude || []),
-    ...normalisedGroups.flatMap(group => group.disallow),
-  ])].filter(isValidRegex)
-  config.scanner.include = config.scanner.include || []
-  const robotsAllows = normalisedGroups.flatMap(group => group.allow).filter(a => a.length)
-  if (!config.scanner.include.length && robotsAllows.length) {
-    config.scanner.include = [...new Set([
-      '/*',
-      ...normalisedGroups.flatMap(group => group.allow),
-    ])].filter(isValidRegex)
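+    // the longer pattern is the more specific match, so it wins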
+    if (!matchedRule || rule.pattern.length > matchedRule.pattern.length) {
+      matchedRule = rule
+    }
+    else if (
+      rule.pattern.length === matchedRule.pattern.length
+      && rule.allow
+      && !matchedRule.allow
+    ) {
+      matchedRule = rule
+    }
+    i++
   }
+
+  return matchedRule
+}
+
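+// Stores the raw rules of groups targeting the wildcard agent ('*') or the
+// configured Lighthouse emulated user agent, so paths can later be checked
+// with matchPathToRule() instead of being pre-converted to scanner regexes.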
+export function mergeRobotsTxtConfig(config: ResolvedUserConfig, { groups, sitemaps }: RobotsTxtParsed): void {
+  config.scanner._robotsTxtRules = groups.filter((group) => {
+    return group.userAgent.includes('*') || group.userAgent.includes(String(config.lighthouseOptions?.emulatedUserAgent))
+  }).map(group => group._rules)
   if (config.scanner.sitemap !== false && sitemaps.length) {
     // allow overriding the robots.txt sitemaps with your own
     if (!Array.isArray(config.scanner.sitemap) || !config.scanner.sitemap.length)