Skip to content

Commit 24e134d

Browse files
committed
fix: better checks for implicit html routes
Fixes #231 Fixes #225
1 parent da74f6b commit 24e134d

File tree

4 files changed

+44
-18
lines changed

4 files changed

+44
-18
lines changed

packages/core/src/puppeteer/tasks/html.ts

+3-14
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,8 @@ import { withoutTrailingSlash } from 'ufo'
88
import { useLogger } from '../../logger'
99
import { normaliseRoute } from '../../router'
1010
import { useUnlighthouse } from '../../unlighthouse'
11-
import { fetchUrlRaw, ReportArtifacts, trimSlashes } from '../../util'
11+
import { fetchUrlRaw, ReportArtifacts } from '../../util'
12+
import { isImplicitOrExplicitHtml } from '../../util/filter'
1213
import { setupPage } from '../util'
1314

1415
export const extractHtmlPayload: (page: Page, route: string) => Promise<{ success: boolean, redirected?: false | string, message?: string, payload?: string }> = async (page, route) => {
@@ -150,21 +151,9 @@ export const inspectHtmlTask: PuppeteerTask = async (props) => {
150151
$('a').each(function () {
151152
const href = $(this).attr('href')
152153
// href must be provided and not be javascript
153-
if (!href || href.includes('javascript:') || href.includes('mailto:') || href === '#')
154+
if (!href || href.includes('javascript:') || href.includes('mailto:') || href === '#' || !isImplicitOrExplicitHtml(href))
154155
return
155156

156-
// if the URL doesn't end with a slash we may be dealing with a file
157-
if (!href.endsWith('/')) {
158-
// need to check for a dot, meaning a file
159-
const parts = href.split('.')
160-
// 1 part means there is no extension, or no dot in the url
161-
if (parts.length > 1) {
162-
// presumably the last part will be the extension
163-
const extension = trimSlashes(parts[parts.length - 1]).replace('.', '')
164-
if (extension !== 'html')
165-
return
166-
}
167-
}
168157
if ((href.startsWith('/') && !href.startsWith('//')) || href.includes(resolvedConfig.site))
169158
internalLinks.push(href)
170159
else

packages/core/src/puppeteer/worker.ts

+2-4
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,7 @@ import { matchPathToRule } from '../discovery'
1616
import { useLogger } from '../logger'
1717
import { useUnlighthouse } from '../unlighthouse'
1818
import { createTaskReportFromRoute, formatBytes, ReportArtifacts } from '../util'
19-
import { createFilter } from '../util/filter'
19+
import { createFilter, isImplicitOrExplicitHtml } from '../util/filter'
2020
import {
2121
launchPuppeteerCluster,
2222
} from './cluster'
@@ -116,9 +116,7 @@ export async function createUnlighthouseWorker(tasks: Record<UnlighthouseTask, T
116116
}
117117
}
118118

119-
const lastPathSegment = path.split('/').pop() || path
120-
const extension = (lastPathSegment.includes('.') ? lastPathSegment.split('.').pop() : 'html') || 'html'
121-
if (!extension.includes('html')) {
119+
if (isImplicitOrExplicitHtml(path)) {
122120
logger.debug('Skipping non-HTML file from scanning', { path })
123121
return
124122
}

packages/core/src/util/filter.ts

+22
Original file line numberDiff line numberDiff line change
@@ -38,3 +38,25 @@ export function createFilter(options: CreateFilterOptions = {}): (path: string)
3838
return include.length === 0
3939
}
4040
}
41+
42+
/**
 * File extensions (lower-case, including the leading dot) that are treated as
 * serving an HTML document.
 */
const HTML_EXPLICIT_EXTENSIONS: readonly string[] = [
  // html
  '.html',
  '.htm',
  // php
  '.php',
  // asp
  '.asp',
  '.aspx',
]
/** Matches a trailing `.ext` made only of alphanumerics, e.g. `.pdf` in `file.pdf`. */
const FILE_MATCH_REGEX = /\.[0-9a-z]+$/i
/** Matches a leading `scheme://host` or protocol-relative `//host` prefix. */
const URL_ORIGIN_REGEX = /^(?:[a-z][a-z0-9+.-]*:)?\/\/[^/?#]*/i
/** Matches the query string and/or fragment, from the first `?` or `#` to the end. */
const QUERY_OR_HASH_REGEX = /[?#][\s\S]*$/

/**
 * Decides whether a path or link looks like it points at an HTML document.
 *
 * A path is considered HTML when it is empty, ends with a slash (implicit index
 * document), has no file extension on its last segment (implicit), or has one of
 * the extensions in `HTML_EXPLICIT_EXTENSIONS` (explicit).
 *
 * Only the path portion is inspected: a leading origin (`https://host`, `//host`)
 * and any query string or fragment are ignored, so the host's TLD or a query value
 * such as `?file=a.pdf` is never mistaken for a file extension. Extension matching
 * is case-insensitive.
 *
 * @param path - A route path or href, relative or absolute.
 * @returns `true` when the path is implicitly or explicitly HTML, `false` when it
 * ends in any other file extension.
 */
export function isImplicitOrExplicitHtml(path: string): boolean {
  const pathname = path
    .replace(URL_ORIGIN_REGEX, '')
    .replace(QUERY_OR_HASH_REGEX, '')
  // no path at all, or a directory-style path: assume an index HTML document
  if (pathname === '' || pathname.endsWith('/'))
    return true // implicit
  const lastPathSegment = pathname.slice(pathname.lastIndexOf('/') + 1)
  const match = FILE_MATCH_REGEX.exec(lastPathSegment)
  // no extension on the last segment means an implicit HTML route
  if (match === null)
    return true
  return HTML_EXPLICIT_EXTENSIONS.includes(match[0].toLowerCase())
}

packages/core/test/filters.test.ts

+17
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,17 @@
1+
import { describe, expect, it } from 'vitest'
2+
import { isImplicitOrExplicitHtml } from '../src/util/filter'
3+
4+
describe('filters', () => {
  it('misc file paths', () => {
    // inputs expected to be classified as HTML (implicit or explicit)
    const htmlPaths = [
      '',
      '/',
      '/some.foo/test',
      '/some/file.pdf/',
      '/dist/assets/chunk[213.4.931294]',
    ]
    for (const candidate of htmlPaths)
      expect(isImplicitOrExplicitHtml(candidate)).toBe(true)

    // file paths: inputs expected to be classified as non-HTML
    const filePaths = [
      '/foo/bar.fr9f9',
      '/some/file.pdf',
      '/dist/assets/chunk[213.4.931294].css',
    ]
    for (const candidate of filePaths)
      expect(isImplicitOrExplicitHtml(candidate)).toBe(false)
  })
})

0 commit comments

Comments
 (0)