Commit b658989: squashed commit

- add FetchResolveCache
- pipe in headers arg
- provide full context in Link.comes_from
- pull in etag and date and cache the outputs
- remove cache_link_parsing
- introduce persistent cache for link parsing
- cache link evaluation when possible (further speedup, somehow)
- handle --no-cache-dir
cosmicexplorer committed Sep 3, 2023
1 parent 04d4030 commit b658989
Showing 7 changed files with 497 additions and 113 deletions.
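Taken together, the changes above add an HTTP-revalidation-style persistent cache to index fetching: a headers argument is threaded through the fetch path, and each fetched page's ETag and Date response headers are stored so a later fetch can revalidate instead of re-downloading and re-parsing. The code that consumes the stored values is not part of the excerpt below, so the following is only a hedged sketch of how conditional headers might be built from them; the helper name and the exact header policy are assumptions, not pip's code.

```python
from typing import Dict, Optional


def build_revalidation_headers(
    etag: Optional[str], date: Optional[str]
) -> Dict[str, str]:
    """Hypothetical helper: turn a cached ETag/Date pair into conditional
    request headers so the index can answer 304 Not Modified instead of
    resending the full page."""
    headers: Dict[str, str] = {}
    if etag is not None:
        # The server skips the body if the entity tag still matches.
        headers["If-None-Match"] = etag
    if date is not None:
        # Fall back to the Date of the previously cached response.
        headers["If-Modified-Since"] = date
    return headers


# Values as they might have been stored from a prior fetch of an index page.
print(build_revalidation_headers('"5f3a-60e1"', "Sun, 03 Sep 2023 00:00:00 GMT"))
```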
80 changes: 58 additions & 22 deletions src/pip/_internal/cache.py
@@ -7,7 +7,7 @@
import logging
import os
from pathlib import Path
from typing import Any, Dict, List, Optional
from typing import Any, Dict, List, Optional, Type

from pip._vendor.packaging.tags import Tag, interpreter_name, interpreter_version
from pip._vendor.packaging.utils import canonicalize_name
@@ -41,7 +41,9 @@ def __init__(self, cache_dir: str) -> None:
assert not cache_dir or os.path.isabs(cache_dir)
self.cache_dir = cache_dir or None

def _get_cache_path_parts(self, link: Link) -> List[str]:
def _get_cache_path_parts(
self, link: Link, *, interpreter_dependent: bool
) -> List[str]:
"""Get parts of part that must be os.path.joined with cache_dir"""

# We want to generate an url to use as our cache key, we don't want to
@@ -53,13 +55,14 @@ def _get_cache_path_parts(self, link: Link) -> List[str]:
if link.subdirectory_fragment:
key_parts["subdirectory"] = link.subdirectory_fragment

# Include interpreter name, major and minor version in cache key
# to cope with ill-behaved sdists that build a different wheel
# depending on the python version their setup.py is being run on,
# and don't encode the difference in compatibility tags.
# https://github.com/pypa/pip/issues/7296
key_parts["interpreter_name"] = interpreter_name()
key_parts["interpreter_version"] = interpreter_version()
if interpreter_dependent:
# Include interpreter name, major and minor version in cache key
# to cope with ill-behaved sdists that build a different wheel
# depending on the python version their setup.py is being run on,
# and don't encode the difference in compatibility tags.
# https://github.com/pypa/pip/issues/7296
key_parts["interpreter_name"] = interpreter_name()
key_parts["interpreter_version"] = interpreter_version()

# Encode our key url with sha224, we'll use this because it has similar
# security properties to sha256, but with a shorter total output (and
@@ -87,26 +90,47 @@ class LinkMetadataCache(Cache):
"""Persistently store the metadata of dists found at each link."""

def get_path_for_link(self, link: Link) -> str:
parts = self._get_cache_path_parts(link)
parts = self._get_cache_path_parts(link, interpreter_dependent=True)
assert self.cache_dir
return os.path.join(self.cache_dir, "link-metadata", *parts)


class WheelCacheBase(Cache):
"""Specializations to the cache concept for wheels."""
class SerializableEntry(abc.ABC):
@classmethod
@abc.abstractmethod
def suffix(cls) -> str:
...

@abc.abstractmethod
def get(
self,
link: Link,
package_name: Optional[str],
supported_tags: List[Tag],
) -> Link:
"""Returns a link to a cached item if it exists, otherwise returns the
passed link.
"""
def serialize(self) -> Dict[str, Any]:
...


class FetchResolveCache(Cache):
def get_path_for_link(self, link: Link) -> str:
# We are reading index links to extract other links from, not executing any
# python code, so these caches are interpreter-independent.
parts = self._get_cache_path_parts(link, interpreter_dependent=False)
assert self.cache_dir
return os.path.join(self.cache_dir, "fetch-resolve", *parts)

def hashed_entry_path(self, link: Link, entry: SerializableEntry) -> Path:
hashed = _hash_dict(entry.serialize())
return self.cache_path(link) / f"{hashed}{entry.suffix()}"

def clear_hashed_entries(
self, link: Link, entry_type: Type[SerializableEntry]
) -> None:
for hashed_entry in self.cache_path(link).glob(f"*{entry_type.suffix()}"):
logger.debug(
"unlinking invalidated hashed link eval cache entry %s", hashed_entry
)
hashed_entry.unlink()


class WheelCacheBase(Cache):
"""Specializations to the cache concept for wheels."""

def _get_candidates(self, link: Link, canonical_package_name: str) -> List[Any]:
can_not_cache = not self.cache_dir or not canonical_package_name or not link
if can_not_cache:
@@ -119,6 +143,18 @@ def _get_candidates(self, link: Link, canonical_package_name: str) -> List[Any]:
candidates.append((candidate, path))
return candidates

@abc.abstractmethod
def get(
self,
link: Link,
package_name: Optional[str],
supported_tags: List[Tag],
) -> Link:
"""Returns a link to a cached item if it exists, otherwise returns the
passed link.
"""
...


class SimpleWheelCache(WheelCacheBase):
"""A cache of wheels for future installs."""
@@ -141,7 +177,7 @@ def get_path_for_link(self, link: Link) -> str:
:param link: The link of the sdist for which this will cache wheels.
"""
parts = self._get_cache_path_parts(link)
parts = self._get_cache_path_parts(link, interpreter_dependent=True)
assert self.cache_dir
# Store wheels within the root cache_dir
return os.path.join(self.cache_dir, "wheels", *parts)
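For context on the new FetchResolveCache above: each per-link directory holds entries whose filenames come from hashing a SerializableEntry's serialize() output and appending its type-specific suffix(), which is what lets clear_hashed_entries glob away every stale entry of one type. The sketch below is illustrative only; the entry class, its fields, and the use of sha256 over JSON stand in for pip's actual _hash_dict, which may differ.

```python
import hashlib
import json
from typing import Any, Dict, List


class HypotheticalLinkEvalEntry:
    """Illustrative SerializableEntry-style record keyed by the inputs that
    influence link evaluation (project name plus supported tags)."""

    @classmethod
    def suffix(cls) -> str:
        # A distinct suffix per entry type makes glob-based invalidation easy.
        return ".link-eval"

    def __init__(self, project_name: str, supported_tags: List[str]) -> None:
        self.project_name = project_name
        self.supported_tags = supported_tags

    def serialize(self) -> Dict[str, Any]:
        return {
            "project_name": self.project_name,
            "supported_tags": self.supported_tags,
        }


def hashed_entry_name(entry: HypotheticalLinkEvalEntry) -> str:
    """Mimic the hash-then-suffix naming used by hashed_entry_path."""
    digest = hashlib.sha256(
        json.dumps(entry.serialize(), sort_keys=True).encode("utf-8")
    ).hexdigest()
    return f"{digest}{entry.suffix()}"


entry = HypotheticalLinkEvalEntry("requests", ["cp311-cp311-manylinux_2_17_x86_64"])
print(hashed_entry_name(entry))  # something like "<sha256 hex>.link-eval"
```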
7 changes: 6 additions & 1 deletion src/pip/_internal/cli/req_command.py
@@ -12,7 +12,7 @@
from optparse import Values
from typing import TYPE_CHECKING, Any, List, Optional, Tuple

from pip._internal.cache import LinkMetadataCache, WheelCache
from pip._internal.cache import FetchResolveCache, LinkMetadataCache, WheelCache
from pip._internal.cli import cmdoptions
from pip._internal.cli.base_command import Command
from pip._internal.cli.command_context import CommandContextMixIn
@@ -506,8 +506,13 @@ def _build_package_finder(
ignore_requires_python=ignore_requires_python,
)

if options.cache_dir:
fetch_resolve_cache = FetchResolveCache(options.cache_dir)
else:
fetch_resolve_cache = None
return PackageFinder.create(
link_collector=link_collector,
selection_prefs=selection_prefs,
target_python=target_python,
fetch_resolve_cache=fetch_resolve_cache,
)
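The wiring above also covers --no-cache-dir: that flag leaves options.cache_dir falsy, so no FetchResolveCache is constructed and the finder simply receives fetch_resolve_cache=None. Below is a standalone restatement of that guard, using a stand-in class since the real one lives in pip._internal.cache.

```python
from typing import Optional


class StandInFetchResolveCache:
    """Placeholder for pip's FetchResolveCache in this sketch."""

    def __init__(self, cache_dir: str) -> None:
        self.cache_dir = cache_dir


def make_fetch_resolve_cache(cache_dir: str) -> Optional[StandInFetchResolveCache]:
    # --no-cache-dir is represented downstream as an empty/false cache_dir,
    # in which case no persistent fetch/resolve cache is created at all.
    if cache_dir:
        return StandInFetchResolveCache(cache_dir)
    return None


assert make_fetch_resolve_cache("") is None
assert make_fetch_resolve_cache("/home/user/.cache/pip") is not None
```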
100 changes: 35 additions & 65 deletions src/pip/_internal/index/collector.py
@@ -4,7 +4,6 @@

import collections
import email.message
import functools
import itertools
import json
import logging
@@ -96,7 +95,9 @@ class _NotHTTP(Exception):
pass


def _ensure_api_response(url: str, session: PipSession) -> None:
def _ensure_api_response(
url: str, session: PipSession, headers: Optional[Dict[str, str]] = None
) -> None:
"""
Send a HEAD request to the URL, and ensure the response contains a simple
API Response.
@@ -108,13 +109,15 @@ def _ensure_api_response(url: str, session: PipSession) -> None:
if scheme not in {"http", "https"}:
raise _NotHTTP()

resp = session.head(url, allow_redirects=True)
resp = session.head(url, allow_redirects=True, headers=headers)
raise_for_status(resp)

_ensure_api_header(resp)


def _get_simple_response(url: str, session: PipSession) -> Response:
def _get_simple_response(
url: str, session: PipSession, headers: Optional[Dict[str, str]] = None
) -> Response:
"""Access an Simple API response with GET, and return the response.
This consists of three parts:
@@ -128,10 +131,13 @@ def _get_simple_response(url: str, session: PipSession) -> Response:
and raise `_NotAPIContent` otherwise.
"""
if is_archive_file(Link(url).filename):
_ensure_api_response(url, session=session)
_ensure_api_response(url, session=session, headers=headers)

logger.debug("Getting page %s", redact_auth_from_url(url))

logger.debug("headers: %s", str(headers))
if headers is None:
headers = {}
resp = session.get(
url,
headers={
Expand All @@ -156,6 +162,7 @@ def _get_simple_response(url: str, session: PipSession) -> Response:
# once per 10 minutes.
# For more information, please see pypa/pip#5670.
"Cache-Control": "max-age=0",
**headers,
},
)
raise_for_status(resp)
@@ -188,44 +195,6 @@ def _get_encoding_from_headers(headers: ResponseHeaders) -> Optional[str]:
return None


class CacheablePageContent:
def __init__(self, page: "IndexContent") -> None:
assert page.cache_link_parsing
self.page = page

def __eq__(self, other: object) -> bool:
return isinstance(other, type(self)) and self.page.url == other.page.url

def __hash__(self) -> int:
return hash(self.page.url)


class ParseLinks(Protocol):
def __call__(self, page: "IndexContent") -> Iterable[Link]:
...


def with_cached_index_content(fn: ParseLinks) -> ParseLinks:
"""
Given a function that parses an Iterable[Link] from an IndexContent, cache the
function's result (keyed by CacheablePageContent), unless the IndexContent
`page` has `page.cache_link_parsing == False`.
"""

@functools.lru_cache(maxsize=None)
def wrapper(cacheable_page: CacheablePageContent) -> List[Link]:
return list(fn(cacheable_page.page))

@functools.wraps(fn)
def wrapper_wrapper(page: "IndexContent") -> List[Link]:
if page.cache_link_parsing:
return wrapper(CacheablePageContent(page))
return list(fn(page))

return wrapper_wrapper


@with_cached_index_content
def parse_links(page: "IndexContent") -> Iterable[Link]:
"""
Parse a Simple API's Index Content, and yield its anchor elements as Link objects.
@@ -235,7 +204,7 @@ def parse_links(page: "IndexContent") -> Iterable[Link]:
if content_type_l.startswith("application/vnd.pypi.simple.v1+json"):
data = json.loads(page.content)
for file in data.get("files", []):
link = Link.from_json(file, page.url)
link = Link.from_json(file, page.url, page_content=page)
if link is None:
continue
yield link
@@ -248,7 +217,9 @@ def parse_links(page: "IndexContent") -> Iterable[Link]:
url = page.url
base_url = parser.base_url or url
for anchor in parser.anchors:
link = Link.from_element(anchor, page_url=url, base_url=base_url)
link = Link.from_element(
anchor, page_url=url, base_url=base_url, page_content=page
)
if link is None:
continue
yield link
@@ -263,20 +234,19 @@ def __init__(
content_type: str,
encoding: Optional[str],
url: str,
cache_link_parsing: bool = True,
etag: Optional[str] = None,
date: Optional[str] = None,
) -> None:
"""
:param encoding: the encoding to decode the given content.
:param url: the URL from which the HTML was downloaded.
:param cache_link_parsing: whether links parsed from this page's url
should be cached. PyPI index urls should
have this set to False, for example.
"""
self.content = content
self.content_type = content_type
self.encoding = encoding
self.url = url
self.cache_link_parsing = cache_link_parsing
self.etag = etag
self.date = date

def __str__(self) -> str:
return redact_auth_from_url(self.url)
@@ -320,21 +290,22 @@ def _handle_get_simple_fail(
meth("Could not fetch URL %s: %s - skipping", link, reason)


def _make_index_content(
response: Response, cache_link_parsing: bool = True
) -> IndexContent:
def _make_index_content(response: Response) -> IndexContent:
encoding = _get_encoding_from_headers(response.headers)
return IndexContent(
response.content,
response.headers["Content-Type"],
encoding=encoding,
url=response.url,
cache_link_parsing=cache_link_parsing,
etag=response.headers.get("ETag", None),
date=response.headers.get("Date", None),
)


def _get_index_content(link: Link, *, session: PipSession) -> Optional["IndexContent"]:
url = link.url.split("#", 1)[0]
def _get_index_content(
link: Link, *, session: PipSession, headers: Optional[Dict[str, str]] = None
) -> Optional["IndexContent"]:
url = link.url_without_fragment

# Check for VCS schemes that do not support lookup as web pages.
vcs_scheme = _match_vcs_scheme(url)
@@ -361,7 +332,7 @@ def _get_index_content(link: Link, *, session: PipSession) -> Optional["IndexCon
logger.debug(" file: URL is directory, getting %s", url)

try:
resp = _get_simple_response(url, session=session)
resp = _get_simple_response(url, session=session, headers=headers)
except _NotHTTP:
logger.warning(
"Skipping page %s because it looks like an archive, and cannot "
@@ -377,9 +348,7 @@ def _get_index_content(link: Link, *, session: PipSession) -> Optional["IndexCon
exc.request_desc,
exc.content_type,
)
except NetworkConnectionError as exc:
_handle_get_simple_fail(link, exc)
except RetryError as exc:
except (NetworkConnectionError, RetryError) as exc:
_handle_get_simple_fail(link, exc)
except SSLError as exc:
reason = "There was a problem confirming the ssl certificate: "
@@ -390,7 +359,7 @@ def _get_index_content(link: Link, *, session: PipSession) -> Optional["IndexCon
except requests.Timeout:
_handle_get_simple_fail(link, "timed out")
else:
return _make_index_content(resp, cache_link_parsing=link.cache_link_parsing)
return _make_index_content(resp)
return None


@@ -454,11 +423,14 @@ def create(
def find_links(self) -> List[str]:
return self.search_scope.find_links

def fetch_response(self, location: Link) -> Optional[IndexContent]:
def fetch_response(
self, location: Link, headers: Optional[Dict[str, str]] = None
) -> Optional[IndexContent]:
"""
Fetch an HTML page containing package links.
"""
return _get_index_content(location, session=self.session)
logger.debug("headers: %s", str(headers))
return _get_index_content(location, session=self.session, headers=headers)

def collect_sources(
self,
@@ -472,7 +444,6 @@ def collect_sources(
candidates_from_page=candidates_from_page,
page_validator=self.session.is_secure_origin,
expand_dir=False,
cache_link_parsing=False,
)
for loc in self.search_scope.get_index_urls_locations(project_name)
).values()
@@ -482,7 +453,6 @@
candidates_from_page=candidates_from_page,
page_validator=self.session.is_secure_origin,
expand_dir=True,
cache_link_parsing=True,
)
for loc in self.find_links
).values()
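Two details of the collector changes above are worth spelling out: in _get_simple_response the caller-supplied headers are unpacked after the defaults, so a revalidation header can override the blanket Cache-Control: max-age=0, and _make_index_content lifts ETag and Date off the response so they can be persisted for the next fetch. Here is a compact sketch of both behaviours using plain dicts; the trimmed default header set and the record class are illustrative, not pip's exact definitions.

```python
from dataclasses import dataclass
from typing import Dict, Optional

# Trimmed stand-in for the default headers sent by _get_simple_response.
DEFAULT_HEADERS = {
    "Accept": "application/vnd.pypi.simple.v1+json",
    "Cache-Control": "max-age=0",
}


def merged_request_headers(extra: Optional[Dict[str, str]]) -> Dict[str, str]:
    # Later keys win in a dict merge, mirroring the {**defaults, **headers}
    # shape in _get_simple_response: caller headers override the defaults.
    return {**DEFAULT_HEADERS, **(extra or {})}


@dataclass
class SketchIndexContent:
    """Cut-down stand-in for IndexContent carrying the revalidation fields."""

    url: str
    etag: Optional[str]
    date: Optional[str]


def from_response_headers(url: str, headers: Dict[str, str]) -> SketchIndexContent:
    # Same extraction as _make_index_content: absent headers become None.
    return SketchIndexContent(
        url=url, etag=headers.get("ETag"), date=headers.get("Date")
    )


print(merged_request_headers({"Cache-Control": "no-cache", "If-None-Match": '"abc"'}))
print(from_response_headers(
    "https://pypi.org/simple/requests/",
    {"ETag": '"abc"', "Date": "Sun, 03 Sep 2023 00:00:00 GMT"},
))
```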
