make FetchResolveCache
- pipe in headers arg
- provide full context in Link.comes_from
- pull in etag and date and cache the outputs
- handle --no-cache-dir
- add NEWS
- remove quotes from etag and use binary checksum to save a few bytes
- parse http modified date to compress the cached representation
cosmicexplorer committed Aug 13, 2024 · 1 parent b02915a · commit f28ecfd
Showing 7 changed files with 316 additions and 36 deletions.
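
The last two commit-message bullets describe how the cached validators are shrunk before they hit disk: the quotes around the ETag are dropped and the value is stored as a fixed-width binary checksum, and the verbose HTTP ``Date`` string is parsed into a compact timestamp. A minimal sketch of that idea, assuming a sha256 digest for the checksum and a POSIX timestamp for the date (the helper below is illustrative, not the commit's actual code):

import email.utils
import hashlib
from typing import Optional, Tuple

def compress_validators(
    etag: Optional[str], date: Optional[str]
) -> Tuple[Optional[bytes], Optional[float]]:
    """Shrink the ETag and Date headers before caching them."""
    checksum = None
    if etag is not None:
        # ETags arrive quoted (e.g. '"33a64df5"'); strip the quotes and
        # hash to a fixed-width binary digest to save a few bytes per entry.
        checksum = hashlib.sha256(etag.strip('"').encode()).digest()
    timestamp = None
    if date is not None:
        # HTTP dates are verbose RFC 2822 strings; a POSIX timestamp is a
        # far smaller representation of the same instant.
        timestamp = email.utils.parsedate_to_datetime(date).timestamp()
    return checksum, timestamp
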
news/12257.feature.rst (1 addition & 0 deletions)

@@ -0,0 +1 @@
+Store HTTP caching headers in ``~/.cache/pip/fetch-resolve`` to reduce bandwidth usage when ``--use-feature=metadata-cache`` is enabled.

src/pip/_internal/cache.py (22 additions & 10 deletions)

@@ -92,7 +92,9 @@ def __init__(self, cache_dir: str) -> None:
         assert not cache_dir or os.path.isabs(cache_dir)
         self.cache_dir = cache_dir or None

-    def _get_cache_path_parts(self, link: Link) -> List[str]:
+    def _get_cache_path_parts(
+        self, link: Link, *, interpreter_dependent: bool
+    ) -> List[str]:
         """Get parts of part that must be os.path.joined with cache_dir"""

         # We want to generate an url to use as our cache key, we don't want to
@@ -104,13 +106,14 @@ def _get_cache_path_parts(self, link: Link) -> List[str]:
         if link.subdirectory_fragment:
             key_parts["subdirectory"] = link.subdirectory_fragment

-        # Include interpreter name, major and minor version in cache key
-        # to cope with ill-behaved sdists that build a different wheel
-        # depending on the python version their setup.py is being run on,
-        # and don't encode the difference in compatibility tags.
-        # https://github.com/pypa/pip/issues/7296
-        key_parts["interpreter_name"] = interpreter_name()
-        key_parts["interpreter_version"] = interpreter_version()
+        if interpreter_dependent:
+            # Include interpreter name, major and minor version in cache key
+            # to cope with ill-behaved sdists that build a different wheel
+            # depending on the python version their setup.py is being run on,
+            # and don't encode the difference in compatibility tags.
+            # https://github.com/pypa/pip/issues/7296
+            key_parts["interpreter_name"] = interpreter_name()
+            key_parts["interpreter_version"] = interpreter_version()

         # Encode our key url with sha224, we'll use this because it has similar
         # security properties to sha256, but with a shorter total output (and
@@ -138,11 +141,20 @@ class LinkMetadataCache(Cache):
     """Persistently store the metadata of dists found at each link."""

     def get_path_for_link(self, link: Link) -> str:
-        parts = self._get_cache_path_parts(link)
+        parts = self._get_cache_path_parts(link, interpreter_dependent=True)
         assert self.cache_dir
         return os.path.join(self.cache_dir, "link-metadata", *parts)


+class FetchResolveCache(Cache):
+    def get_path_for_link(self, link: Link) -> str:
+        # We are reading index links to extract other links from, not executing any
+        # python code, so these caches are interpreter-independent.
+        parts = self._get_cache_path_parts(link, interpreter_dependent=False)
+        assert self.cache_dir
+        return os.path.join(self.cache_dir, "fetch-resolve", *parts)
+
+
 class WheelCacheBase(Cache):
     """Specializations to the cache concept for wheels."""

@@ -197,7 +209,7 @@ def get_path_for_link(self, link: Link) -> str:
         :param link: The link of the sdist for which this will cache wheels.
         """
-        parts = self._get_cache_path_parts(link)
+        parts = self._get_cache_path_parts(link, interpreter_dependent=True)
         assert self.cache_dir
         # Store wheels within the root cache_dir
         return os.path.join(self.cache_dir, "wheels", *parts)

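For reference, the key derivation both subclasses share: ``_get_cache_path_parts`` serializes the link URL plus the key parts, hashes with sha224, and splits the hex digest into nested directory levels. A rough sketch of that scheme (simplified; the exact serialization pip uses may differ):

import hashlib
import urllib.parse
from typing import Dict, List

def cache_path_parts(url: str, key_parts: Dict[str, str]) -> List[str]:
    # Serialize the URL and the key material into a single cache key.
    key_url = f"{url}#{urllib.parse.urlencode(key_parts)}"
    # sha224 has similar security properties to sha256 but a shorter
    # output, which keeps the on-disk paths a little smaller.
    hashed = hashlib.sha224(key_url.encode()).hexdigest()
    # Nest two hex characters per level so no directory grows too large.
    return [hashed[:2], hashed[2:4], hashed[4:6], hashed[6:]]
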
src/pip/_internal/cli/req_command.py (6 additions & 1 deletion)

@@ -10,7 +10,7 @@
 from optparse import Values
 from typing import Any, List, Optional, Tuple

-from pip._internal.cache import LinkMetadataCache, WheelCache
+from pip._internal.cache import FetchResolveCache, LinkMetadataCache, WheelCache
 from pip._internal.cli import cmdoptions
 from pip._internal.cli.index_command import IndexGroupCommand
 from pip._internal.cli.index_command import SessionCommandMixin as SessionCommandMixin
@@ -333,8 +333,13 @@ def _build_package_finder(
             ignore_requires_python=ignore_requires_python,
         )

+        if bool(options.cache_dir) and ("metadata-cache" in options.features_enabled):
+            fetch_resolve_cache = FetchResolveCache(options.cache_dir)
+        else:
+            fetch_resolve_cache = None
         return PackageFinder.create(
             link_collector=link_collector,
             selection_prefs=selection_prefs,
             target_python=target_python,
+            fetch_resolve_cache=fetch_resolve_cache,
         )

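The ``bool(options.cache_dir)`` guard is what implements the "handle --no-cache-dir" bullet: with ``--no-cache-dir``, no cache directory exists, so no ``FetchResolveCache`` is constructed. Once a cache is wired in, previously stored ``ETag`` and ``Date`` values can be turned into the ``headers`` argument that the collector changes below thread through to the network layer. A hypothetical helper (the name and the choice to send both validators are assumptions, not part of this diff):

from typing import Dict, Optional

def conditional_headers(
    etag: Optional[str], date: Optional[str]
) -> Dict[str, str]:
    # If-None-Match lets the server answer 304 Not Modified when the index
    # page still has the same ETag; If-Modified-Since does the same based
    # on the last Date we saw for it.
    headers: Dict[str, str] = {}
    if etag is not None:
        headers["If-None-Match"] = etag
    if date is not None:
        headers["If-Modified-Since"] = date
    return headers
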
src/pip/_internal/index/collector.py (35 additions & 15 deletions)

@@ -92,7 +92,9 @@ class _NotHTTP(Exception):
     pass


-def _ensure_api_response(url: str, session: PipSession) -> None:
+def _ensure_api_response(
+    url: str, session: PipSession, headers: Optional[Dict[str, str]] = None
+) -> None:
     """
     Send a HEAD request to the URL, and ensure the response contains a simple
     API Response.
@@ -104,13 +106,15 @@ def _ensure_api_response(url: str, session: PipSession) -> None:
     if scheme not in {"http", "https"}:
         raise _NotHTTP()

-    resp = session.head(url, allow_redirects=True)
+    resp = session.head(url, allow_redirects=True, headers=headers)
     raise_for_status(resp)

     _ensure_api_header(resp)


-def _get_simple_response(url: str, session: PipSession) -> Response:
+def _get_simple_response(
+    url: str, session: PipSession, headers: Optional[Dict[str, str]] = None
+) -> Response:
     """Access an Simple API response with GET, and return the response.

     This consists of three parts:
@@ -124,10 +128,13 @@ def _get_simple_response(url: str, session: PipSession) -> Response:
     and raise `_NotAPIContent` otherwise.
     """
     if is_archive_file(Link(url).filename):
-        _ensure_api_response(url, session=session)
+        _ensure_api_response(url, session=session, headers=headers)

     logger.debug("Getting page %s", redact_auth_from_url(url))

+    logger.debug("headers: %s", str(headers))
+    if headers is None:
+        headers = {}
     resp = session.get(
         url,
         headers={
@@ -152,6 +159,7 @@ def _get_simple_response(url: str, session: PipSession) -> Response:
             # once per 10 minutes.
             # For more information, please see pypa/pip#5670.
             "Cache-Control": "max-age=0",
+            **headers,
         },
     )
     raise_for_status(resp)
@@ -230,7 +238,7 @@ def parse_links(page: "IndexContent") -> Iterable[Link]:
     if content_type_l.startswith("application/vnd.pypi.simple.v1+json"):
         data = json.loads(page.content)
         for file in data.get("files", []):
-            link = Link.from_json(file, page.url)
+            link = Link.from_json(file, page.url, page_content=page)
             if link is None:
                 continue
             yield link
@@ -243,7 +251,9 @@ def parse_links(page: "IndexContent") -> Iterable[Link]:
     url = page.url
     base_url = parser.base_url or url
     for anchor in parser.anchors:
-        link = Link.from_element(anchor, page_url=url, base_url=base_url)
+        link = Link.from_element(
+            anchor, page_url=url, base_url=base_url, page_content=page
+        )
         if link is None:
             continue
         yield link
@@ -258,13 +268,17 @@ class IndexContent:
     :param cache_link_parsing: whether links parsed from this page's url
                                should be cached. PyPI index urls should
                                have this set to False, for example.
+    :param etag: The ``ETag`` header from an HTTP request against ``url``.
+    :param date: The ``Date`` header from an HTTP request against ``url``.
     """

     content: bytes
     content_type: str
     encoding: Optional[str]
     url: str
     cache_link_parsing: bool = True
+    etag: Optional[str] = None
+    date: Optional[str] = None

     def __str__(self) -> str:
         return redact_auth_from_url(self.url)
@@ -309,7 +323,8 @@ def _handle_get_simple_fail(


 def _make_index_content(
-    response: Response, cache_link_parsing: bool = True
+    response: Response,
+    cache_link_parsing: bool = True,
 ) -> IndexContent:
     encoding = _get_encoding_from_headers(response.headers)
     return IndexContent(
@@ -318,11 +333,15 @@ def _make_index_content(
         encoding=encoding,
         url=response.url,
         cache_link_parsing=cache_link_parsing,
+        etag=response.headers.get("ETag", None),
+        date=response.headers.get("Date", None),
     )


-def _get_index_content(link: Link, *, session: PipSession) -> Optional["IndexContent"]:
-    url = link.url.split("#", 1)[0]
+def _get_index_content(
+    link: Link, *, session: PipSession, headers: Optional[Dict[str, str]] = None
+) -> Optional["IndexContent"]:
+    url = link.url_without_fragment

     # Check for VCS schemes that do not support lookup as web pages.
     vcs_scheme = _match_vcs_scheme(url)
@@ -349,7 +368,7 @@ def _get_index_content(link: Link, *, session: PipSession) -> Optional["IndexContent"]:
         logger.debug(" file: URL is directory, getting %s", url)

     try:
-        resp = _get_simple_response(url, session=session)
+        resp = _get_simple_response(url, session=session, headers=headers)
     except _NotHTTP:
         logger.warning(
             "Skipping page %s because it looks like an archive, and cannot "
@@ -365,9 +384,7 @@ def _get_index_content(link: Link, *, session: PipSession) -> Optional["IndexContent"]:
             exc.request_desc,
             exc.content_type,
         )
-    except NetworkConnectionError as exc:
-        _handle_get_simple_fail(link, exc)
-    except RetryError as exc:
+    except (NetworkConnectionError, RetryError) as exc:
         _handle_get_simple_fail(link, exc)
     except SSLError as exc:
         reason = "There was a problem confirming the ssl certificate: "
@@ -441,11 +458,14 @@ def create(
     def find_links(self) -> List[str]:
         return self.search_scope.find_links

-    def fetch_response(self, location: Link) -> Optional[IndexContent]:
+    def fetch_response(
+        self, location: Link, headers: Optional[Dict[str, str]] = None
+    ) -> Optional[IndexContent]:
         """
         Fetch an HTML page containing package links.
         """
-        return _get_index_content(location, session=self.session)
+        logger.debug("headers: %s", str(headers))
+        return _get_index_content(location, session=self.session, headers=headers)

     def collect_sources(
         self,

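Taken together, the flow these changes enable looks roughly like the sketch below: send conditional headers derived from the cached validators, then persist the fresh ``etag``/``date`` carried back on ``IndexContent``. The ``cache.load``/``cache.store`` helpers are hypothetical stand-ins for whatever serialization lives under the ``fetch-resolve`` directory:

from typing import Optional

def fetch_with_cache(collector, location, cache) -> Optional["IndexContent"]:
    # "cache" is a hypothetical store mapping a link to its last-seen
    # validators; FetchResolveCache itself only supplies the on-disk path.
    cached = cache.load(location)
    headers = {}
    if cached is not None and cached.etag:
        headers["If-None-Match"] = cached.etag
    if cached is not None and cached.date:
        headers["If-Modified-Since"] = cached.date
    page = collector.fetch_response(location, headers=headers)
    if page is not None and (page.etag or page.date):
        # A fresh response carries new validators; remember them so the
        # next resolve can revalidate instead of re-downloading the index.
        cache.store(location, etag=page.etag, date=page.date)
    return page
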