make FetchResolveCache
- pipe in headers arg
- provide full context in Link.comes_from
- pull in etag and date and cache the outputs
- handle --no-cache-dir
- add NEWS
- remove quotes from etag and use binary checksum to save a few bytes
- parse http modified date to compress the cached representation
cosmicexplorer committed Aug 13, 2024 · 1 parent b02915a · commit f28ecfd
Showing 7 changed files with 316 additions and 36 deletions.
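
The last two commit-message bullets describe how the cached validators are shrunk before they hit disk: the quotes around the ETag are dropped and the value is stored as a fixed-width binary checksum, and the verbose HTTP ``Date`` string is parsed into a compact timestamp. A minimal sketch of that idea, assuming a sha256 digest for the checksum and a POSIX timestamp for the date (the helper below is illustrative, not the commit's actual code):

import email.utils
import hashlib
from typing import Optional, Tuple

def compress_validators(
    etag: Optional[str], date: Optional[str]
) -> Tuple[Optional[bytes], Optional[float]]:
    """Shrink the ETag and Date headers before caching them."""
    checksum = None
    if etag is not None:
        # ETags arrive quoted (e.g. '"33a64df5"'); strip the quotes and
        # hash to a fixed-width binary digest to save a few bytes per entry.
        checksum = hashlib.sha256(etag.strip('"').encode()).digest()
    timestamp = None
    if date is not None:
        # HTTP dates are verbose RFC 2822 strings; a POSIX timestamp is a
        # far smaller representation of the same instant.
        timestamp = email.utils.parsedate_to_datetime(date).timestamp()
    return checksum, timestamp
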
news/12257.feature.rst (1 addition & 0 deletions)

@@ -0,0 +1 @@
+Store HTTP caching headers in ``~/.cache/pip/fetch-resolve`` to reduce bandwidth usage when ``--use-feature=metadata-cache`` is enabled.

src/pip/_internal/cache.py (22 additions & 10 deletions)

@@ -92,7 +92,9 @@ def __init__(self, cache_dir: str) -> None:
         assert not cache_dir or os.path.isabs(cache_dir)
         self.cache_dir = cache_dir or None

-    def _get_cache_path_parts(self, link: Link) -> List[str]:
+    def _get_cache_path_parts(
+        self, link: Link, *, interpreter_dependent: bool
+    ) -> List[str]:
         """Get parts of part that must be os.path.joined with cache_dir"""

         # We want to generate an url to use as our cache key, we don't want to
@@ -104,13 +106,14 @@ def _get_cache_path_parts(self, link: Link) -> List[str]:
         if link.subdirectory_fragment:
             key_parts["subdirectory"] = link.subdirectory_fragment

-        # Include interpreter name, major and minor version in cache key
-        # to cope with ill-behaved sdists that build a different wheel
-        # depending on the python version their setup.py is being run on,
-        # and don't encode the difference in compatibility tags.
-        # https://github.com/pypa/pip/issues/7296
-        key_parts["interpreter_name"] = interpreter_name()
-        key_parts["interpreter_version"] = interpreter_version()
+        if interpreter_dependent:
+            # Include interpreter name, major and minor version in cache key
+            # to cope with ill-behaved sdists that build a different wheel
+            # depending on the python version their setup.py is being run on,
+            # and don't encode the difference in compatibility tags.
+            # https://github.com/pypa/pip/issues/7296
+            key_parts["interpreter_name"] = interpreter_name()
+            key_parts["interpreter_version"] = interpreter_version()

         # Encode our key url with sha224, we'll use this because it has similar
         # security properties to sha256, but with a shorter total output (and
@@ -138,11 +141,20 @@ class LinkMetadataCache(Cache):
     """Persistently store the metadata of dists found at each link."""

     def get_path_for_link(self, link: Link) -> str:
-        parts = self._get_cache_path_parts(link)
+        parts = self._get_cache_path_parts(link, interpreter_dependent=True)
         assert self.cache_dir
         return os.path.join(self.cache_dir, "link-metadata", *parts)


+class FetchResolveCache(Cache):
+    def get_path_for_link(self, link: Link) -> str:
+        # We are reading index links to extract other links from, not executing any
+        # python code, so these caches are interpreter-independent.
+        parts = self._get_cache_path_parts(link, interpreter_dependent=False)
+        assert self.cache_dir
+        return os.path.join(self.cache_dir, "fetch-resolve", *parts)
+
+
 class WheelCacheBase(Cache):
     """Specializations to the cache concept for wheels."""

@@ -197,7 +209,7 @@ def get_path_for_link(self, link: Link) -> str:
         :param link: The link of the sdist for which this will cache wheels.
         """
-        parts = self._get_cache_path_parts(link)
+        parts = self._get_cache_path_parts(link, interpreter_dependent=True)
         assert self.cache_dir
         # Store wheels within the root cache_dir
         return os.path.join(self.cache_dir, "wheels", *parts)

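For reference, the key derivation both subclasses share: ``_get_cache_path_parts`` serializes the link URL plus the key parts, hashes with sha224, and splits the hex digest into nested directory levels. A rough sketch of that scheme (simplified; the exact serialization pip uses may differ):

import hashlib
import urllib.parse
from typing import Dict, List

def cache_path_parts(url: str, key_parts: Dict[str, str]) -> List[str]:
    # Serialize the URL and the key material into a single cache key.
    key_url = f"{url}#{urllib.parse.urlencode(key_parts)}"
    # sha224 has similar security properties to sha256 but a shorter
    # output, which keeps the on-disk paths a little smaller.
    hashed = hashlib.sha224(key_url.encode()).hexdigest()
    # Nest two hex characters per level so no directory grows too large.
    return [hashed[:2], hashed[2:4], hashed[4:6], hashed[6:]]
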
src/pip/_internal/cli/req_command.py (6 additions & 1 deletion)

@@ -10,7 +10,7 @@
 from optparse import Values
 from typing import Any, List, Optional, Tuple

-from pip._internal.cache import LinkMetadataCache, WheelCache
+from pip._internal.cache import FetchResolveCache, LinkMetadataCache, WheelCache
 from pip._internal.cli import cmdoptions
 from pip._internal.cli.index_command import IndexGroupCommand
 from pip._internal.cli.index_command import SessionCommandMixin as SessionCommandMixin
@@ -333,8 +333,13 @@ def _build_package_finder(
             ignore_requires_python=ignore_requires_python,
         )

+        if bool(options.cache_dir) and ("metadata-cache" in options.features_enabled):
+            fetch_resolve_cache = FetchResolveCache(options.cache_dir)
+        else:
+            fetch_resolve_cache = None
         return PackageFinder.create(
             link_collector=link_collector,
             selection_prefs=selection_prefs,
             target_python=target_python,
+            fetch_resolve_cache=fetch_resolve_cache,
         )

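The ``bool(options.cache_dir)`` guard is what implements the "handle --no-cache-dir" bullet: with ``--no-cache-dir``, no cache directory exists, so no ``FetchResolveCache`` is constructed. Once a cache is wired in, previously stored ``ETag`` and ``Date`` values can be turned into the ``headers`` argument that the collector changes below thread through to the network layer. A hypothetical helper (the name and the choice to send both validators are assumptions, not part of this diff):

from typing import Dict, Optional

def conditional_headers(
    etag: Optional[str], date: Optional[str]
) -> Dict[str, str]:
    # If-None-Match lets the server answer 304 Not Modified when the index
    # page still has the same ETag; If-Modified-Since does the same based
    # on the last Date we saw for it.
    headers: Dict[str, str] = {}
    if etag is not None:
        headers["If-None-Match"] = etag
    if date is not None:
        headers["If-Modified-Since"] = date
    return headers
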
src/pip/_internal/index/collector.py (35 additions & 15 deletions)

@@ -92,7 +92,9 @@ class _NotHTTP(Exception):
     pass


-def _ensure_api_response(url: str, session: PipSession) -> None:
+def _ensure_api_response(
+    url: str, session: PipSession, headers: Optional[Dict[str, str]] = None
+) -> None:
     """
     Send a HEAD request to the URL, and ensure the response contains a simple
     API Response.
@@ -104,13 +106,15 @@ def _ensure_api_response(url: str, session: PipSession) -> None:
     if scheme not in {"http", "https"}:
         raise _NotHTTP()

-    resp = session.head(url, allow_redirects=True)
+    resp = session.head(url, allow_redirects=True, headers=headers)
     raise_for_status(resp)

     _ensure_api_header(resp)


-def _get_simple_response(url: str, session: PipSession) -> Response:
+def _get_simple_response(
+    url: str, session: PipSession, headers: Optional[Dict[str, str]] = None
+) -> Response:
     """Access an Simple API response with GET, and return the response.

     This consists of three parts:
@@ -124,10 +128,13 @@ def _get_simple_response(url: str, session: PipSession) -> Response:
     and raise `_NotAPIContent` otherwise.
     """
     if is_archive_file(Link(url).filename):
-        _ensure_api_response(url, session=session)
+        _ensure_api_response(url, session=session, headers=headers)

     logger.debug("Getting page %s", redact_auth_from_url(url))

+    logger.debug("headers: %s", str(headers))
+    if headers is None:
+        headers = {}
     resp = session.get(
         url,
         headers={
@@ -152,6 +159,7 @@ def _get_simple_response(url: str, session: PipSession) -> Response:
             # once per 10 minutes.
             # For more information, please see pypa/pip#5670.
             "Cache-Control": "max-age=0",
+            **headers,
         },
     )
     raise_for_status(resp)
@@ -230,7 +238,7 @@ def parse_links(page: "IndexContent") -> Iterable[Link]:
     if content_type_l.startswith("application/vnd.pypi.simple.v1+json"):
         data = json.loads(page.content)
         for file in data.get("files", []):
-            link = Link.from_json(file, page.url)
+            link = Link.from_json(file, page.url, page_content=page)
             if link is None:
                 continue
             yield link
@@ -243,7 +251,9 @@ def parse_links(page: "IndexContent") -> Iterable[Link]:
     url = page.url
     base_url = parser.base_url or url
     for anchor in parser.anchors:
-        link = Link.from_element(anchor, page_url=url, base_url=base_url)
+        link = Link.from_element(
+            anchor, page_url=url, base_url=base_url, page_content=page
+        )
         if link is None:
             continue
         yield link
@@ -258,13 +268,17 @@ class IndexContent:
     :param cache_link_parsing: whether links parsed from this page's url
                                should be cached. PyPI index urls should
                                have this set to False, for example.
+    :param etag: The ``ETag`` header from an HTTP request against ``url``.
+    :param date: The ``Date`` header from an HTTP request against ``url``.
     """

     content: bytes
     content_type: str
     encoding: Optional[str]
     url: str
     cache_link_parsing: bool = True
+    etag: Optional[str] = None
+    date: Optional[str] = None

     def __str__(self) -> str:
         return redact_auth_from_url(self.url)
@@ -309,7 +323,8 @@ def _handle_get_simple_fail(


 def _make_index_content(
-    response: Response, cache_link_parsing: bool = True
+    response: Response,
+    cache_link_parsing: bool = True,
 ) -> IndexContent:
     encoding = _get_encoding_from_headers(response.headers)
     return IndexContent(
@@ -318,11 +333,15 @@ def _make_index_content(
         encoding=encoding,
         url=response.url,
         cache_link_parsing=cache_link_parsing,
+        etag=response.headers.get("ETag", None),
+        date=response.headers.get("Date", None),
     )


-def _get_index_content(link: Link, *, session: PipSession) -> Optional["IndexContent"]:
-    url = link.url.split("#", 1)[0]
+def _get_index_content(
+    link: Link, *, session: PipSession, headers: Optional[Dict[str, str]] = None
+) -> Optional["IndexContent"]:
+    url = link.url_without_fragment

     # Check for VCS schemes that do not support lookup as web pages.
     vcs_scheme = _match_vcs_scheme(url)
@@ -349,7 +368,7 @@ def _get_index_content(link: Link, *, session: PipSession) -> Optional["IndexContent"]:
         logger.debug(" file: URL is directory, getting %s", url)

     try:
-        resp = _get_simple_response(url, session=session)
+        resp = _get_simple_response(url, session=session, headers=headers)
     except _NotHTTP:
         logger.warning(
             "Skipping page %s because it looks like an archive, and cannot "
@@ -365,9 +384,7 @@ def _get_index_content(link: Link, *, session: PipSession) -> Optional["IndexContent"]:
             exc.request_desc,
             exc.content_type,
         )
-    except NetworkConnectionError as exc:
-        _handle_get_simple_fail(link, exc)
-    except RetryError as exc:
+    except (NetworkConnectionError, RetryError) as exc:
         _handle_get_simple_fail(link, exc)
     except SSLError as exc:
         reason = "There was a problem confirming the ssl certificate: "
@@ -441,11 +458,14 @@ def create(
     def find_links(self) -> List[str]:
         return self.search_scope.find_links

-    def fetch_response(self, location: Link) -> Optional[IndexContent]:
+    def fetch_response(
+        self, location: Link, headers: Optional[Dict[str, str]] = None
+    ) -> Optional[IndexContent]:
         """
         Fetch an HTML page containing package links.
         """
-        return _get_index_content(location, session=self.session)
+        logger.debug("headers: %s", str(headers))
+        return _get_index_content(location, session=self.session, headers=headers)

     def collect_sources(
         self,

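Taken together, the flow these changes enable looks roughly like the sketch below: send conditional headers derived from the cached validators, then persist the fresh ``etag``/``date`` carried back on ``IndexContent``. The ``cache.load``/``cache.store`` helpers are hypothetical stand-ins for whatever serialization lives under the ``fetch-resolve`` directory:

from typing import Optional

def fetch_with_cache(collector, location, cache) -> Optional["IndexContent"]:
    # "cache" is a hypothetical store mapping a link to its last-seen
    # validators; FetchResolveCache itself only supplies the on-disk path.
    cached = cache.load(location)
    headers = {}
    if cached is not None and cached.etag:
        headers["If-None-Match"] = cached.etag
    if cached is not None and cached.date:
        headers["If-Modified-Since"] = cached.date
    page = collector.fetch_response(location, headers=headers)
    if page is not None and (page.etag or page.date):
        # A fresh response carries new validators; remember them so the
        # next resolve can revalidate instead of re-downloading the index.
        cache.store(location, etag=page.etag, date=page.date)
    return page
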