Commit b658989: squashed commit

- add FetchResolveCache
- pipe in headers arg
- provide full context in Link.comes_from
- pull in etag and date and cache the outputs
- remove cache_link_parsing
- introduce persistent cache for link parsing
- cache link evaluation when possible (further speedup, somehow)
- handle --no-cache-dir
cosmicexplorer committed Sep 3, 2023
1 parent 04d4030 commit b658989
Showing 7 changed files with 497 additions and 113 deletions.
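Taken together, the changes above add an HTTP-revalidation-style persistent cache to index fetching: a headers argument is threaded through the fetch path, and each fetched page's ETag and Date response headers are stored so a later fetch can revalidate instead of re-downloading and re-parsing. The code that consumes the stored values is not part of the excerpt below, so the following is only a hedged sketch of how conditional headers might be built from them; the helper name and the exact header policy are assumptions, not pip's code.

```python
from typing import Dict, Optional


def build_revalidation_headers(
    etag: Optional[str], date: Optional[str]
) -> Dict[str, str]:
    """Hypothetical helper: turn a cached ETag/Date pair into conditional
    request headers so the index can answer 304 Not Modified instead of
    resending the full page."""
    headers: Dict[str, str] = {}
    if etag is not None:
        # The server skips the body if the entity tag still matches.
        headers["If-None-Match"] = etag
    if date is not None:
        # Fall back to the Date of the previously cached response.
        headers["If-Modified-Since"] = date
    return headers


# Values as they might have been stored from a prior fetch of an index page.
print(build_revalidation_headers('"5f3a-60e1"', "Sun, 03 Sep 2023 00:00:00 GMT"))
```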
80 changes: 58 additions & 22 deletions src/pip/_internal/cache.py
@@ -7,7 +7,7 @@
import logging
import os
from pathlib import Path
from typing import Any, Dict, List, Optional
from typing import Any, Dict, List, Optional, Type

from pip._vendor.packaging.tags import Tag, interpreter_name, interpreter_version
from pip._vendor.packaging.utils import canonicalize_name
@@ -41,7 +41,9 @@ def __init__(self, cache_dir: str) -> None:
assert not cache_dir or os.path.isabs(cache_dir)
self.cache_dir = cache_dir or None

def _get_cache_path_parts(self, link: Link) -> List[str]:
def _get_cache_path_parts(
self, link: Link, *, interpreter_dependent: bool
) -> List[str]:
"""Get parts of part that must be os.path.joined with cache_dir"""

# We want to generate an url to use as our cache key, we don't want to
@@ -53,13 +55,14 @@ def _get_cache_path_parts(self, link: Link) -> List[str]:
if link.subdirectory_fragment:
key_parts["subdirectory"] = link.subdirectory_fragment

# Include interpreter name, major and minor version in cache key
# to cope with ill-behaved sdists that build a different wheel
# depending on the python version their setup.py is being run on,
# and don't encode the difference in compatibility tags.
# https://github.com/pypa/pip/issues/7296
key_parts["interpreter_name"] = interpreter_name()
key_parts["interpreter_version"] = interpreter_version()
if interpreter_dependent:
# Include interpreter name, major and minor version in cache key
# to cope with ill-behaved sdists that build a different wheel
# depending on the python version their setup.py is being run on,
# and don't encode the difference in compatibility tags.
# https://github.com/pypa/pip/issues/7296
key_parts["interpreter_name"] = interpreter_name()
key_parts["interpreter_version"] = interpreter_version()

# Encode our key url with sha224, we'll use this because it has similar
# security properties to sha256, but with a shorter total output (and
@@ -87,26 +90,47 @@ class LinkMetadataCache(Cache):
"""Persistently store the metadata of dists found at each link."""

def get_path_for_link(self, link: Link) -> str:
parts = self._get_cache_path_parts(link)
parts = self._get_cache_path_parts(link, interpreter_dependent=True)
assert self.cache_dir
return os.path.join(self.cache_dir, "link-metadata", *parts)


class WheelCacheBase(Cache):
"""Specializations to the cache concept for wheels."""
class SerializableEntry(abc.ABC):
@classmethod
@abc.abstractmethod
def suffix(cls) -> str:
...

@abc.abstractmethod
def get(
self,
link: Link,
package_name: Optional[str],
supported_tags: List[Tag],
) -> Link:
"""Returns a link to a cached item if it exists, otherwise returns the
passed link.
"""
def serialize(self) -> Dict[str, Any]:
...


class FetchResolveCache(Cache):
def get_path_for_link(self, link: Link) -> str:
# We are reading index links to extract other links from, not executing any
# python code, so these caches are interpreter-independent.
parts = self._get_cache_path_parts(link, interpreter_dependent=False)
assert self.cache_dir
return os.path.join(self.cache_dir, "fetch-resolve", *parts)

def hashed_entry_path(self, link: Link, entry: SerializableEntry) -> Path:
hashed = _hash_dict(entry.serialize())
return self.cache_path(link) / f"{hashed}{entry.suffix()}"

def clear_hashed_entries(
self, link: Link, entry_type: Type[SerializableEntry]
) -> None:
for hashed_entry in self.cache_path(link).glob(f"*{entry_type.suffix()}"):
logger.debug(
"unlinking invalidated hashed link eval cache entry %s", hashed_entry
)
hashed_entry.unlink()


class WheelCacheBase(Cache):
"""Specializations to the cache concept for wheels."""

def _get_candidates(self, link: Link, canonical_package_name: str) -> List[Any]:
can_not_cache = not self.cache_dir or not canonical_package_name or not link
if can_not_cache:
@@ -119,6 +143,18 @@ def _get_candidates(self, link: Link, canonical_package_name: str) -> List[Any]:
candidates.append((candidate, path))
return candidates

@abc.abstractmethod
def get(
self,
link: Link,
package_name: Optional[str],
supported_tags: List[Tag],
) -> Link:
"""Returns a link to a cached item if it exists, otherwise returns the
passed link.
"""
...


class SimpleWheelCache(WheelCacheBase):
"""A cache of wheels for future installs."""
@@ -141,7 +177,7 @@ def get_path_for_link(self, link: Link) -> str:
:param link: The link of the sdist for which this will cache wheels.
"""
parts = self._get_cache_path_parts(link)
parts = self._get_cache_path_parts(link, interpreter_dependent=True)
assert self.cache_dir
# Store wheels within the root cache_dir
return os.path.join(self.cache_dir, "wheels", *parts)
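For context on the new FetchResolveCache above: each per-link directory holds entries whose filenames come from hashing a SerializableEntry's serialize() output and appending its type-specific suffix(), which is what lets clear_hashed_entries glob away every stale entry of one type. The sketch below is illustrative only; the entry class, its fields, and the use of sha256 over JSON stand in for pip's actual _hash_dict, which may differ.

```python
import hashlib
import json
from typing import Any, Dict, List


class HypotheticalLinkEvalEntry:
    """Illustrative SerializableEntry-style record keyed by the inputs that
    influence link evaluation (project name plus supported tags)."""

    @classmethod
    def suffix(cls) -> str:
        # A distinct suffix per entry type makes glob-based invalidation easy.
        return ".link-eval"

    def __init__(self, project_name: str, supported_tags: List[str]) -> None:
        self.project_name = project_name
        self.supported_tags = supported_tags

    def serialize(self) -> Dict[str, Any]:
        return {
            "project_name": self.project_name,
            "supported_tags": self.supported_tags,
        }


def hashed_entry_name(entry: HypotheticalLinkEvalEntry) -> str:
    """Mimic the hash-then-suffix naming used by hashed_entry_path."""
    digest = hashlib.sha256(
        json.dumps(entry.serialize(), sort_keys=True).encode("utf-8")
    ).hexdigest()
    return f"{digest}{entry.suffix()}"


entry = HypotheticalLinkEvalEntry("requests", ["cp311-cp311-manylinux_2_17_x86_64"])
print(hashed_entry_name(entry))  # something like "<sha256 hex>.link-eval"
```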
7 changes: 6 additions & 1 deletion src/pip/_internal/cli/req_command.py
@@ -12,7 +12,7 @@
from optparse import Values
from typing import TYPE_CHECKING, Any, List, Optional, Tuple

from pip._internal.cache import LinkMetadataCache, WheelCache
from pip._internal.cache import FetchResolveCache, LinkMetadataCache, WheelCache
from pip._internal.cli import cmdoptions
from pip._internal.cli.base_command import Command
from pip._internal.cli.command_context import CommandContextMixIn
@@ -506,8 +506,13 @@ def _build_package_finder(
ignore_requires_python=ignore_requires_python,
)

if options.cache_dir:
fetch_resolve_cache = FetchResolveCache(options.cache_dir)
else:
fetch_resolve_cache = None
return PackageFinder.create(
link_collector=link_collector,
selection_prefs=selection_prefs,
target_python=target_python,
fetch_resolve_cache=fetch_resolve_cache,
)
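The wiring above also covers --no-cache-dir: that flag leaves options.cache_dir falsy, so no FetchResolveCache is constructed and the finder simply receives fetch_resolve_cache=None. Below is a standalone restatement of that guard, using a stand-in class since the real one lives in pip._internal.cache.

```python
from typing import Optional


class StandInFetchResolveCache:
    """Placeholder for pip's FetchResolveCache in this sketch."""

    def __init__(self, cache_dir: str) -> None:
        self.cache_dir = cache_dir


def make_fetch_resolve_cache(cache_dir: str) -> Optional[StandInFetchResolveCache]:
    # --no-cache-dir is represented downstream as an empty/false cache_dir,
    # in which case no persistent fetch/resolve cache is created at all.
    if cache_dir:
        return StandInFetchResolveCache(cache_dir)
    return None


assert make_fetch_resolve_cache("") is None
assert make_fetch_resolve_cache("/home/user/.cache/pip") is not None
```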
100 changes: 35 additions & 65 deletions src/pip/_internal/index/collector.py
@@ -4,7 +4,6 @@

import collections
import email.message
import functools
import itertools
import json
import logging
@@ -96,7 +95,9 @@ class _NotHTTP(Exception):
pass


def _ensure_api_response(url: str, session: PipSession) -> None:
def _ensure_api_response(
url: str, session: PipSession, headers: Optional[Dict[str, str]] = None
) -> None:
"""
Send a HEAD request to the URL, and ensure the response contains a simple
API Response.
@@ -108,13 +109,15 @@ def _ensure_api_response(url: str, session: PipSession) -> None:
if scheme not in {"http", "https"}:
raise _NotHTTP()

resp = session.head(url, allow_redirects=True)
resp = session.head(url, allow_redirects=True, headers=headers)
raise_for_status(resp)

_ensure_api_header(resp)


def _get_simple_response(url: str, session: PipSession) -> Response:
def _get_simple_response(
url: str, session: PipSession, headers: Optional[Dict[str, str]] = None
) -> Response:
"""Access an Simple API response with GET, and return the response.
This consists of three parts:
@@ -128,10 +131,13 @@ def _get_simple_response(url: str, session: PipSession) -> Response:
and raise `_NotAPIContent` otherwise.
"""
if is_archive_file(Link(url).filename):
_ensure_api_response(url, session=session)
_ensure_api_response(url, session=session, headers=headers)

logger.debug("Getting page %s", redact_auth_from_url(url))

logger.debug("headers: %s", str(headers))
if headers is None:
headers = {}
resp = session.get(
url,
headers={
Expand All @@ -156,6 +162,7 @@ def _get_simple_response(url: str, session: PipSession) -> Response:
# once per 10 minutes.
# For more information, please see pypa/pip#5670.
"Cache-Control": "max-age=0",
**headers,
},
)
raise_for_status(resp)
@@ -188,44 +195,6 @@ def _get_encoding_from_headers(headers: ResponseHeaders) -> Optional[str]:
return None


class CacheablePageContent:
def __init__(self, page: "IndexContent") -> None:
assert page.cache_link_parsing
self.page = page

def __eq__(self, other: object) -> bool:
return isinstance(other, type(self)) and self.page.url == other.page.url

def __hash__(self) -> int:
return hash(self.page.url)


class ParseLinks(Protocol):
def __call__(self, page: "IndexContent") -> Iterable[Link]:
...


def with_cached_index_content(fn: ParseLinks) -> ParseLinks:
"""
Given a function that parses an Iterable[Link] from an IndexContent, cache the
function's result (keyed by CacheablePageContent), unless the IndexContent
`page` has `page.cache_link_parsing == False`.
"""

@functools.lru_cache(maxsize=None)
def wrapper(cacheable_page: CacheablePageContent) -> List[Link]:
return list(fn(cacheable_page.page))

@functools.wraps(fn)
def wrapper_wrapper(page: "IndexContent") -> List[Link]:
if page.cache_link_parsing:
return wrapper(CacheablePageContent(page))
return list(fn(page))

return wrapper_wrapper


@with_cached_index_content
def parse_links(page: "IndexContent") -> Iterable[Link]:
"""
Parse a Simple API's Index Content, and yield its anchor elements as Link objects.
@@ -235,7 +204,7 @@ def parse_links(page: "IndexContent") -> Iterable[Link]:
if content_type_l.startswith("application/vnd.pypi.simple.v1+json"):
data = json.loads(page.content)
for file in data.get("files", []):
link = Link.from_json(file, page.url)
link = Link.from_json(file, page.url, page_content=page)
if link is None:
continue
yield link
@@ -248,7 +217,9 @@ def parse_links(page: "IndexContent") -> Iterable[Link]:
url = page.url
base_url = parser.base_url or url
for anchor in parser.anchors:
link = Link.from_element(anchor, page_url=url, base_url=base_url)
link = Link.from_element(
anchor, page_url=url, base_url=base_url, page_content=page
)
if link is None:
continue
yield link
@@ -263,20 +234,19 @@ def __init__(
content_type: str,
encoding: Optional[str],
url: str,
cache_link_parsing: bool = True,
etag: Optional[str] = None,
date: Optional[str] = None,
) -> None:
"""
:param encoding: the encoding to decode the given content.
:param url: the URL from which the HTML was downloaded.
:param cache_link_parsing: whether links parsed from this page's url
should be cached. PyPI index urls should
have this set to False, for example.
"""
self.content = content
self.content_type = content_type
self.encoding = encoding
self.url = url
self.cache_link_parsing = cache_link_parsing
self.etag = etag
self.date = date

def __str__(self) -> str:
return redact_auth_from_url(self.url)
@@ -320,21 +290,22 @@ def _handle_get_simple_fail(
meth("Could not fetch URL %s: %s - skipping", link, reason)


def _make_index_content(
response: Response, cache_link_parsing: bool = True
) -> IndexContent:
def _make_index_content(response: Response) -> IndexContent:
encoding = _get_encoding_from_headers(response.headers)
return IndexContent(
response.content,
response.headers["Content-Type"],
encoding=encoding,
url=response.url,
cache_link_parsing=cache_link_parsing,
etag=response.headers.get("ETag", None),
date=response.headers.get("Date", None),
)


def _get_index_content(link: Link, *, session: PipSession) -> Optional["IndexContent"]:
url = link.url.split("#", 1)[0]
def _get_index_content(
link: Link, *, session: PipSession, headers: Optional[Dict[str, str]] = None
) -> Optional["IndexContent"]:
url = link.url_without_fragment

# Check for VCS schemes that do not support lookup as web pages.
vcs_scheme = _match_vcs_scheme(url)
@@ -361,7 +332,7 @@ def _get_index_content(link: Link, *, session: PipSession) -> Optional["IndexCon
logger.debug(" file: URL is directory, getting %s", url)

try:
resp = _get_simple_response(url, session=session)
resp = _get_simple_response(url, session=session, headers=headers)
except _NotHTTP:
logger.warning(
"Skipping page %s because it looks like an archive, and cannot "
@@ -377,9 +348,7 @@ def _get_index_content(link: Link, *, session: PipSession) -> Optional["IndexCon
exc.request_desc,
exc.content_type,
)
except NetworkConnectionError as exc:
_handle_get_simple_fail(link, exc)
except RetryError as exc:
except (NetworkConnectionError, RetryError) as exc:
_handle_get_simple_fail(link, exc)
except SSLError as exc:
reason = "There was a problem confirming the ssl certificate: "
@@ -390,7 +359,7 @@ def _get_index_content(link: Link, *, session: PipSession) -> Optional["IndexCon
except requests.Timeout:
_handle_get_simple_fail(link, "timed out")
else:
return _make_index_content(resp, cache_link_parsing=link.cache_link_parsing)
return _make_index_content(resp)
return None


@@ -454,11 +423,14 @@ def create(
def find_links(self) -> List[str]:
return self.search_scope.find_links

def fetch_response(self, location: Link) -> Optional[IndexContent]:
def fetch_response(
self, location: Link, headers: Optional[Dict[str, str]] = None
) -> Optional[IndexContent]:
"""
Fetch an HTML page containing package links.
"""
return _get_index_content(location, session=self.session)
logger.debug("headers: %s", str(headers))
return _get_index_content(location, session=self.session, headers=headers)

def collect_sources(
self,
@@ -472,7 +444,6 @@ def collect_sources(
candidates_from_page=candidates_from_page,
page_validator=self.session.is_secure_origin,
expand_dir=False,
cache_link_parsing=False,
)
for loc in self.search_scope.get_index_urls_locations(project_name)
).values()
@@ -482,7 +453,6 @@
candidates_from_page=candidates_from_page,
page_validator=self.session.is_secure_origin,
expand_dir=True,
cache_link_parsing=True,
)
for loc in self.find_links
).values()
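Two details of the collector changes above are worth spelling out: in _get_simple_response the caller-supplied headers are unpacked after the defaults, so a revalidation header can override the blanket Cache-Control: max-age=0, and _make_index_content lifts ETag and Date off the response so they can be persisted for the next fetch. Here is a compact sketch of both behaviours using plain dicts; the trimmed default header set and the record class are illustrative, not pip's exact definitions.

```python
from dataclasses import dataclass
from typing import Dict, Optional

# Trimmed stand-in for the default headers sent by _get_simple_response.
DEFAULT_HEADERS = {
    "Accept": "application/vnd.pypi.simple.v1+json",
    "Cache-Control": "max-age=0",
}


def merged_request_headers(extra: Optional[Dict[str, str]]) -> Dict[str, str]:
    # Later keys win in a dict merge, mirroring the {**defaults, **headers}
    # shape in _get_simple_response: caller headers override the defaults.
    return {**DEFAULT_HEADERS, **(extra or {})}


@dataclass
class SketchIndexContent:
    """Cut-down stand-in for IndexContent carrying the revalidation fields."""

    url: str
    etag: Optional[str]
    date: Optional[str]


def from_response_headers(url: str, headers: Dict[str, str]) -> SketchIndexContent:
    # Same extraction as _make_index_content: absent headers become None.
    return SketchIndexContent(
        url=url, etag=headers.get("ETag"), date=headers.get("Date")
    )


print(merged_request_headers({"Cache-Control": "no-cache", "If-None-Match": '"abc"'}))
print(from_response_headers(
    "https://pypi.org/simple/requests/",
    {"ETag": '"abc"', "Date": "Sun, 03 Sep 2023 00:00:00 GMT"},
))
```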
