Skip to content

Commit 8d739b7

Browse files
committed
ENH: respect/log separately Retry-After + support DANDI_DEVEL_AGGRESSIVE_RETRY mode
This is all to address that odd case with 000026 where connection keeps interrupting. Unclear why so adding more specific cases handling and allowing for such an aggressive retrying where we would proceed as long as we are getting something (but sleep would also increase)
1 parent 9c47a47 commit 8d739b7

File tree

1 file changed

+73
-15
lines changed

1 file changed

+73
-15
lines changed

dandi/download.py

+73-15
Original file line numberDiff line numberDiff line change
@@ -693,14 +693,18 @@ def _download_file(
693693
# TODO: how do we discover the total size????
694694
# TODO: do not do it in-place, but rather into some "hidden" file
695695
resuming = False
696-
for attempt in range(3):
696+
attempt = 0
697+
nattempts = 3 # number to do, could be incremented if we downloaded a little
698+
while attempt <= nattempts:
699+
attempt += 1
697700
try:
698701
if digester:
699702
downloaded_digest = digester() # start empty
700703
warned = False
701704
# I wonder if we could make writing async with downloader
702705
with DownloadDirectory(path, digests or {}) as dldir:
703706
assert dldir.offset is not None
707+
downloaded_in_attempt = 0
704708
downloaded = dldir.offset
705709
resuming = downloaded > 0
706710
if size is not None and downloaded == size:
@@ -719,6 +723,7 @@ def _download_file(
719723
assert downloaded_digest is not None
720724
downloaded_digest.update(block)
721725
downloaded += len(block)
726+
downloaded_in_attempt += len(block)
722727
# TODO: yield progress etc
723728
out: dict[str, Any] = {"done": downloaded}
724729
if size:
@@ -744,30 +749,83 @@ def _download_file(
744749
# Catching RequestException lets us retry on timeout & connection
745750
# errors (among others) in addition to HTTP status errors.
746751
except requests.RequestException as exc:
752+
sleep_amount = random.random() * 5 * attempt
753+
if os.environ.get("DANDI_DEVEL_AGGRESSIVE_RETRY"):
754+
# in such a case if we downloaded a little more --
755+
# consider it a successful attempt
756+
if downloaded_in_attempt > 0:
757+
lgr.debug(
758+
"%s - download failed on attempt #%d: %s, "
759+
"but did download %d bytes, so considering "
760+
"it a success and incrementing number of allowed attempts.",
761+
path,
762+
attempt,
763+
exc,
764+
downloaded_in_attempt,
765+
)
766+
nattempts += 1
747767
# TODO: actually we should probably retry only on selected codes,
748-
# and also respect Retry-After
749-
if attempt >= 2 or (
750-
exc.response is not None
751-
and exc.response.status_code
752-
not in (
768+
if exc.response is not None:
769+
if exc.response.status_code not in (
753770
400, # Bad Request, but happened with gider:
754771
# https://github.com/dandi/dandi-cli/issues/87
755772
*RETRY_STATUSES,
773+
):
774+
lgr.debug(
775+
"%s - download failed due to response %d: %s",
776+
path,
777+
exc.response.status_code,
778+
exc,
779+
)
780+
yield {"status": "error", "message": str(exc)}
781+
return
782+
elif retry_after := exc.response.headers.get("Retry-After"):
783+
# playing safe
784+
if not str(retry_after).isdigit():
785+
# our code is wrong, do not crash but issue warning so
786+
# we might get report/fix it up
787+
lgr.warning(
788+
"%s - download failed due to response %d with non-integer"
789+
" Retry-After=%r: %s",
790+
path,
791+
exc.response.status_code,
792+
retry_after,
793+
exc,
794+
)
795+
yield {"status": "error", "message": str(exc)}
796+
return
797+
sleep_amount = int(retry_after)
798+
lgr.debug(
799+
"%s - download failed due to response %d with "
800+
"Retry-After=%d: %s, will sleep and retry",
801+
path,
802+
exc.response.status_code,
803+
sleep_amount,
804+
exc,
805+
)
806+
else:
807+
lgr.debug("%s - download failed: %s", path, exc)
808+
yield {"status": "error", "message": str(exc)}
809+
return
810+
elif attempt >= nattempts:
811+
lgr.debug(
812+
"%s - download failed after %d attempts: %s", path, attempt, exc
756813
)
757-
):
758-
lgr.debug("%s - download failed: %s", path, exc)
759814
yield {"status": "error", "message": str(exc)}
760815
return
761816
# if is_access_denied(exc) or attempt >= 2:
762817
# raise
763818
# sleep a little and retry
764-
lgr.debug(
765-
"%s - failed to download on attempt #%d: %s, will sleep a bit and retry",
766-
path,
767-
attempt,
768-
exc,
769-
)
770-
time.sleep(random.random() * 5)
819+
else:
820+
lgr.debug(
821+
"%s - download failed on attempt #%d: %s, will sleep a bit and retry",
822+
path,
823+
attempt,
824+
exc,
825+
)
826+
time.sleep(sleep_amount)
827+
else:
828+
lgr.warning("downloader logic: We should not be here!")
771829

772830
if downloaded_digest and not resuming:
773831
assert downloaded_digest is not None

0 commit comments

Comments
 (0)