Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
19 changes: 19 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,24 @@
# Changelog

## 2.4.8

### Fixed: retry transient full-scan upload failures

- The full-scan upload (`POST /orgs/<org>/full-scans`) now retries transient
gateway/connection failures — HTTP 502/503/504/408, dropped or reset connections, and
request timeouts — up to 3 total attempts with increasing waits (~10s, then ~30s, plus
jitter). Such failures are intermittent and a retried upload almost always succeeds.
In these failure modes the server never finished reading the request body, so no scan
was created and a retry does not duplicate one; in the rare case where a gateway
timeout races a request the server later completes, the extra scan is benign and
superseded by the retried one (as if the CLI had run twice).
Non-transient errors (400/401/403/404/429 and error payloads) are never retried. Each
retry logs a warning explaining what failed and when the next attempt happens.
- Requires `socketdev>=3.3.0`: the SDK now records the HTTP status code on the exceptions
it raises and owns the transient-vs-deterministic classification
(`APIFailure.is_transient_error()`), so the CLI no longer parses status codes out of
exception message text.

## 2.4.7

### Changed: pin @coana-tech/cli version; auto-update is now opt-in
Expand Down
4 changes: 2 additions & 2 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ build-backend = "hatchling.build"

[project]
name = "socketsecurity"
version = "2.4.7"
version = "2.4.8"
requires-python = ">= 3.11"
license = {"file" = "LICENSE"}
dependencies = [
Expand All @@ -16,7 +16,7 @@ dependencies = [
'GitPython',
'packaging',
'python-dotenv',
"socketdev>=3.2.1,<4.0.0",
"socketdev>=3.3.0,<4.0.0",
"bs4>=0.0.2",
"markdown>=3.10",
"brotli>=1.0.9; platform_python_implementation == 'CPython'",
Expand Down
2 changes: 1 addition & 1 deletion socketsecurity/__init__.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
__author__ = 'socket.dev'
__version__ = '2.4.7'
__version__ = '2.4.8'
USER_AGENT = f'SocketPythonCLI/{__version__}'
44 changes: 43 additions & 1 deletion socketsecurity/core/__init__.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import logging
import os
import random
import re
import sys
import tarfile
Expand Down Expand Up @@ -76,6 +77,21 @@
TIER1_FINALIZE_MAX_ATTEMPTS = 3
TIER1_FINALIZE_BACKOFF_SECONDS = 1.0

# Full scan upload retry policy. An upload can fail transiently at the gateway/connection
# level (an HTTP 502/503/504/408, a dropped or reset connection, or a client-side timeout)
# without the server having created the scan; the SDK classifies those failures via
# APIFailure.is_transient_error() (socketdev>=3.3.0). In these failure modes no scan was
# created, so a retry does not duplicate one. (A duplicate is possible only if a gateway
# timeout races a request the server later completes; that is benign - the retried scan
# supersedes the orphaned one, same as running the CLI twice.)
#
# Each schedule entry is the wait before the next attempt once the current one fails (plus
# a little jitter so a fleet of CI jobs hitting the same failure doesn't retry in
# lock-step); the final None means the last attempt's failure is re-raised, not retried.
FULL_SCAN_UPLOAD_BACKOFF_SCHEDULE_SECONDS = (10.0, 30.0, None)
FULL_SCAN_UPLOAD_MAX_ATTEMPTS = len(FULL_SCAN_UPLOAD_BACKOFF_SCHEDULE_SECONDS)
FULL_SCAN_UPLOAD_BACKOFF_JITTER_SECONDS = 2.0


def _humanize_alert_type(alert_type: str) -> str:
"""Convert a camelCase/PascalCase alert type into a Title-Cased label.
Expand Down Expand Up @@ -787,7 +803,33 @@ def create_full_scan(self, files: List[str], params: FullScanParams, base_paths:
# facts file under the per-file upload size cap. See _compress_facts_files_for_upload.
upload_files, compressed_temp_files = self._compress_facts_files_for_upload(files)
try:
res = self.sdk.fullscans.post(upload_files, params, use_types=True, use_lazy_loading=True, max_open_files=50, base_paths=base_paths)
# Retry transient gateway/timeout failures (502/503/504/408, dropped connections,
# timeouts) with increasing waits. In these failure modes the server never finished
# reading the request body, so no scan was created and a retry does not duplicate
# one (see the retry-policy comment above FULL_SCAN_UPLOAD_BACKOFF_SCHEDULE_SECONDS).
# fullscans.post() rebuilds its lazy file loaders from the plain paths in
# upload_files on every call, so simply calling it again per attempt is safe. The
# loop must stay inside this try so the temp .br files (cleaned up in the finally
# below) outlive every attempt.
for attempt, backoff_seconds in enumerate(FULL_SCAN_UPLOAD_BACKOFF_SCHEDULE_SECONDS, start=1):
try:
res = self.sdk.fullscans.post(upload_files, params, use_types=True, use_lazy_loading=True, max_open_files=50, base_paths=base_paths)
break
except APIFailure as error:
if backoff_seconds is None or not error.is_transient_error():
raise
wait_seconds = backoff_seconds + random.uniform(
0, FULL_SCAN_UPLOAD_BACKOFF_JITTER_SECONDS
)
# SDK error messages can span many lines (path + response headers); the
# first line carries the status, which is all the warning needs.
error_summary = str(error).strip().splitlines()[0] if str(error).strip() else ""
log.warning(
f"Full scan upload failed with {type(error).__name__}({error_summary}), "
f"retrying in {wait_seconds:.0f}s "
f"(attempt {attempt + 1}/{FULL_SCAN_UPLOAD_MAX_ATTEMPTS})"
)
time.sleep(wait_seconds)
finally:
for temp_file in compressed_temp_files:
try:
Expand Down
285 changes: 285 additions & 0 deletions tests/unit/test_full_scan_retry.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,285 @@
"""Tests for the full-scan upload retry on transient gateway/connection failures.

A `POST /orgs/<org>/full-scans` upload can fail transiently (an HTTP 502/503/504/408, a
dropped or reset connection, or a timeout) without the server having created the scan.
`Core.create_full_scan` retries the failures the SDK classifies as transient
(`APIFailure.is_transient_error()`, socketdev>=3.3.0); these tests cover the retry
decision, the loop bounds, and that the temporary brotli-compressed facts files survive
until every attempt has finished.
"""

import logging
from unittest.mock import MagicMock

import pytest
from socketdev.exceptions import (
APIAccessDenied,
APIBadGateway,
APIConnectionError,
APIFailure,
APIResourceNotFound,
APITimeout,
)
from socketdev.fullscans import FullScanMetadata

from socketsecurity.core import (
FULL_SCAN_UPLOAD_MAX_ATTEMPTS,
SOCKET_FACTS_BROTLI_FILENAME,
SOCKET_FACTS_FILENAME,
Core,
)


def _success_response():
metadata = FullScanMetadata(
id="scan-1",
created_at="2026-01-01T00:00:00Z",
updated_at="2026-01-01T00:00:00Z",
organization_id="org-1",
repository_id="repo-1",
branch="main",
html_report_url="https://socket.dev/report",
)
response = MagicMock()
response.success = True
response.data = metadata
return response


# Catch-all APIFailure as the SDK raises it for statuses without a dedicated class
# (socketdev/core/api.py); the recorded status_code drives is_transient_error().
def _catch_all_failure(status_code: int) -> APIFailure:
return APIFailure(
f"Bad Request: HTTP original_status_code:{status_code}\n"
f"Path: https://api.socket.dev/v0/orgs/org/full-scans\n\n"
f"Headers:\ncf-ray: abc123",
status_code=status_code,
)


@pytest.fixture
def core_with_mock_sdk():
# Build a Core without running org setup; we only exercise create_full_scan.
core = Core.__new__(Core)
core.sdk = MagicMock()
core.cli_config = None # skip the tier1 finalize branch
return core


@pytest.fixture(autouse=True)
def no_sleep(mocker):
return mocker.patch("socketsecurity.core.time.sleep")


def test_upload_succeeds_first_try(core_with_mock_sdk, tmp_path, no_sleep):
manifest = tmp_path / "package.json"
manifest.write_text("{}")
core_with_mock_sdk.sdk.fullscans.post.return_value = _success_response()

full_scan = core_with_mock_sdk.create_full_scan([str(manifest)], MagicMock())

assert full_scan.id == "scan-1"
assert core_with_mock_sdk.sdk.fullscans.post.call_count == 1
no_sleep.assert_not_called()


def test_upload_retries_on_502_then_succeeds(
core_with_mock_sdk, tmp_path, no_sleep, caplog
):
manifest = tmp_path / "package.json"
manifest.write_text("{}")
core_with_mock_sdk.sdk.fullscans.post.side_effect = [
APIBadGateway(),
APIBadGateway(),
_success_response(),
]

with caplog.at_level(logging.WARNING, logger="socketdev"):
full_scan = core_with_mock_sdk.create_full_scan([str(manifest)], MagicMock())

assert full_scan.id == "scan-1"
assert core_with_mock_sdk.sdk.fullscans.post.call_count == 3
assert no_sleep.call_count == 2 # waits before attempts 2 and 3
retry_warnings = [r for r in caplog.records if "retrying in" in r.message]
assert len(retry_warnings) == 2
assert "APIBadGateway" in retry_warnings[0].message
assert f"(attempt 2/{FULL_SCAN_UPLOAD_MAX_ATTEMPTS})" in retry_warnings[0].message


def test_upload_raises_after_exhausting_attempts(
core_with_mock_sdk, tmp_path, no_sleep
):
manifest = tmp_path / "package.json"
manifest.write_text("{}")
core_with_mock_sdk.sdk.fullscans.post.side_effect = APIBadGateway()

with pytest.raises(APIBadGateway):
core_with_mock_sdk.create_full_scan([str(manifest)], MagicMock())

assert (
core_with_mock_sdk.sdk.fullscans.post.call_count
== FULL_SCAN_UPLOAD_MAX_ATTEMPTS
)


@pytest.mark.parametrize("status_code", [408, 503, 504])
def test_upload_retries_on_catch_all_transient_statuses(
core_with_mock_sdk, tmp_path, no_sleep, status_code
):
manifest = tmp_path / "package.json"
manifest.write_text("{}")
core_with_mock_sdk.sdk.fullscans.post.side_effect = [
_catch_all_failure(status_code),
_success_response(),
]

full_scan = core_with_mock_sdk.create_full_scan([str(manifest)], MagicMock())

assert full_scan.id == "scan-1"
assert core_with_mock_sdk.sdk.fullscans.post.call_count == 2


@pytest.mark.parametrize("error_class", [APIConnectionError, APITimeout])
def test_upload_retries_on_connection_level_errors(
core_with_mock_sdk, tmp_path, no_sleep, error_class
):
manifest = tmp_path / "package.json"
manifest.write_text("{}")
core_with_mock_sdk.sdk.fullscans.post.side_effect = [
error_class(),
_success_response(),
]

full_scan = core_with_mock_sdk.create_full_scan([str(manifest)], MagicMock())

assert full_scan.id == "scan-1"
assert core_with_mock_sdk.sdk.fullscans.post.call_count == 2


def test_upload_does_not_retry_on_400(core_with_mock_sdk, tmp_path, no_sleep):
manifest = tmp_path / "package.json"
manifest.write_text("{}")
core_with_mock_sdk.sdk.fullscans.post.side_effect = _catch_all_failure(400)

with pytest.raises(APIFailure):
core_with_mock_sdk.create_full_scan([str(manifest)], MagicMock())

assert core_with_mock_sdk.sdk.fullscans.post.call_count == 1
no_sleep.assert_not_called()


@pytest.mark.parametrize(
"error_class,status_code", [(APIAccessDenied, 401), (APIResourceNotFound, 404)]
)
def test_upload_does_not_retry_on_dedicated_4xx_classes(
core_with_mock_sdk, tmp_path, no_sleep, error_class, status_code
):
manifest = tmp_path / "package.json"
manifest.write_text("{}")
core_with_mock_sdk.sdk.fullscans.post.side_effect = error_class(
status_code=status_code
)

with pytest.raises(error_class):
core_with_mock_sdk.create_full_scan([str(manifest)], MagicMock())

assert core_with_mock_sdk.sdk.fullscans.post.call_count == 1
no_sleep.assert_not_called()


def test_upload_does_not_retry_on_error_payload(core_with_mock_sdk, tmp_path, no_sleep):
# A response that came back but reports failure (res.success False) is not transient.
manifest = tmp_path / "package.json"
manifest.write_text("{}")
failed = MagicMock()
failed.success = False
failed.message = "tarball too large"
failed.status = 200
core_with_mock_sdk.sdk.fullscans.post.return_value = failed

with pytest.raises(Exception, match="tarball too large"):
core_with_mock_sdk.create_full_scan([str(manifest)], MagicMock())

assert core_with_mock_sdk.sdk.fullscans.post.call_count == 1
no_sleep.assert_not_called()


def test_temp_br_file_survives_retries_and_is_cleaned_after(
core_with_mock_sdk, tmp_path, no_sleep
):
# The brotli-compressed facts sibling must stay on disk across every retry attempt
# (the SDK re-reads it per call) and only be deleted once all attempts finished.
facts = tmp_path / SOCKET_FACTS_FILENAME
facts.write_text('{"components": []}')
compressed = tmp_path / SOCKET_FACTS_BROTLI_FILENAME
br_present_per_attempt = []

def post_side_effect(upload_files, *args, **kwargs):
br_present_per_attempt.append(compressed.is_file())
assert str(compressed) in upload_files
if len(br_present_per_attempt) < 3:
raise APIBadGateway()
return _success_response()

core_with_mock_sdk.sdk.fullscans.post.side_effect = post_side_effect

full_scan = core_with_mock_sdk.create_full_scan([str(facts)], MagicMock())

assert full_scan.id == "scan-1"
assert br_present_per_attempt == [True, True, True]
assert not compressed.exists() # cleaned up after the attempts finished
assert facts.is_file() # the original facts file is never touched


def test_temp_br_file_cleaned_after_exhausted_retries(
core_with_mock_sdk, tmp_path, no_sleep
):
facts = tmp_path / SOCKET_FACTS_FILENAME
facts.write_text('{"components": []}')
compressed = tmp_path / SOCKET_FACTS_BROTLI_FILENAME
core_with_mock_sdk.sdk.fullscans.post.side_effect = APIBadGateway()

with pytest.raises(APIBadGateway):
core_with_mock_sdk.create_full_scan([str(facts)], MagicMock())

assert (
core_with_mock_sdk.sdk.fullscans.post.call_count
== FULL_SCAN_UPLOAD_MAX_ATTEMPTS
)
assert not compressed.exists()


class _StubFailure(APIFailure):
"""An APIFailure whose transience is fixed, regardless of class or status code."""

def __init__(self, transient: bool):
super().__init__("stub failure")
self._transient = transient

def is_transient_error(self) -> bool:
return self._transient


@pytest.mark.parametrize("transient,expected_calls", [(True, 2), (False, 1)])
def test_retry_decision_delegates_to_sdk_classification(
core_with_mock_sdk, tmp_path, no_sleep, transient, expected_calls
):
# The CLI encodes no knowledge of the SDK's exception hierarchy or status codes:
# the retry decision is exactly APIFailure.is_transient_error(). (The transient /
# non-transient truth table itself is tested in the SDK, next to the code that
# raises the exceptions.)
manifest = tmp_path / "package.json"
manifest.write_text("{}")
core_with_mock_sdk.sdk.fullscans.post.side_effect = [
_StubFailure(transient),
_success_response(),
]

if transient:
full_scan = core_with_mock_sdk.create_full_scan([str(manifest)], MagicMock())
assert full_scan.id == "scan-1"
else:
with pytest.raises(_StubFailure):
core_with_mock_sdk.create_full_scan([str(manifest)], MagicMock())

assert core_with_mock_sdk.sdk.fullscans.post.call_count == expected_calls
Loading
Loading