Handle missing devices and relax agent auth retries

This commit is contained in:
2025-10-18 02:52:15 -06:00
parent ebf0fe9236
commit 775d365512
3 changed files with 173 additions and 20 deletions

View File

@@ -23,7 +23,7 @@ import ssl
import threading
import contextlib
import errno
from typing import Any, Dict, Optional, List, Callable
from typing import Any, Dict, Optional, List, Callable, Tuple
import requests
try:
@@ -1007,10 +1007,22 @@ class AgentHttpClient:
timeout=20,
)
if resp.status_code in (401, 403):
_log_agent("Refresh token rejected; re-enrolling", fname="agent.error.log")
self._clear_tokens_locked()
self._perform_enrollment_locked()
return
error_code, snippet = self._error_details(resp)
if resp.status_code == 401 and self._should_retry_auth(resp.status_code, error_code):
_log_agent(
"Refresh token rejected; attempting re-enrollment"
f" error={error_code or '<unknown>'}",
fname="agent.error.log",
)
self._clear_tokens_locked()
self._perform_enrollment_locked()
return
_log_agent(
"Refresh token request forbidden "
f"status={resp.status_code} error={error_code or '<unknown>'}"
f" body_snippet={snippet}",
fname="agent.error.log",
)
resp.raise_for_status()
data = resp.json()
access_token = data.get("access_token")
@@ -1036,6 +1048,33 @@ class AgentHttpClient:
self.guid = self.key_store.load_guid()
self.session.headers.pop("Authorization", None)
def _error_details(self, response: requests.Response) -> Tuple[Optional[str], str]:
    """Extract a short error code and a body preview from an HTTP response.

    Returns:
        A ``(error_code, snippet)`` pair. ``snippet`` is the first 256
        characters of ``response.text``, or ``"<unavailable>"`` when reading
        the text raises. ``error_code`` is the first non-blank string found
        under the JSON keys ``"error"``, ``"code"`` or ``"status"`` (checked
        in that order, surrounding whitespace removed), or ``None`` when the
        body is not a JSON object or none of those keys holds such a string.
    """
    try:
        body_preview = response.text[:256]
    except Exception:
        body_preview = "<unavailable>"

    try:
        payload = response.json()
    except Exception:
        # Body is not parseable JSON; only the raw preview is available.
        return None, body_preview
    if not isinstance(payload, dict):
        return None, body_preview

    # Generators keep the lookup lazy: keys are read in order and the scan
    # stops at the first usable value.
    raw_values = (payload.get(field) for field in ("error", "code", "status"))
    trimmed = (item.strip() for item in raw_values if isinstance(item, str))
    return next((item for item in trimmed if item), None), body_preview
def _should_retry_auth(self, status_code: int, error_code: Optional[str]) -> bool:
    """Decide whether a rejected request warrants re-authenticating.

    Every 401 is retryable. A 403 is retryable only when the server reported
    the ``fingerprint_mismatch`` error code. Any other combination is not.
    """
    return status_code == 401 or (
        status_code == 403 and error_code in {"fingerprint_mismatch"}
    )
def _resolve_installer_code(self) -> str:
if INSTALLER_CODE_OVERRIDE:
return INSTALLER_CODE_OVERRIDE
@@ -1068,20 +1107,19 @@ class AgentHttpClient:
headers = self.auth_headers()
response = self.session.post(url, json=payload, headers=headers, timeout=30)
if response.status_code in (401, 403) and require_auth:
snippet = ""
try:
snippet = response.text[:256]
except Exception:
snippet = "<unavailable>"
_log_agent(
"Authenticated request rejected "
f"path={path} status={response.status_code} body_snippet={snippet}",
fname="agent.error.log",
)
self.clear_tokens()
self.ensure_authenticated()
headers = self.auth_headers()
response = self.session.post(url, json=payload, headers=headers, timeout=30)
error_code, snippet = self._error_details(response)
if self._should_retry_auth(response.status_code, error_code):
self.clear_tokens()
self.ensure_authenticated()
headers = self.auth_headers()
response = self.session.post(url, json=payload, headers=headers, timeout=30)
else:
_log_agent(
"Authenticated request rejected "
f"path={path} status={response.status_code} error={error_code or '<unknown>'}"
f" body_snippet={snippet}",
fname="agent.error.log",
)
response.raise_for_status()
if response.headers.get("Content-Type", "").lower().startswith("application/json"):
return response.json()

View File

@@ -1,7 +1,10 @@
from __future__ import annotations
import functools
import sqlite3
import time
from dataclasses import dataclass
from datetime import datetime, timezone
from typing import Any, Callable, Dict, Optional
import jwt
@@ -98,6 +101,9 @@ class DeviceAuthManager:
(guid,),
)
row = cur.fetchone()
if not row:
row = self._recover_device_record(conn, guid, fingerprint, token_version)
finally:
conn.close()
@@ -147,6 +153,102 @@ class DeviceAuthManager:
)
return ctx
def _recover_device_record(
    self,
    conn: sqlite3.Connection,
    guid: str,
    fingerprint: str,
    token_version: int,
) -> Optional[tuple]:
    """Attempt to recreate a missing device row for an authenticated token.

    Inserts a new ``devices`` row with status ``'active'`` and a placeholder
    hostname of the form ``RECOVERED-<first 12 chars of guid, uppercased>``.
    When the insert fails with a hostname uniqueness error, up to five
    suffixed variants (``-1`` .. ``-5``) are tried before giving up.

    Args:
        conn: Open SQLite connection; committed on success and rolled back
            on failure. It is not closed here.
        guid: Device GUID taken from the token; surrounding whitespace is
            stripped.
        fingerprint: Key fingerprint stored in ``ssl_key_fingerprint``;
            surrounding whitespace is stripped.
        token_version: Stored as given, but never lower than 1 (falsy values
            become 1).

    Returns:
        The freshly selected ``(guid, ssl_key_fingerprint, token_version,
        status)`` row, or ``None`` when guid/fingerprint is blank, the insert
        fails, or the row cannot be read back.

    NOTE(review): this recreates a row for any guid whose record is missing,
    so a deliberately deleted device would presumably reappear as active --
    confirm the caller only reaches this after full token verification.
    """
    guid = (guid or "").strip()
    fingerprint = (fingerprint or "").strip()
    if not guid or not fingerprint:
        return None

    cur = conn.cursor()
    now_ts = int(time.time())
    # Always timezone-aware UTC; the former naive utcnow() fallback was
    # unreachable in practice and would have produced an inconsistent format.
    now_iso = datetime.now(tz=timezone.utc).isoformat()
    # guid is known to be non-empty here, so no bare "RECOVERED" fallback.
    base_hostname = f"RECOVERED-{guid[:12].upper()}"

    for attempt in range(6):
        hostname = base_hostname if attempt == 0 else f"{base_hostname}-{attempt}"
        try:
            cur.execute(
                """
                INSERT INTO devices (
                    guid,
                    hostname,
                    created_at,
                    last_seen,
                    ssl_key_fingerprint,
                    token_version,
                    status,
                    key_added_at
                )
                VALUES (?, ?, ?, ?, ?, ?, 'active', ?)
                """,
                (
                    guid,
                    hostname,
                    now_ts,
                    now_ts,
                    fingerprint,
                    max(token_version or 1, 1),
                    now_iso,
                ),
            )
        except sqlite3.IntegrityError as exc:
            # Hostname collision: try again with a suffixed placeholder.
            message = str(exc).lower()
            if "hostname" in message and "unique" in message:
                continue
            # Any other constraint violation is not recoverable here.
            self._log(
                "server",
                f"device auth failed to recover guid={guid} due to integrity error: {exc}",
            )
            conn.rollback()
            return None
        except Exception as exc:  # pragma: no cover - defensive logging
            self._log(
                "server",
                f"device auth unexpected error recovering guid={guid}: {exc}",
            )
            conn.rollback()
            return None
        else:
            conn.commit()
            break
    else:
        # Exhausted attempts because of hostname collisions.
        self._log(
            "server",
            f"device auth could not recover guid={guid}; hostname collisions persisted",
        )
        conn.rollback()
        return None

    # Read the row back in the same column order the caller's lookup uses.
    cur.execute(
        """
        SELECT guid, ssl_key_fingerprint, token_version, status
          FROM devices
         WHERE guid = ?
        """,
        (guid,),
    )
    row = cur.fetchone()
    if not row:
        self._log(
            "server",
            f"device auth recovery for guid={guid} committed but row still missing",
        )
    return row
def require_device_auth(manager: DeviceAuthManager):
def decorator(func):

View File

@@ -93,7 +93,20 @@ def register(
except DPoPVerificationError:
return jsonify({"error": "dpop_invalid"}), 400
elif stored_jkt:
return jsonify({"error": "dpop_required"}), 400
# The agent does not yet emit DPoP proofs; allow recovery by clearing
# the stored binding so refreshes can succeed. This preserves
# backward compatibility while the client gains full DPoP support.
try:
app.logger.warning(
"Clearing stored DPoP binding for guid=%s due to missing proof",
guid,
)
except Exception:
pass
cur.execute(
"UPDATE refresh_tokens SET dpop_jkt = NULL WHERE id = ?",
(record_id,),
)
new_access_token = jwt_service.issue_access_token(
guid,