diff --git a/Framework/deploy_handler/long_poll_handler.py b/Framework/deploy_handler/long_poll_handler.py index 7acc5e91..8f78267e 100644 --- a/Framework/deploy_handler/long_poll_handler.py +++ b/Framework/deploy_handler/long_poll_handler.py @@ -44,6 +44,26 @@ def __init__( self.backoff_time = 0 + def _current_server(self) -> str | None: + server = ( + ConfigModule.get_config_value("Authentication", "server_address") + .strip('"') + .strip() + ) + return server or None + + def _mark_deploy_connected(self) -> None: + STATE.connected_server = self._current_server() + STATE.target_server = None + STATE.connection_state = "connected" + STATE.last_connect_error = None + + def _mark_deploy_unavailable(self, message: str, failed: bool = False) -> None: + STATE.connected_server = None + STATE.target_server = self._current_server() + STATE.connection_state = "failed" if failed else "offline" + STATE.last_connect_error = message + async def on_message(self, message) -> bool: """Returns True if the handler should quit, False otherwise.""" if message == self.COMMAND_DONE: @@ -265,12 +285,20 @@ async def run(self, host: str) -> None: reconnect = True resp = await RequestFormatter.async_request("get", host, timeout=70) if resp is None: + self._mark_deploy_unavailable("Deploy service returned no response") break if resp.content.startswith(self.ERROR_PREFIX): + self._mark_deploy_unavailable( + resp.content.decode("utf-8", errors="replace"), + failed=True, + ) self.on_error(resp.content) continue + if resp.ok: + self._mark_deploy_connected() + if resp.ok and print_online: print_online = False node_id = CommonUtil.MachineInfo().getLocalUser().lower() @@ -280,6 +308,7 @@ async def run(self, host: str) -> None: continue if resp.status_code == httpx.codes.BAD_GATEWAY: + self._mark_deploy_unavailable("Deploy service returned 502 Bad Gateway") print_online = True print(Fore.YELLOW + "Server offline. Retrying after 30s") await asyncio.sleep(30) @@ -291,6 +320,9 @@ async def run(self, host: str) -> None: resp.status_code, "| reconnecting", ) + self._mark_deploy_unavailable( + f"Deploy service returned status code {resp.status_code}" + ) # Encountered a server error, retry. await asyncio.sleep(random.randint(1, 3)) @@ -301,11 +333,28 @@ async def run(self, host: str) -> None: break reconnect = False - except Exception as e: + except ( + requests.exceptions.ConnectTimeout, + requests.exceptions.ReadTimeout, + requests.exceptions.ConnectionError, + ) as e: + # Nginx down, VM down, network issue, docker-compose stopped if STATE.reconnect_with_credentials is not None: return None print_online = True + self._mark_deploy_unavailable(str(e) or "Deploy service connection error") print(e) print(Fore.YELLOW + "Retrying after 30s") await asyncio.sleep(30) + except Exception as e: + if STATE.reconnect_with_credentials is not None: + return None + print_online = True + self._mark_deploy_unavailable( + str(e) or "Unexpected deploy service error", + failed=True, + ) + print(e) + print(Fore.YELLOW + "Retrying after 30s") + await asyncio.sleep(30) diff --git a/Framework/node_server_state.py b/Framework/node_server_state.py index 6d4f57be..5ec07d7f 100644 --- a/Framework/node_server_state.py +++ b/Framework/node_server_state.py @@ -1,5 +1,8 @@ -from pydantic import BaseModel +from datetime import datetime, timezone from typing import Literal +from uuid import uuid4 + +from pydantic import BaseModel class LoginCredentials(BaseModel): @@ -9,6 +12,18 @@ class LoginCredentials(BaseModel): class ServerState(BaseModel): state: Literal["idle", "in_progress"] = "idle" + started_at: str = datetime.now(timezone.utc).isoformat() + instance_id: str = str(uuid4()) + connection_state: Literal[ + "disconnected", + "authenticating", + "connected", + "offline", + "failed", + ] = "disconnected" + connected_server: str | None = None + target_server: str | None = None + last_connect_error: str | None = None # Control variable to stop the next iteration of the deplopy service # connection loop. diff --git a/node_cli.py b/node_cli.py index e15c7612..8154712f 100755 --- a/node_cli.py +++ b/node_cli.py @@ -218,6 +218,9 @@ async def Login( log_dir: os.PathLike | None = None, ): console = Console() + STATE.target_server = server_name or None + STATE.last_connect_error = None + STATE.connection_state = "authenticating" if server_name else "disconnected" # Login to ZeuZ server. user_data = UserData( @@ -248,6 +251,11 @@ async def Login( # ConfigModule.add_config_value(AUTHENTICATION_TAG, "api-key", "dummy") if status_code == 200: + STATE.connected_server = None + STATE.target_server = server_name or None + STATE.connection_state = "authenticating" + STATE.last_connect_error = None + user_data = UserData( username=data["user"]["username"], email=data["user"]["email"], @@ -278,6 +286,10 @@ async def Login( console.print(table) elif status_code == 502: print(Fore.YELLOW + "Server offline. Retrying after 30s") + STATE.connected_server = None + STATE.target_server = server_name or None + STATE.connection_state = "offline" + STATE.last_connect_error = "Server offline" await asyncio.sleep(30) STATE.reconnect_with_credentials = LoginCredentials( server=ConfigModule.get_config_value(AUTHENTICATION_TAG, "server_address").strip('"').strip(), @@ -287,6 +299,10 @@ async def Login( else: line_color = Fore.RED print(line_color + "Incorrect credentials, please try again.") + STATE.connected_server = None + STATE.target_server = server_name or None + STATE.connection_state = "failed" + STATE.last_connect_error = "Incorrect credentials" # server_name, api = zeuz_authentication_prompts_for_cli() # api = api.strip('"') @@ -295,10 +311,18 @@ async def Login( return except ConnectionError: print("Failed to connect to the server, retrying after 30s") + STATE.connected_server = None + STATE.target_server = server_name or None + STATE.connection_state = "offline" + STATE.last_connect_error = "Failed to connect to the server" await asyncio.sleep(30) return except Exception as e: traceback.print_exc() + STATE.connected_server = None + STATE.target_server = server_name or None + STATE.connection_state = "failed" + STATE.last_connect_error = str(e) return node_id = CommonUtil.MachineInfo().getLocalUser().lower() @@ -1384,6 +1408,10 @@ async def main(): ) if len(server_name) == 0 and len(api) == 0: + STATE.connected_server = None + STATE.target_server = None + STATE.connection_state = "disconnected" + STATE.last_connect_error = None console.print( "\n" + ":red_circle: " + "Zeuz Node is disconnected.", style="bold red", @@ -1392,6 +1420,10 @@ async def main(): await asyncio.sleep(1) else: + STATE.connected_server = None + STATE.target_server = server_name + STATE.connection_state = "authenticating" + STATE.last_connect_error = None asyncio.create_task( Login( server_name=server_name, @@ -1400,9 +1432,13 @@ async def main(): ) while True: if STATE.reconnect_with_credentials is not None: - await destroy_session() server_name = STATE.reconnect_with_credentials.server api_key = STATE.reconnect_with_credentials.api_key + STATE.connected_server = None + STATE.target_server = server_name or None + STATE.connection_state = "authenticating" if server_name and api_key else "disconnected" + STATE.last_connect_error = None + await destroy_session() await set_new_credentials(server=server_name, api_key=api_key) STATE.reconnect_with_credentials = None @@ -1418,6 +1454,10 @@ async def main(): ) if len(server_name) == 0 and len(api) == 0: + STATE.connected_server = None + STATE.target_server = None + STATE.connection_state = "disconnected" + STATE.last_connect_error = None console.print( "\n" + ":red_circle: " + "Zeuz Node is disconnected.", style="bold red", diff --git a/server/connect.py b/server/connect.py index c2ff6b2b..3ff04c3e 100644 --- a/server/connect.py +++ b/server/connect.py @@ -15,10 +15,16 @@ class ConnectRequest(BaseModel): async def set_new_credentials(server: str, api_key: str): """Store new credentials in the settings file.""" + server = server.strip() + api_key = api_key.strip() STATE.reconnect_with_credentials = LoginCredentials( - server=server.strip(), - api_key=api_key.strip(), + server=server, + api_key=api_key, ) + STATE.connected_server = None + STATE.target_server = server or None + STATE.last_connect_error = None + STATE.connection_state = "authenticating" if server and api_key else "disconnected" @router.post("", status_code=200) diff --git a/server/status.py b/server/status.py index bb55ec54..1882edff 100644 --- a/server/status.py +++ b/server/status.py @@ -30,30 +30,24 @@ def _get_version() -> str | None: router = APIRouter(prefix="/status", tags=["status"]) -class StateExecutionDetail(BaseModel): - """Returns the current state of the execution in Node.""" - - runid: str - tc_id: str - step_sequence: int - action_sequence: int - variables: dict[str, str] - logs: list[str] - - -class ConnectionStateResponse(BaseModel): - """Returns the current state of the Node.""" - - connected_server: str - execution_detail: StateExecutionDetail | None = None - - class StatusResponse(BaseModel): """Returns the current state of the Node.""" state: Literal["idle", "in_progress"] node_id: str | None = None version: str | None = None + started_at: str | None = None + instance_id: str | None = None + connection_state: Literal[ + "disconnected", + "authenticating", + "connected", + "offline", + "failed", + ] = "disconnected" + connected_server: str | None = None + target_server: str | None = None + last_connect_error: str | None = None @router.get("") @@ -65,4 +59,14 @@ def status(): node_id = id except Exception: node_id = "unknown" - return StatusResponse(state=STATE.state, node_id=node_id, version=_get_version()) + return StatusResponse( + state=STATE.state, + node_id=node_id, + version=_get_version(), + started_at=STATE.started_at, + instance_id=STATE.instance_id, + connection_state=STATE.connection_state, + connected_server=STATE.connected_server, + target_server=STATE.target_server, + last_connect_error=STATE.last_connect_error, + )