From 95af819d3d0c6dc4ea786029aa142efbabcdde94 Mon Sep 17 00:00:00 2001 From: Alexander Indenbaum Date: Tue, 22 Oct 2024 07:48:15 +0000 Subject: [PATCH 1/3] main exit 1. Do not catch System exit 2. Return exit code 0 from TERM signal handler partial revert of - 6c120cf0 - 042cf8c6 Signed-off-by: Alexander Indenbaum --- control/__main__.py | 14 +++----------- control/server.py | 9 +++++++++ 2 files changed, 12 insertions(+), 11 deletions(-) diff --git a/control/__main__.py b/control/__main__.py index c9742dfd..5ec1c091 100644 --- a/control/__main__.py +++ b/control/__main__.py @@ -8,14 +8,10 @@ # import argparse -import signal from .server import GatewayServer from .config import GatewayConfig from .utils import GatewayLogger -def sigterm_handler(signum, frame): - raise SystemExit(f"Gateway process terminated") - if __name__ == '__main__': parser = argparse.ArgumentParser(prog="python3 -m control", description="Manage NVMe gateways", @@ -32,10 +28,6 @@ def sigterm_handler(signum, frame): gw_logger = GatewayLogger(config) config.display_environment_info(gw_logger.logger) config.dump_config_file(gw_logger.logger) - try: - with GatewayServer(config) as gateway: - signal.signal(signal.SIGTERM, sigterm_handler) - gateway.serve() - gateway.keep_alive() - except SystemExit: - pass + with GatewayServer(config) as gateway: + gateway.serve() + gateway.keep_alive() diff --git a/control/server.py b/control/server.py index 27073434..9be0673f 100644 --- a/control/server.py +++ b/control/server.py @@ -36,6 +36,12 @@ from .cephutils import CephUtils from .prometheus import start_exporter +def sigterm_handler(signum, frame): + """Handle SIGTERM, runs when a gateway is terminated gracefully.""" + logger = GatewayLogger().logger + logger.info(f"GatewayServer: SIGTERM received {signum=}") + raise SystemExit(0) + def sigchld_handler(signum, frame): """Handle SIGCHLD, runs when a child process, like the spdk, terminates.""" logger = GatewayLogger().logger @@ -205,6 +211,9 @@ def 
serve(self): # install SIGCHLD handler signal.signal(signal.SIGCHLD, sigchld_handler) + # install SIGTERM handler + signal.signal(signal.SIGTERM, sigterm_handler) + # Start monitor client self._start_monitor_client() From e2eecc524d0144bbb16f8ba19342d7a6c16cefab Mon Sep 17 00:00:00 2001 From: Alexander Indenbaum Date: Tue, 22 Oct 2024 08:41:46 +0000 Subject: [PATCH 2/3] initial main exit system test Signed-off-by: Alexander Indenbaum --- .github/workflows/build-container.yml | 2 +- tests/ha/main_exit.sh | 111 ++++++++++++++++++++++++++ tests/ha/setup_main_exit.sh | 2 + tests/ha/start_up_main_exit.sh | 2 + tests/ha/wait_gateways_main_exit.sh | 2 + 5 files changed, 118 insertions(+), 1 deletion(-) create mode 100755 tests/ha/main_exit.sh create mode 100755 tests/ha/setup_main_exit.sh create mode 100755 tests/ha/start_up_main_exit.sh create mode 100755 tests/ha/wait_gateways_main_exit.sh diff --git a/.github/workflows/build-container.yml b/.github/workflows/build-container.yml index 112918d3..c8843ca9 100644 --- a/.github/workflows/build-container.yml +++ b/.github/workflows/build-container.yml @@ -538,7 +538,7 @@ jobs: strategy: fail-fast: false matrix: - test: ["sanity", "ns_lb_change", "no_subsystems", "state_transitions", "state_transitions_both_gws", "state_transitions_loop", "state_transitions_rand_loop", "late_registration", "late_registration_loop", "4gws", "4gws_loop", "4gws_create_delete", "4gws_create_delete_loop", "namespaces", "namespaces_loop", "mtls", "notify", "ceph_status", "blocklist"] + test: ["sanity", "ns_lb_change", "no_subsystems", "state_transitions", "state_transitions_both_gws", "state_transitions_loop", "state_transitions_rand_loop", "late_registration", "late_registration_loop", "4gws", "4gws_loop", "4gws_create_delete", "4gws_create_delete_loop", "namespaces", "namespaces_loop", "mtls", "notify", "ceph_status", "blocklist", "main_exit"] runs-on: ubuntu-latest env: HUGEPAGES: 1024 # 4 spdk instances diff --git a/tests/ha/main_exit.sh 
b/tests/ha/main_exit.sh
new file mode 100755
index 00000000..3502f861
--- /dev/null
+++ b/tests/ha/main_exit.sh
@@ -0,0 +1,111 @@
+#!/bin/sh
+set -xe
+SCALE=1
+POOL="${RBD_POOL:-rbd}"
+i=1 # a single gw index; set at top level so background_task and MAIN both see it
+
+background_task() {
+
+  # Give gateway some time
+  sleep 5
+
+  # Waiting for the ceph container to become healthy
+  while true; do
+    container_status=$(docker inspect --format='{{.State.Health.Status}}' ceph)
+    if [ "$container_status" = "healthy" ]; then
+      # success
+      break
+    else
+      # Wait for a specific time before checking again
+      sleep 1
+      printf .
+    fi
+  done
+  echo ✅ ceph is healthy
+
+  echo ℹ️ Running processes of services
+  docker compose top
+
+  echo ℹ️ Send nvme-gw create for all gateways
+  GW_GROUP=''
+  GW_NAME=$(docker ps --format '{{.ID}}\t{{.Names}}' | grep -v discovery | awk '$2 ~ /nvmeof/ && $2 ~ /'$i'/ {print $1}')
+  echo 📫 nvme-gw create gateway: \'$GW_NAME\' pool: \'$POOL\', group: \'$GW_GROUP\'
+  docker compose exec -T ceph ceph nvme-gw create $GW_NAME $POOL "$GW_GROUP"
+
+  echo ℹ️ Wait for gateway to be ready
+  while true; do
+    sleep 1 # Adjust the sleep duration as needed
+    container_status=$(docker inspect -f '{{.State.Status}}' "$GW_NAME")
+    if [ "$container_status" = "running" ]; then # POSIX '='; '==' is not a portable test operator under /bin/sh
+      echo "Container $i $GW_NAME is now running."
+    else
+      echo "Container $i $GW_NAME is still not running. Waiting..."
+      continue
+    fi
+    GW_IP="$(docker inspect -f '{{range .NetworkSettings.Networks}}{{.IPAddress}}{{end}}' "$GW_NAME")"
+    if docker compose run --rm nvmeof-cli $CLI_TLS_ARGS --server-address $GW_IP --server-port 5500 get_subsystems 2>&1 | grep -i failed; then
+      echo "Container $i $GW_NAME $GW_IP no subsystems. Waiting..."
+      continue
+    fi
+    echo "Container $i $GW_NAME $GW_IP subsystems:"
+    docker compose run --rm nvmeof-cli $CLI_TLS_ARGS --server-address $GW_IP --server-port 5500 get_subsystems
+    break
+  done
+
+  # Signal to send (e.g., SIGTERM or SIGKILL)
+  SIGNAL="SIGABRT"
+
+  # Get the PID of monitor_client inside the container
+  PID=$(docker exec "$GW_NAME" sh -c "for pid in /proc/*; do
+    if [ -f \"\$pid/comm\" ] && grep -q 'ceph-nvmeof-mon' \"\$pid/comm\"; then
+      echo \$(basename \$pid)
+      break
+    fi
+  done")
+
+  if [ -n "$PID" ]; then
+    echo "ℹ️ Sending $SIGNAL to monitor_client (PID: $PID) in $GW_NAME..."
+    docker exec "$GW_NAME" kill -s "$SIGNAL" "$PID"
+  else
+    echo "❌ monitor_client process not found in $GW_NAME."
+    exit 1
+  fi
+
+}
+
+##
+## MAIN
+##
+
+background_task &
+TASK_PID=$! # Capture the PID of the background task
+
+echo ℹ️ Starting $SCALE nvmeof gateways
+docker compose up --remove-orphans --scale nvmeof=$SCALE nvmeof
+GW_NAME=$(docker ps -a --format '{{.ID}}\t{{.Names}}' | grep -v discovery | awk '$2 ~ /nvmeof/ && $2 ~ /'$i'/ {print $1}')
+docker inspect "$GW_NAME"
+exit_code=$(docker inspect --format='{{.State.ExitCode}}' "$GW_NAME")
+
+# expect exit code 1
+if [ $exit_code -eq 1 ]; then
+  echo ✅ gateway returned exit code 1, exiting with success.
+else
+  echo ❌ gateway returned exit code $exit_code, exiting with failure.
+  exit 1 # Failure exit code
+fi
+
+# Wait for the background task; '||' keeps 'set -e' from aborting before its status is captured
+background_task_exit_code=0
+wait $TASK_PID || background_task_exit_code=$?
+
+# Check the exit code and print the result
+if [ $background_task_exit_code -eq 0 ]; then
+  echo ✅ background task completed successfully
+else
+  echo ❌ background task failed with exit code: $background_task_exit_code
+fi
+
+# Exit with the same code as the background task
+exit $background_task_exit_code
+
+
diff --git a/tests/ha/setup_main_exit.sh b/tests/ha/setup_main_exit.sh
new file mode 100755
index 00000000..3db92b9e
--- /dev/null
+++ b/tests/ha/setup_main_exit.sh
@@ -0,0 +1,2 @@
+set -e
+echo ℹ️ Skipping setup for this test
diff --git a/tests/ha/start_up_main_exit.sh b/tests/ha/start_up_main_exit.sh
new file mode 100755
index 00000000..4e71dfbc
--- /dev/null
+++ b/tests/ha/start_up_main_exit.sh
@@ -0,0 +1,2 @@
+set -e
+echo ℹ️ Skipping start up for this test
diff --git a/tests/ha/wait_gateways_main_exit.sh b/tests/ha/wait_gateways_main_exit.sh
new file mode 100755
index 00000000..aabd3a85
--- /dev/null
+++ b/tests/ha/wait_gateways_main_exit.sh
@@ -0,0 +1,2 @@
+set -e
+echo ℹ️ Skipping wait gateways up for this test

From 88d9021b2dea43915da34f1a5f387daed2df1178 Mon Sep 17 00:00:00 2001
From: Alexander Indenbaum
Date: Tue, 22 Oct 2024 10:11:49 +0000
Subject: [PATCH 3/3] tweak wait for gateways ha test

Signed-off-by: Alexander Indenbaum
---
 tests/ha/wait_gateways.sh | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/tests/ha/wait_gateways.sh b/tests/ha/wait_gateways.sh
index d956b334..6edc2f6c 100755
--- a/tests/ha/wait_gateways.sh
+++ b/tests/ha/wait_gateways.sh
@@ -1,3 +1,5 @@
+#!/bin/sh
+set -ex
 SCALE=2
 echo CLI_TLS_ARGS $CLI_TLS_ARGS
 # Check if argument is provided
@@ -16,7 +18,7 @@ for i in $(seq $SCALE); do
     sleep 1 # Adjust the sleep duration as needed
     GW_NAME=$(docker ps --format '{{.ID}}\t{{.Names}}' | awk '$2 ~ /nvmeof/ && $2 ~ /'$i'/ {print $1}')
     container_status=$(docker inspect -f '{{.State.Status}}' "$GW_NAME")
-    if [ "$container_status" == "running" ]; then
+    if [ "$container_status" = "running" ]; then
       echo "Container $i $GW_NAME is now running."
     else
       echo "Container $i $GW_NAME is still not running. Waiting..."