Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

main exit #912

Merged
merged 3 commits into from
Oct 23, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .github/workflows/build-container.yml
Original file line number Diff line number Diff line change
Expand Up @@ -538,7 +538,7 @@ jobs:
strategy:
fail-fast: false
matrix:
test: ["sanity", "ns_lb_change", "no_subsystems", "state_transitions", "state_transitions_both_gws", "state_transitions_loop", "state_transitions_rand_loop", "late_registration", "late_registration_loop", "4gws", "4gws_loop", "4gws_create_delete", "4gws_create_delete_loop", "namespaces", "namespaces_loop", "mtls", "notify", "ceph_status", "blocklist"]
test: ["sanity", "ns_lb_change", "no_subsystems", "state_transitions", "state_transitions_both_gws", "state_transitions_loop", "state_transitions_rand_loop", "late_registration", "late_registration_loop", "4gws", "4gws_loop", "4gws_create_delete", "4gws_create_delete_loop", "namespaces", "namespaces_loop", "mtls", "notify", "ceph_status", "blocklist", "main_exit"]
runs-on: ubuntu-latest
env:
HUGEPAGES: 1024 # 4 spdk instances
Expand Down
14 changes: 3 additions & 11 deletions control/__main__.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,14 +8,10 @@
#

import argparse
import signal
from .server import GatewayServer
from .config import GatewayConfig
from .utils import GatewayLogger

def sigterm_handler(signum, frame):
    """Signal handler that terminates the process by raising SystemExit.

    Args:
        signum: number of the received signal (unused).
        frame: current stack frame passed by the signal machinery (unused).

    Raises:
        SystemExit: always, carrying the message "Gateway process terminated".
    """
    # Plain string literal: the message has no placeholders, so no f-prefix.
    raise SystemExit("Gateway process terminated")

if __name__ == '__main__':
parser = argparse.ArgumentParser(prog="python3 -m control",
description="Manage NVMe gateways",
Expand All @@ -32,10 +28,6 @@ def sigterm_handler(signum, frame):
gw_logger = GatewayLogger(config)
config.display_environment_info(gw_logger.logger)
config.dump_config_file(gw_logger.logger)
try:
with GatewayServer(config) as gateway:
signal.signal(signal.SIGTERM, sigterm_handler)
gateway.serve()
gateway.keep_alive()
except SystemExit:
pass
with GatewayServer(config) as gateway:
gateway.serve()
gateway.keep_alive()
9 changes: 9 additions & 0 deletions control/server.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,12 @@
from .cephutils import CephUtils
from .prometheus import start_exporter

def sigterm_handler(signum, frame):
    """Handle SIGTERM, which arrives when a gateway is terminated gracefully.

    Logs the received signal number through the gateway logger and then
    ends the process with exit status 0.

    Args:
        signum: number of the received signal; included in the log line.
        frame: current stack frame passed by the signal machinery (unused).

    Raises:
        SystemExit: always, with code 0.
    """
    GatewayLogger().logger.info(f"GatewayServer: SIGTERM received {signum=}")
    raise SystemExit(0)

def sigchld_handler(signum, frame):
"""Handle SIGCHLD, runs when a child process, like the spdk, terminates."""
logger = GatewayLogger().logger
Expand Down Expand Up @@ -205,6 +211,9 @@ def serve(self):
# install SIGCHLD handler
signal.signal(signal.SIGCHLD, sigchld_handler)

# install SIGTERM handler
signal.signal(signal.SIGTERM, sigterm_handler)

# Start monitor client
self._start_monitor_client()

Expand Down
111 changes: 111 additions & 0 deletions tests/ha/main_exit.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,111 @@
#!/bin/sh
# HA test script: starts the nvmeof gateway, sends a signal to a process inside
# the gateway container from a background task, and checks the container's
# exit code.

# -x: trace every command as it runs; -e: abort on the first failing command.
set -xe
# Number of nvmeof gateway containers started via `docker compose up --scale`.
SCALE=1
# Pool name passed to `ceph nvme-gw create`; falls back to "rbd" when RBD_POOL
# is unset or empty.
POOL="${RBD_POOL:-rbd}"

# background_task: meant to run in the background while `docker compose up`
# holds the foreground. Steps:
#   1. wait until the ceph container reports a healthy status;
#   2. register the single gateway with `ceph nvme-gw create`;
#   3. wait until the gateway container is running and answers get_subsystems;
#   4. send $SIGNAL to the process whose comm matches 'ceph-nvmeof-mon'
#      inside the gateway container.
# Exits with status 1 when that process cannot be found.
background_task() {

    # Give gateway some time
    sleep 5

    # Waiting for the ceph container to become healthy
    while true; do
        container_status=$(docker inspect --format='{{.State.Health.Status}}' ceph)
        if [ "$container_status" = "healthy" ]; then
            # success
            break
        else
            # Wait for a specific time before checking again
            sleep 1
            printf .
        fi
    done
    echo ✅ ceph is healthy

    echo ℹ️ Running processes of services
    docker compose top

    echo ℹ️ Send nvme-gw create for all gateways
    GW_GROUP=''
    i=1 # a single gw index
    GW_NAME=$(docker ps --format '{{.ID}}\t{{.Names}}' | grep -v discovery | awk '$2 ~ /nvmeof/ && $2 ~ /'$i'/ {print $1}')
    echo 📫 nvme-gw create gateway: \'$GW_NAME\' pool: \'$POOL\', group: \'$GW_GROUP\'
    docker compose exec -T ceph ceph nvme-gw create $GW_NAME $POOL "$GW_GROUP"

    echo ℹ️ Wait for gateway to be ready
    while true; do
        sleep 1 # Adjust the sleep duration as needed
        container_status=$(docker inspect -f '{{.State.Status}}' "$GW_NAME")
        # POSIX `[` only defines `=` for string comparison; `==` is a bashism
        # and fails under shells such as dash when run via #!/bin/sh.
        if [ "$container_status" = "running" ]; then
            echo "Container $i $GW_NAME is now running."
        else
            echo "Container $i $GW_NAME is still not running. Waiting..."
            continue
        fi
        GW_IP="$(docker inspect -f '{{range .NetworkSettings.Networks}}{{.IPAddress}}{{end}}' "$GW_NAME")"
        # $CLI_TLS_ARGS is intentionally unquoted so it can expand to several arguments
        if docker compose run --rm nvmeof-cli $CLI_TLS_ARGS --server-address $GW_IP --server-port 5500 get_subsystems 2>&1 | grep -i failed; then
            echo "Container $i $GW_NAME $GW_IP no subsystems. Waiting..."
            continue
        fi
        echo "Container $i $GW_NAME $GW_IP subsystems:"
        docker compose run --rm nvmeof-cli $CLI_TLS_ARGS --server-address $GW_IP --server-port 5500 get_subsystems
        break
    done

    # Signal delivered to the monitor client process inside the gateway container
    SIGNAL="SIGABRT"

    # Get the PID of monitor_client inside the container by scanning /proc/*/comm
    # for the 'ceph-nvmeof-mon' process name; only the first match is used.
    PID=$(docker exec "$GW_NAME" sh -c "for pid in /proc/*; do
        if [ -f \"\$pid/comm\" ] && grep -q 'ceph-nvmeof-mon' \"\$pid/comm\"; then
            echo \$(basename \$pid)
            break
        fi
    done")

    if [ -n "$PID" ]; then
        echo "ℹ️ Sending $SIGNAL to monitor_client (PID: $PID) in $GW_NAME..."
        docker exec "$GW_NAME" kill -s "$SIGNAL" "$PID"
    else
        echo "❌ monitor_client process not found in $GW_NAME."
        exit 1
    fi

}

##
## MAIN
##

background_task &
TASK_PID=$! # Capture the PID of the background task

# Index of the single gateway. background_task assigns its own `i`, but it runs
# in a background subshell, so that assignment is never visible here; without
# this line the awk name filter below would degrade to the match-all `//`.
i=1

echo ℹ️ Starting $SCALE nvmeof gateways
docker compose up --remove-orphans --scale nvmeof=$SCALE nvmeof
# `docker ps -a` so that the gateway container is also listed after it has stopped
GW_NAME=$(docker ps -a --format '{{.ID}}\t{{.Names}}' | grep -v discovery | awk '$2 ~ /nvmeof/ && $2 ~ /'$i'/ {print $1}')
docker inspect "$GW_NAME"
exit_code=$(docker inspect --format='{{.State.ExitCode}}' "$GW_NAME")

# expect exit code 1
if [ "$exit_code" -eq 1 ]; then
    echo ✅ gateway returned exit code 1, exiting with success.
else
    echo ❌ gateway returned exit code $exit_code, exiting with failure.
    exit 1 # Failure exit code
fi

# Wait for the background task to finish and capture its exit code.
# The `|| ...` form is required: under `set -e` a bare `wait` that returns
# non-zero would abort the script before the status could be reported below.
background_task_exit_code=0
wait $TASK_PID || background_task_exit_code=$?

# Check the exit code and print the result
if [ "$background_task_exit_code" -eq 0 ]; then
    echo ✅ background task completed successfully
else
    echo ❌ background task failed with exit code: $background_task_exit_code
fi

# Exit with the same code as the background task
exit $background_task_exit_code


2 changes: 2 additions & 0 deletions tests/ha/setup_main_exit.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
# Placeholder script: performs no work and only prints a notice that setup is
# skipped. (Presumably the setup step for the main_exit test, per the file name.)
set -e
echo ℹ️ Skipping setup for this test
2 changes: 2 additions & 0 deletions tests/ha/start_up_main_exit.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
# Placeholder script: performs no work and only prints a notice that start up
# is skipped. (Presumably the start-up step for the main_exit test, per the file name.)
set -e
echo ℹ️ Skipping start up for this test
4 changes: 3 additions & 1 deletion tests/ha/wait_gateways.sh
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
#!/bin/sh
set -ex
SCALE=2
echo CLI_TLS_ARGS $CLI_TLS_ARGS
# Check if argument is provided
Expand All @@ -16,7 +18,7 @@ for i in $(seq $SCALE); do
sleep 1 # Adjust the sleep duration as needed
GW_NAME=$(docker ps --format '{{.ID}}\t{{.Names}}' | awk '$2 ~ /nvmeof/ && $2 ~ /'$i'/ {print $1}')
container_status=$(docker inspect -f '{{.State.Status}}' "$GW_NAME")
if [ "$container_status" == "running" ]; then
if [ "$container_status" = "running" ]; then
echo "Container $i $GW_NAME is now running."
else
echo "Container $i $GW_NAME is still not running. Waiting..."
Expand Down
2 changes: 2 additions & 0 deletions tests/ha/wait_gateways_main_exit.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
# Placeholder script: performs no work and only prints a notice that waiting
# for the gateways is skipped. (Presumably the wait step for the main_exit
# test, per the file name.)
set -e
echo ℹ️ Skipping wait gateways up for this test
Loading