diff --git a/CHANGES b/CHANGES index 0edd922fb..d2ab3b306 100644 --- a/CHANGES +++ b/CHANGES @@ -2949,4 +2949,5 @@ * mongodb: reduce verbosity of logs * redis: disable by default when optional * apps: fix issue where operations on stopped apps errored +* eventlog: Fix incorrect eventlog that the update crashed diff --git a/setup/start/systemd/box.service b/setup/start/systemd/box.service index 76d31f978..140205228 100644 --- a/setup/start/systemd/box.service +++ b/setup/start/systemd/box.service @@ -10,21 +10,23 @@ WantedBy=multi-user.target [Service] Type=idle WorkingDirectory=/home/yellowtent/box +User=yellowtent +Group=yellowtent Restart=always ExecStart=/home/yellowtent/box/box.js ExecReload=/bin/kill -HUP $MAINPID ; we run commands like df which will parse properly only with correct locale ; add "oidc-provider:*" to DEBUG for OpenID debugging Environment="HOME=/home/yellowtent" "USER=yellowtent" "DEBUG=box:*,connect-lastmile,-box:ldap" "BOX_ENV=cloudron" "NODE_ENV=production" "LC_ALL=C" -; kill apptask processes as well -KillMode=control-group +; this sends the main process SIGTERM and then if anything lingers to the control-group . this is also the case if the main process crashes. +; the box code handles SIGTERM and cleanups the tasks +KillMode=mixed +KillSignal=SIGTERM +TimeoutStopSec=5s +SendSIGKILL=yes ; Do not kill this process on OOM. Children inherit this score. Do not set it to -1000 so that MemoryMax can keep working OOMScoreAdjust=-999 -User=yellowtent -Group=yellowtent -; OOM killer is invoked in this unit beyond this. The start script replaces this with MemoryLimit for Ubuntu 16 MemoryMax=400M -TimeoutStopSec=5s StartLimitInterval=1 StartLimitBurst=60 diff --git a/src/platform.js b/src/platform.js index e035c5ce3..e6f11748b 100644 --- a/src/platform.js +++ b/src/platform.js @@ -165,7 +165,7 @@ async function initialize() { debug('initialize: start platform'); await database.initialize(); - await tasks.stopAllTasks(); + await tasks.stopAllTasks(); // when box code crashes, systemd will clean up the control-group but not the tasks await locks.releaseAll(); // always generate webadmin config since we have no versioning mechanism for the ejs @@ -191,7 +191,7 @@ async function uninitialize() { if (await users.isActivated()) await onDeactivated(); - await tasks.stopAllTasks(); + await tasks.stopAllTasks(); // when box code is stopped/restarted, we get a chance to cleanup all the sudo+tasks await database.uninitialize(); } diff --git a/src/scripts/starttask.sh b/src/scripts/starttask.sh index 25042ab1f..48a69bffe 100755 --- a/src/scripts/starttask.sh +++ b/src/scripts/starttask.sh @@ -35,13 +35,16 @@ systemctl reset-failed "${service_name}" 2>/dev/null || true options="-p TimeoutStopSec=10s -p MemoryMax=${memory_limit_mb}M -p OOMScoreAdjust=${oom_score_adjust} --wait" # systemd-run is used to create resource limited tasks. the tasks are in separate cgroup and won't get affected by box start/stop -# 1. tasks should stop when box code is stopped. in this state, dashboard us unreachable and don't want things in background. -# 2. if tasks continue running, box code needs some reconcilation code to track tasks. systemd has no mechanism to handle both stop and restart. -# when using BindsTo=box.service , the tasks restart with systemctl restart. This defeats any point of tasks running in background if they start afresh. +# this design is because tasks should not run when box code is stopped. if dashboard is down, nothing should run -in background. +# besides, if tasks continue running, box code needs complex reconcilation code on start up. +# To achieve above design, we could use BindsTo=box.service. While this stops all tasks on systemctl stop, it restarts tasks on systemctl restart. +# Another approach was to use --scope but this is incompatible with --wait (and then we have to start polling status to get exit code) -# it seems systemd-run does not return the exit status of the process despite --wait -if ! systemd-run --unit "${service_name}" --nice "${nice}" --uid=${id} --gid=${id} ${options} --setenv HOME=${HOME} --setenv USER=${SUDO_USER} \ - --setenv DEBUG=box:* --setenv BOX_ENV=${BOX_ENV} --setenv NODE_ENV=production "${task_worker}" "${task_id}" "${logfile}"; then +# it seems systemd-run does not return the exit status of the process despite --wait but atleast it waits +if ! systemd-run --unit "${service_name}" --wait --uid=${id} --gid=${id} \ + -p TimeoutStopSec=2s -p MemoryMax=${memory_limit_mb}M -p OOMScoreAdjust=${oom_score_adjust} --nice "${nice}" \ + --setenv HOME=${HOME} --setenv USER=${SUDO_USER} --setenv DEBUG=box:* --setenv BOX_ENV=${BOX_ENV} --setenv NODE_ENV=production \ + "${task_worker}" "${task_id}" "${logfile}"; then echo "Service ${service_name} failed to run" # this only happens if the path to task worker itself is wrong fi diff --git a/src/scripts/update.sh b/src/scripts/update.sh index a81de1690..6843023f2 100755 --- a/src/scripts/update.sh +++ b/src/scripts/update.sh @@ -30,21 +30,16 @@ readonly installer_path="${source_dir}/scripts/installer.sh" log "updating Cloudron with ${source_dir}" -# StandardError will follow StandardOutput in default inherit mode. https://www.freedesktop.org/software/systemd/man/systemd.exec.html systemctl reset-failed "${updater_service}" 2>/dev/null || true -if ! systemd-run --property=OOMScoreAdjust=-1000 --unit "${updater_service}" -p StandardOutput=append:${log_file} ${installer_path}; then - log "Failed to install cloudron" - exit 1 + +# this script is invoked as a task. installer.sh will systemctl stop at some point and that will stop all the tasks +# installer.sh is thus run as a separate unit name so that it doesn't get killed during the stoptask.sh +# StandardError will follow StandardOutput in default inherit mode. https://www.freedesktop.org/software/systemd/man/systemd.exec.html +if ! systemd-run --property=OOMScoreAdjust=-1000 --unit "${updater_service}" --wait -p StandardOutput=append:${log_file} ${installer_path}; then + echo "${updater_service} failed to run" # this only happens if the path to installer is wrong fi -while true; do - if systemctl is-failed "${updater_service}" >/dev/null 2>&1; then - log "${updater_service} failed" - exit 1 - fi - - log "${updater_service} is still active. will check in 5 seconds" - - sleep 5 - # this loop will stop once the update process stopped the box unit and thus terminating this child process -done +# if the install script succeeded, the following code is never run since this script will get killed +exit_code=$(systemctl show "${service_name}" -p ExecMainCode | sed 's/ExecMainCode=//g') +echo "${updater_service} finished with exit code ${exit_code}" +exit "${exit_code}" diff --git a/src/shell.js b/src/shell.js index 81dc0a519..932c2d0eb 100644 --- a/src/shell.js +++ b/src/shell.js @@ -159,12 +159,11 @@ function sudo(tag, args, options, callback) { // sudo does not forward signals when the originator comes from the same process group. recently, there has been a change where it will // forward signals as long as sudo or the command is not the group leader (https://www.sudo.ws/repos/sudo/rev/d1bf60eac57f) // for us, this means that calling kill from this node process doesn't work since it's in the same group (and ubuntu 22 doesn't have the above fix). - // the workaround is to invoke a kill from a different process group and this is done by starting detached + // the workaround is to invoke a kill from a different process group and this is done by starting detached . negative pid mean to group // another idea is: use "ps --pid cp.pid -o pid=" to get the pid of the command and then send it signal directly - // ...despite all this hackery this only appears to work for sudo tail and not for sudo systemd-run . go figure. cp.terminate = function () { cp.terminated = true; // hint for better debug message in 'exit' - child_process.spawn('kill', ['-SIGTERM', cp.pid], { detached: true }, (error) => { if (error) debug(`${tag} could not terminate`, error); }); + child_process.spawn('kill', ['-SIGKILL', -cp.pid], { detached: true }, (error) => { if (error) debug(`${tag} could not terminate`, error); }); }; cp.stdin.end(); diff --git a/src/tasks.js b/src/tasks.js index 5beb3e79a..b6c9182f9 100644 --- a/src/tasks.js +++ b/src/tasks.js @@ -234,9 +234,10 @@ async function stopTask(id) { } async function stopAllTasks() { - debug('stopAllTasks: stopping all tasks'); - + debug('stopAllTasks: ignoring exit status of starttask'); + const cps = Object.values(gTasks); gTasks = {}; // this signals startTask() to not set completion status as "crashed" + cps.forEach(cp => { debug(`stopAllTasks: terminating process group ${cp.pid}`); cp.terminate(); }); // cleanup all the sudos and systemd-run const [error] = await safe(shell.promises.sudo([ STOP_TASK_CMD, 'all' ], { cwd: paths.baseDir() })); if (error) debug(`stopAllTasks: error stopping stasks: ${error.message}`); }