shell: add timeout logic and rework error handling

what's important:

* if task code ran, it exits with 0. this code is regardless of (error, result)
  * when it exited cleanly, we will get the values from the database

* if task timed out, the box code kills it and it has a flag tracking timedOut. we can
  ignore exit code in this case.

* if task code was stopped, box code will send SIGTERM which ideally it will handle and end with 70.

* if task code crashed and it caught the exception, it will return 50

* if task code crashed and node nuked us, it will exit with 1

* if task code was killed with some unhandleabe signal, taskworker.sh will return the signal (9=SIGKILL)
This commit is contained in:
Girish Ramakrishnan
2025-07-17 09:53:29 +02:00
parent 5e1c32b606
commit 7047ee9391
7 changed files with 96 additions and 78 deletions

View File

@@ -5,6 +5,7 @@ const assert = require('assert'),
child_process = require('child_process'),
debug = require('debug')('box:shell'),
path = require('path'),
safe = require('safetydance'),
_ = require('./underscore.js');
exports = module.exports = shell;
@@ -48,10 +49,10 @@ function spawn(tag, file, args, options) {
const abortSignal = options.abortSignal || null; // note: we use our own handler and not the child_process one
return new Promise((resolve, reject) => {
const spawnOptions = _.omit(options, [ 'maxLines', 'logger', 'abortSignal', 'onMessage', 'input', 'encoding' ]);
const spawnOptions = _.omit(options, [ 'maxLines', 'logger', 'abortSignal', 'onMessage', 'input', 'encoding', 'timeout', 'onTimeout' ]);
const cp = child_process.spawn(file, args, spawnOptions);
const stdoutBuffers = [], stderrBuffers = [];
let stdoutLineCount = 0, stderrLineCount = 0;
let stdoutLineCount = 0, stderrLineCount = 0, killTimerId = null, timedOut = false, terminated = false;
cp.stdout.on('data', (data) => {
if (logger) return logger(data);
@@ -69,7 +70,11 @@ function spawn(tag, file, args, options) {
cp.on('close', function (code, signal) { // always called. after 'exit' or 'error'
const stdoutBuffer = Buffer.concat(stdoutBuffers);
const stdout = options.encoding ? stdoutBuffer.toString(options.encoding) : stdoutBuffer;
if (code === 0) return resolve(stdout);
if (killTimerId) clearTimeout(killTimerId);
// if terminated or timedout, the code is ignored
if (!terminated && !timedOut && code === 0) return resolve(stdout);
const stderrBuffer = Buffer.concat(stderrBuffers);
const stderr = options.encoding ? stderrBuffer.toString(options.encoding) : stderrBuffer;
@@ -81,6 +86,8 @@ function spawn(tag, file, args, options) {
e.stderrLineCount = stderrLineCount;
e.code = code;
e.signal = signal;
e.timedOut = timedOut;
e.terminated = terminated;
debug(`${tag}: ${file} ${args.join(' ').replace(/\n/g, '\\n')} errored`, e);
@@ -91,16 +98,31 @@ function spawn(tag, file, args, options) {
debug(`${tag}: ${file} ${args.join(' ').replace(/\n/g, '\\n')} errored`, error);
});
abortSignal?.addEventListener('abort', () => {
debug(`${tag}: aborting ${cp.pid}`);
cp.terminate = function () {
terminated = true;
// many approaches to kill sudo launched process failed. we now have a sudo wrapper to kill the full tree
child_process.execFile('/usr/bin/sudo', [ KILL_CHILD_CMD, cp.pid, process.pid ], { encoding: 'utf8' }, (error, stdout, stderr) => {
if (error) debug(`${tag}: failed to kill children`, stdout, stderr);
else debug(`${tag}: aborted ${cp.pid}`, stdout, stderr);
else debug(`${tag}: terminated ${cp.pid}`, stdout, stderr);
});
};
abortSignal?.addEventListener('abort', () => {
debug(`${tag}: aborting ${cp.pid}`);
cp.terminate();
}, { once: true });
if (options.onMessage) cp.on('message', options.onMessage); // ipc mode messages
if (options.timeout) {
killTimerId = setTimeout(async () => {
debug(`${tag}: timedout`);
timedOut = true;
if (typeof options.onTimeout !== 'function') return cp.terminate();
await safe(options.onTimeout(), { debug });
}, options.timeout);
}
// https://github.com/nodejs/node/issues/25231
if ('input' in options) { // when empty, just closes
cp.stdin.write(options.input);