shell: add timeout logic and rework error handling

what's important:

* if task code ran, it exits with 0. this exit code applies regardless of the task's (error, result) outcome
  * when it exited cleanly, we will get the values from the database

* if task timed out, the box code kills it and it has a flag tracking timedOut. we can
  ignore exit code in this case.

* if task code was stopped, box code will send SIGTERM, which the task ideally handles and then exits with 70.

* if task code crashed and it caught the exception, it will return 50

* if task code crashed and node nuked us, it will exit with 1

* if task code was killed with some unhandleable signal, taskworker.sh will return the signal (9=SIGKILL)
This commit is contained in:
Girish Ramakrishnan
2025-07-17 09:53:29 +02:00
parent 5e1c32b606
commit 7047ee9391
7 changed files with 96 additions and 78 deletions

View File

@@ -17,6 +17,7 @@ const apptask = require('./apptask.js'),
reverseProxy = require('./reverseproxy.js'),
safe = require('safetydance'),
tasks = require('./tasks.js'),
timers = require('timers/promises'),
updater = require('./updater.js');
const TASKS = { // indexed by task type
@@ -34,7 +35,7 @@ const TASKS = { // indexed by task type
_identity: async (arg, progressCallback) => { progressCallback({ percent: 20 }); return arg; },
_error: async (arg, progressCallback) => { progressCallback({ percent: 20 }); throw new Error(`Failed for arg: ${arg}`); },
_crash: (arg) => { throw new Error(`Crashing for arg: ${arg}`); }, // the test looks for this debug string in the log file
_sleep: async (arg) => setTimeout(process.exit, arg)
_sleep: async (arg) => await timers.setTimeout(parseInt(arg, 10))
};
if (process.argv.length !== 4) {
@@ -62,6 +63,8 @@ async function setupNetworking() {
}
// this is also used as the 'uncaughtException' handler which can only have synchronous functions
// taskworker.sh forwards the exit code of the actual worker. It's either a raw signal number OR the exit code. So, choose exit codes > 31
// 50 - internal error, 70 - SIGTERM exit
function exitSync(status) {
if (status.error) fs.write(logFd, status.error.stack + '\n', function () {});
fs.fsyncSync(logFd);
@@ -87,7 +90,7 @@ async function main() {
process.on('SIGTERM', () => {
debug('Terminated');
exitSync({ code: 0 });
exitSync({ code: 70 });
});
// ensure we log task crashes with the task logs. neither console.log nor debug are sync for some reason
@@ -115,9 +118,9 @@ async function main() {
debug(`Task took ${(new Date() - startTime)/1000} seconds`);
await safe(tasks.setCompleted(taskId, progress));
exitSync({ error: runError, code: runError ? 50 : 0 });
exitSync({ error: runError, code: 0 }); // code itself ran fine, but resulted in some error. so exit with success
} catch (error) {
exitSync({ error, code: 1 }); // do not call setCompleted() intentionally. the task code must be resilient enough to handle it
exitSync({ error, code: 50 }); // do not call setCompleted() intentionally. the task code must be resilient enough to handle it
}
}