remove global lock

Currently, the update/apptask/fullbackup/platformstart take a
global lock and cannot run in parallel. This causes situations
where when a user tries to trigger an apptask, it says "waiting for
backup to finish..." etc

The solution is to let them run in parallel. We need a lock at the
app level as app operations running in parallel would be bad (tm).
In addition, the update task needs a lock just for the update part.
We also need multi-process locks. Running tasks as processes is core
to our "kill" strategy.

Various inter process locks were explored:

* node's IPC mechanism with process.send(). But this only works for direct node.js
children. taskworker is run via sudo and the IPC does not work.

* File lock using O_EXCL. Basic ideas to create lock files. While file creation
can be done atomically, it becomes complicated to clean up lock files when
the tasks crash. We need a way to know what locks were held by the crashing task.
flock and friends are not built-into node.js

* sqlite/redis were options but introduce additional deps

* Settled on MySQL based locking. Initial plan was to have row locks
or table locks. Each row is a kind of lock. While implementing, it was found that
we need many types of locks (and not just update lock and app locks). For example,
we need locks for each task type, so that only one task type is active at a time.

* Instead of rows, we can just lock table and have a json blob in it. This hit a road
block that LOCK TABLE is per session and our db layer cannot handle this easily! i.e
when issing two db.query() it might use two different connections from the pool. We have to
expose the connection, release connection etc.

* Next idea was atomic blob update of the blob checking if old blob was same. This approach,
was finally refined into a version field.

Phew!
This commit is contained in:
Girish Ramakrishnan
2024-12-07 14:35:45 +01:00
parent a5b9ff0c3a
commit bb392207ea
13 changed files with 263 additions and 149 deletions
+42 -56
View File
@@ -8,84 +8,70 @@ const assert = require('assert'),
BoxError = require('./boxerror.js'),
debug = require('debug')('box:apptaskmanager'),
fs = require('fs'),
locker = require('./locker.js'),
safe = require('safetydance'),
locks = require('./locks.js'),
path = require('path'),
paths = require('./paths.js'),
safe = require('safetydance'),
scheduler = require('./scheduler.js'),
tasks = require('./tasks.js');
const gActiveTasks = {}; // indexed by app id
const gPendingTasks = [];
let gInitialized = false;
const TASK_CONCURRENCY = 3;
function waitText(lockOperation) {
if (lockOperation === locker.OP_BOX_UPDATE) return 'Waiting for Cloudron to finish updating. See the Settings view';
if (lockOperation === locker.OP_INFRA_START) return 'Waiting for Platform Services to start. See the Services view';
if (lockOperation === locker.OP_FULL_BACKUP) return 'Waiting for Cloudron to finish backup. See the Backups view';
let gTimerId = null;
return ''; // cannot happen
}
function initializeSync() {
gInitialized = true;
locker.on('unlocked', startNextTask);
}
// callback is called when task is finished
function scheduleTask(appId, taskId, options, onFinished) {
assert.strictEqual(typeof appId, 'string');
assert.strictEqual(typeof taskId, 'string');
assert.strictEqual(typeof options, 'object');
assert.strictEqual(typeof onFinished, 'function');
if (!gInitialized) initializeSync();
if (appId in gActiveTasks) {
return onFinished(new BoxError(BoxError.CONFLICT, `Task for ${appId} is already active`));
}
if (Object.keys(gActiveTasks).length >= TASK_CONCURRENCY) {
debug(`Reached concurrency limit, queueing task id ${taskId}`);
tasks.update(taskId, { percent: 1, message: 'Waiting for other app tasks to complete' });
gPendingTasks.push({ appId, taskId, options, onFinished });
onFinished(new BoxError(BoxError.CONFLICT, `Task for ${appId} is already active`));
return;
}
const lockError = locker.recursiveLock(locker.OP_APPTASK);
tasks.update(taskId, { percent: 1, message: 'Queued' });
gPendingTasks.push({ appId, taskId, options, onFinished });
if (lockError) {
debug(`Could not get lock. ${lockError.message}, queueing task id ${taskId}`);
tasks.update(taskId, { percent: 1, message: waitText(lockError.operation) });
gPendingTasks.push({ appId, taskId, options, onFinished });
return;
if (!gTimerId) gTimerId = setTimeout(drain, 1000);
}
async function drain() {
debug(`drain: ${gPendingTasks.length} apptasks pending`);
for (let i = 0; i < gPendingTasks.length; i++) {
const space = Object.keys(gActiveTasks).length - TASK_CONCURRENCY;
if (space == 0) {
debug('At concurrency limit, cannot drain anymore');
break;
}
const { appId, taskId, options, onFinished } = gPendingTasks[i];
const [lockError] = await safe(locks.acquire(`${locks.TYPE_APP_PREFIX}${appId}`));
if (lockError) continue;
gPendingTasks.splice(i, 1);
gActiveTasks[appId] = {};
const logFile = path.join(paths.LOG_DIR, appId, 'apptask.log');
if (!fs.existsSync(path.dirname(logFile))) safe.fs.mkdirSync(path.dirname(logFile)); // ensure directory
scheduler.suspendJobs(appId);
tasks.startTask(taskId, Object.assign(options, { logFile }), async function (error, result) {
onFinished(error, result);
delete gActiveTasks[appId];
await locks.release(`${locks.TYPE_APP_PREFIX}${appId}`);
scheduler.resumeJobs(appId);
});
}
gActiveTasks[appId] = {};
const logFile = path.join(paths.LOG_DIR, appId, 'apptask.log');
if (!fs.existsSync(path.dirname(logFile))) safe.fs.mkdirSync(path.dirname(logFile)); // ensure directory
scheduler.suspendJobs(appId);
tasks.startTask(taskId, Object.assign(options, { logFile }), function (error, result) {
onFinished(error, result);
delete gActiveTasks[appId];
locker.unlock(locker.OP_APPTASK); // unlock event will trigger next task
scheduler.resumeJobs(appId);
});
}
function startNextTask() {
if (gPendingTasks.length === 0) return;
assert(Object.keys(gActiveTasks).length < TASK_CONCURRENCY);
const t = gPendingTasks.shift();
scheduleTask(t.appId, t.taskId, t.options, t.onFinished);
gTimerId = null;
if (gPendingTasks.length) gTimerId = setTimeout(drain, 1000); // check for released locks
}