src/locks.js

'use strict';

// Public API of the locks module, followed by the lock type names that canAcquire() understands.
// Names ending in _PREFIX are matched with startsWith(); presumably callers append an identifier
// (e.g. an app or backup id) to form the actual lock name - verify against callers.
exports = module.exports = {
    setTaskId,

    acquire,
    wait,

    release,
    releaseAll,
    releaseByTaskId,

    TYPE_APP_TASK_PREFIX: 'app_task_', // cannot be acquired while TYPE_BOX_UPDATE is held
    TYPE_APP_BACKUP_PREFIX: 'app_backup_', // no extra acquisition rule of its own, but a held one blocks TYPE_BOX_UPDATE
    TYPE_BOX_UPDATE: 'box_update', // for the actual update, i.e. after the backup. this way the backup before the update does not block
    TYPE_BOX_UPDATE_TASK: 'box_update_task', // for scheduling the update task
    TYPE_FULL_BACKUP_TASK_PREFIX: 'full_backup_task_', // for scheduling the backup task

    TYPE_MAIL_SERVER_RESTART: 'mail_restart',
};

const assert = require('node:assert'),
    BoxError = require('./boxerror.js'),
    database = require('./database.js'),
    debug = require('debug')('box:locks'),
    promiseRetry = require('./promise-retry.js');

// owner id that acquire() stores next to each lock and that release() compares against.
// stays null until setTaskId() is called
let gTaskId = null;

// remembers the id under which this process takes and releases locks. taskId must be a string
function setTaskId(taskId) {
    const kind = typeof taskId;
    assert.strictEqual(kind, 'string');

    gTaskId = taskId;
}

// Loads the current lock state from the 'platform' row of the locks table.
// Returns { version, data } where version is the value write() later compares against and
// data is the parsed dataJson column (lock name -> owner).
// Throws BoxError.BAD_STATE when the row does not exist instead of failing with a TypeError.
async function read() {
    // address the same row that write() updates and releaseAll() re-creates
    const result = await database.query('SELECT version, dataJson FROM locks WHERE id=?', [ 'platform' ]);
    if (result.length === 0) throw new BoxError(BoxError.BAD_STATE, 'Locks are not initialized');

    const { version, dataJson } = result[0];
    return { version, data: JSON.parse(dataJson) };
}

// Stores value.data as the new lock map, but only if the row still carries value.version.
// The version is incremented by the same UPDATE statement. When the statement does not
// touch exactly one row, a BoxError.CONFLICT is thrown.
async function write(value) {
    const { version, data } = value;
    assert.strictEqual(typeof version, 'number');
    assert.strictEqual(typeof data, 'object');

    const dataJson = JSON.stringify(data);
    const sql = 'UPDATE locks SET dataJson=?, version=version+1 WHERE id=? AND version=?';

    const { affectedRows } = await database.query(sql, [ dataJson, 'platform', version ]);
    if (affectedRows !== 1) throw new BoxError(BoxError.CONFLICT, 'Someone updated before we did');

    debug(`write: current locks: ${dataJson}`);
}

// Checks whether the lock named type may be taken while the locks in data (lock name -> owner) are held.
// Returns null when it may, otherwise a BoxError.BAD_STATE naming the blocker. The error is returned, not thrown.
function canAcquire(data, type) {
    assert.strictEqual(typeof data, 'object');
    assert.strictEqual(typeof type, 'string');

    // no lock can be held twice
    if (type in data) return new BoxError(BoxError.BAD_STATE, `Locked by ${data[type]}`);

    const blocked = (message) => new BoxError(BoxError.BAD_STATE, message);
    const anyHeld = (prefix) => Object.keys(data).some((name) => name.startsWith(prefix));

    // the update itself has to wait for app tasks and app backups
    if (type === exports.TYPE_BOX_UPDATE) {
        if (anyHeld(exports.TYPE_APP_TASK_PREFIX)) return blocked('One or more app tasks are active');
        if (anyHeld(exports.TYPE_APP_BACKUP_PREFIX)) return blocked('One or more app backups are active');
        return null;
    }

    // app tasks have to wait for the update itself
    if (type.startsWith(exports.TYPE_APP_TASK_PREFIX)) {
        return (exports.TYPE_BOX_UPDATE in data) ? blocked('Update is active') : null;
    }

    // full backup task and update task exclude each other
    if (type.startsWith(exports.TYPE_FULL_BACKUP_TASK_PREFIX)) {
        return (exports.TYPE_BOX_UPDATE_TASK in data) ? blocked('Update task is active') : null;
    }

    if (type === exports.TYPE_BOX_UPDATE_TASK) {
        return anyHeld(exports.TYPE_FULL_BACKUP_TASK_PREFIX) ? blocked('One or more backup tasks is active') : null;
    }

    // everything else (TYPE_APP_BACKUP_PREFIX, TYPE_MAIL_SERVER_RESTART, ...) is only refused when that same lock is already held
    return null;
}

// Takes the lock named type on behalf of gTaskId. Each attempt reads the lock state, asks
// canAcquire() and writes the state back; only a BoxError.CONFLICT from the write leads to
// another attempt (100ms interval). The error returned by canAcquire() is thrown as is.
async function acquire(type) {
    assert.strictEqual(typeof type, 'string');

    const retryOnConflict = (error) => error.reason === BoxError.CONFLICT;
    const options = { times: Number.MAX_SAFE_INTEGER, interval: 100, debug, retry: retryOnConflict };

    await promiseRetry(options, async function attemptAcquire() {
        const state = await read();
        const locks = state.data;

        const denied = canAcquire(locks, type);
        if (denied) throw denied;

        locks[type] = gTaskId; // record ourselves as the owner
        await write({ version: state.version, data: locks });
        debug(`acquire: ${type}`);
    });
}

// Keeps calling acquire(type) through promiseRetry with a 10 second interval. No retry filter
// is passed here, so which errors lead to another attempt is up to promiseRetry.
async function wait(type) {
    assert.strictEqual(typeof type, 'string');

    const options = { times: Number.MAX_SAFE_INTEGER, interval: 10000, debug };
    const attempt = () => acquire(type);

    await promiseRetry(options, attempt);
}

// Gives up the lock named type. Throws BoxError.BAD_STATE when the lock is not held at all or
// is held by an owner other than gTaskId. Only a BoxError.CONFLICT leads to another attempt (100ms interval).
async function release(type) {
    assert.strictEqual(typeof type, 'string');

    const retryOnConflict = (error) => error.reason === BoxError.CONFLICT;
    const options = { times: Number.MAX_SAFE_INTEGER, interval: 100, debug, retry: retryOnConflict };

    await promiseRetry(options, async function attemptRelease() {
        const state = await read();
        const locks = state.data;

        if (!(type in locks)) throw new BoxError(BoxError.BAD_STATE, `Lock ${type} was never acquired`);

        const owner = locks[type];
        if (owner !== gTaskId) throw new BoxError(BoxError.BAD_STATE, `Task ${gTaskId} attempted to release lock ${type} acquired by ${owner}`);

        delete locks[type];
        await write({ version: state.version, data: locks });
        debug(`release: ${type}`);
    });
}

// Drops every lock regardless of owner: all rows of the locks table are deleted and the
// 'platform' row is created again with an empty lock map. These are two separate queries.
async function releaseAll() {
    const noLocks = {};

    await database.query('DELETE FROM locks');
    await database.query('INSERT INTO locks (id, dataJson) VALUES (?, ?)', [ 'platform', JSON.stringify(noLocks) ]);

    debug('releaseAll: all locks released');
}

// Releases every lock that is still owned by taskId. Each lock found this way is logged, since
// it identifies a programming error in a task that forgot to clean up its locks.
// Only a BoxError.CONFLICT from the write leads to another attempt (100ms interval).
async function releaseByTaskId(taskId) {
    assert.strictEqual(typeof taskId, 'string');

    await promiseRetry({ times: Number.MAX_SAFE_INTEGER, interval: 100, debug, retry: (error) => error.reason === BoxError.CONFLICT }, async () => {
        const { version, data } = await read();

        const leaked = Object.keys(data).filter((type) => data[type] === taskId);
        for (const type of leaked) {
            debug(`releaseByTaskId: task ${taskId} forgot to unlock ${type}`);
            delete data[type];
        }

        // when the task holds nothing, the state is unchanged. skip the write so that the version is
        // not bumped, which would push concurrent acquire/release calls into a pointless CONFLICT retry
        if (leaked.length !== 0) await write({ version, data });

        debug(`releaseByTaskId: ${taskId}`);
    });
}
remove global lock Currently, the update/apptask/fullbackup/platformstart take a global lock and cannot run in parallel. This causes situations where when a user tries to trigger an apptask, it says "waiting for backup to finish..." etc. The solution is to let them run in parallel. We need a lock at the app level as app operations running in parallel would be bad (tm). In addition, the update task needs a lock just for the update part. We also need multi-process locks. Running tasks as processes is core to our "kill" strategy. Various inter process locks were explored: * node's IPC mechanism with process.send(). But this only works for direct node.js children. taskworker is run via sudo and the IPC does not work. * File lock using O_EXCL. The basic idea is to create lock files. While file creation can be done atomically, it becomes complicated to clean up lock files when the tasks crash. We need a way to know what locks were held by the crashing task. flock and friends are not built into node.js * sqlite/redis were options but introduce additional deps * Settled on MySQL based locking. Initial plan was to have row locks or table locks. Each row is a kind of lock. While implementing, it was found that we need many types of locks (and not just update lock and app locks). For example, we need locks for each task type, so that only one task type is active at a time. * Instead of rows, we can just lock table and have a json blob in it. This hit a roadblock that LOCK TABLE is per session and our db layer cannot handle this easily! i.e. when issuing two db.query() it might use two different connections from the pool. We have to expose the connection, release connection etc. * Next idea was atomic blob update of the blob checking if old blob was same. This approach was finally refined into a version field. Phew! 2024-12-07 14:35:45 +01:00			`'use strict';`

			`exports = module.exports = {`
			`setTaskId,`

			`acquire,`
			`wait,`

			`release,`
			`releaseAll,`
			`releaseByTaskId,`

locks: rename lock types to make it clearer 2025-07-18 13:22:33 +02:00			`TYPE_APP_TASK_PREFIX: 'app_task_',`
apps: backup is not a state anymore this is launched as a separate task 2025-07-18 10:56:52 +02:00			`TYPE_APP_BACKUP_PREFIX: 'app_backup_',`
locks: rename lock types to make it clearer 2025-07-18 13:22:33 +02:00			`TYPE_BOX_UPDATE: 'box_update', // for the actual update and after the backup. this allows the backup before update do not block`
			`TYPE_BOX_UPDATE_TASK: 'box_update_task', // for scheduling the update task`
locks: make full backup use a prefix 2025-07-25 14:46:55 +02:00			`TYPE_FULL_BACKUP_TASK_PREFIX: 'full_backup_task_', // for scheduling the backup task`
mail: use a lock to protect container recreation needs a lock because the cert code also restart mail server from tasks 2024-12-16 22:34:50 +01:00
			`TYPE_MAIL_SERVER_RESTART: 'mail_restart',`
remove global lock Currently, the update/apptask/fullbackup/platformstart take a global lock and cannot run in parallel. This causes situations where when a user tries to trigger an apptask, it says "waiting for backup to finish..." etc The solution is to let them run in parallel. We need a lock at the app level as app operations running in parallel would be bad (tm). In addition, the update task needs a lock just for the update part. We also need multi-process locks. Running tasks as processes is core to our "kill" strategy. Various inter process locks were explored: * node's IPC mechanism with process.send(). But this only works for direct node.js children. taskworker is run via sudo and the IPC does not work. * File lock using O_EXCL. Basic ideas to create lock files. While file creation can be done atomically, it becomes complicated to clean up lock files when the tasks crash. We need a way to know what locks were held by the crashing task. flock and friends are not built-into node.js * sqlite/redis were options but introduce additional deps * Settled on MySQL based locking. Initial plan was to have row locks or table locks. Each row is a kind of lock. While implementing, it was found that we need many types of locks (and not just update lock and app locks). For example, we need locks for each task type, so that only one task type is active at a time. * Instead of rows, we can just lock table and have a json blob in it. This hit a road block that LOCK TABLE is per session and our db layer cannot handle this easily! i.e when issing two db.query() it might use two different connections from the pool. We have to expose the connection, release connection etc. * Next idea was atomic blob update of the blob checking if old blob was same. This approach, was finally refined into a version field. Phew! 2024-12-07 14:35:45 +01:00			`};`

use node: prefix for requires mostly because code is being autogenerated by all the AI stuff using this prefix. it's also used in the stack trace. 2025-08-14 11:17:38 +05:30			`const assert = require('node:assert'),`
remove global lock Currently, the update/apptask/fullbackup/platformstart take a global lock and cannot run in parallel. This causes situations where when a user tries to trigger an apptask, it says "waiting for backup to finish..." etc The solution is to let them run in parallel. We need a lock at the app level as app operations running in parallel would be bad (tm). In addition, the update task needs a lock just for the update part. We also need multi-process locks. Running tasks as processes is core to our "kill" strategy. Various inter process locks were explored: * node's IPC mechanism with process.send(). But this only works for direct node.js children. taskworker is run via sudo and the IPC does not work. * File lock using O_EXCL. Basic ideas to create lock files. While file creation can be done atomically, it becomes complicated to clean up lock files when the tasks crash. We need a way to know what locks were held by the crashing task. flock and friends are not built-into node.js * sqlite/redis were options but introduce additional deps * Settled on MySQL based locking. Initial plan was to have row locks or table locks. Each row is a kind of lock. While implementing, it was found that we need many types of locks (and not just update lock and app locks). For example, we need locks for each task type, so that only one task type is active at a time. * Instead of rows, we can just lock table and have a json blob in it. This hit a road block that LOCK TABLE is per session and our db layer cannot handle this easily! i.e when issing two db.query() it might use two different connections from the pool. We have to expose the connection, release connection etc. * Next idea was atomic blob update of the blob checking if old blob was same. This approach, was finally refined into a version field. Phew! 2024-12-07 14:35:45 +01:00			`BoxError = require('./boxerror.js'),`
			`database = require('./database.js'),`
			`debug = require('debug')('box:locks'),`
			`promiseRetry = require('./promise-retry.js');`

			`let gTaskId = null;`

			`function setTaskId(taskId) {`
			`assert.strictEqual(typeof taskId, 'string');`
			`gTaskId = taskId;`
			`}`

			`async function read() {`
			`const result = await database.query('SELECT version, dataJson FROM locks');`
			`return { version: result[0].version, data: JSON.parse(result[0].dataJson) };`
			`}`

			`async function write(value) {`
			`assert.strictEqual(typeof value.version, 'number');`
			`assert.strictEqual(typeof value.data, 'object');`

			`const result = await database.query('UPDATE locks SET dataJson=?, version=version+1 WHERE id=? AND version=?', [ JSON.stringify(value.data), 'platform', value.version ]);`
			`if (result.affectedRows !== 1) throw new BoxError(BoxError.CONFLICT, 'Someone updated before we did');`
			debug(`write: current locks: ${JSON.stringify(value.data)}`);
			`}`

			`function canAcquire(data, type) {`
			`assert.strictEqual(typeof data, 'object');`
			`assert.strictEqual(typeof type, 'string');`

locks: make full backup use a prefix 2025-07-25 14:46:55 +02:00			if (type in data) return new BoxError(BoxError.BAD_STATE, `Locked by ${data[type]}`);

locks: rename lock types to make it clearer 2025-07-18 13:22:33 +02:00			`if (type === exports.TYPE_BOX_UPDATE) {`
apps: backup is not a state anymore this is launched as a separate task 2025-07-18 10:56:52 +02:00			`if (Object.keys(data).some(k => k.startsWith(exports.TYPE_APP_TASK_PREFIX))) return new BoxError(BoxError.BAD_STATE, 'One or more app tasks are active');`
			`if (Object.keys(data).some(k => k.startsWith(exports.TYPE_APP_BACKUP_PREFIX))) return new BoxError(BoxError.BAD_STATE, 'One or more app backups are active');`
locks: rename lock types to make it clearer 2025-07-18 13:22:33 +02:00			`} else if (type.startsWith(exports.TYPE_APP_TASK_PREFIX)) {`
			`if (exports.TYPE_BOX_UPDATE in data) return new BoxError(BoxError.BAD_STATE, 'Update is active');`
locks: make full backup use a prefix 2025-07-25 14:46:55 +02:00			`} else if (type.startsWith(exports.TYPE_FULL_BACKUP_TASK_PREFIX)) {`
locks: rename lock types to make it clearer 2025-07-18 13:22:33 +02:00			`if (exports.TYPE_BOX_UPDATE_TASK in data) return new BoxError(BoxError.BAD_STATE, 'Update task is active');`
			`} else if (type === exports.TYPE_BOX_UPDATE_TASK) {`
locks: make full backup use a prefix 2025-07-25 14:46:55 +02:00			`if (Object.keys(data).some(k => k.startsWith(exports.TYPE_FULL_BACKUP_TASK_PREFIX))) return new BoxError(BoxError.BAD_STATE, 'One or more backup tasks is active');`
remove global lock Currently, the update/apptask/fullbackup/platformstart take a global lock and cannot run in parallel. This causes situations where when a user tries to trigger an apptask, it says "waiting for backup to finish..." etc The solution is to let them run in parallel. We need a lock at the app level as app operations running in parallel would be bad (tm). In addition, the update task needs a lock just for the update part. We also need multi-process locks. Running tasks as processes is core to our "kill" strategy. Various inter process locks were explored: * node's IPC mechanism with process.send(). But this only works for direct node.js children. taskworker is run via sudo and the IPC does not work. * File lock using O_EXCL. Basic ideas to create lock files. While file creation can be done atomically, it becomes complicated to clean up lock files when the tasks crash. We need a way to know what locks were held by the crashing task. flock and friends are not built-into node.js * sqlite/redis were options but introduce additional deps * Settled on MySQL based locking. Initial plan was to have row locks or table locks. Each row is a kind of lock. While implementing, it was found that we need many types of locks (and not just update lock and app locks). For example, we need locks for each task type, so that only one task type is active at a time. * Instead of rows, we can just lock table and have a json blob in it. This hit a road block that LOCK TABLE is per session and our db layer cannot handle this easily! i.e when issing two db.query() it might use two different connections from the pool. We have to expose the connection, release connection etc. * Next idea was atomic blob update of the blob checking if old blob was same. This approach, was finally refined into a version field. Phew! 2024-12-07 14:35:45 +01:00			`}`

locks: make full backup use a prefix 2025-07-25 14:46:55 +02:00			`// TYPE_APP_BACKUP_PREFIX , TYPE_MAIL_SERVER_RESTART can co-run with everything except themselves`
apps: backup is not a state anymore this is launched as a separate task 2025-07-18 10:56:52 +02:00
remove global lock Currently, the update/apptask/fullbackup/platformstart take a global lock and cannot run in parallel. This causes situations where when a user tries to trigger an apptask, it says "waiting for backup to finish..." etc The solution is to let them run in parallel. We need a lock at the app level as app operations running in parallel would be bad (tm). In addition, the update task needs a lock just for the update part. We also need multi-process locks. Running tasks as processes is core to our "kill" strategy. Various inter process locks were explored: * node's IPC mechanism with process.send(). But this only works for direct node.js children. taskworker is run via sudo and the IPC does not work. * File lock using O_EXCL. Basic ideas to create lock files. While file creation can be done atomically, it becomes complicated to clean up lock files when the tasks crash. We need a way to know what locks were held by the crashing task. flock and friends are not built-into node.js * sqlite/redis were options but introduce additional deps * Settled on MySQL based locking. Initial plan was to have row locks or table locks. Each row is a kind of lock. While implementing, it was found that we need many types of locks (and not just update lock and app locks). For example, we need locks for each task type, so that only one task type is active at a time. * Instead of rows, we can just lock table and have a json blob in it. This hit a road block that LOCK TABLE is per session and our db layer cannot handle this easily! i.e when issing two db.query() it might use two different connections from the pool. We have to expose the connection, release connection etc. * Next idea was atomic blob update of the blob checking if old blob was same. This approach, was finally refined into a version field. Phew! 2024-12-07 14:35:45 +01:00			`return null;`
			`}`

			`async function acquire(type) {`
			`assert.strictEqual(typeof type, 'string');`

			`await promiseRetry({ times: Number.MAX_SAFE_INTEGER, interval: 100, debug, retry: (error) => error.reason === BoxError.CONFLICT }, async () => {`
			`const { version, data } = await read();`
			`const error = canAcquire(data, type);`
			`if (error) throw error;`
			`data[type] = gTaskId;`
			`await write({ version, data });`
			debug(`acquire: ${type}`);
			`});`
			`}`

			`async function wait(type) {`
			`assert.strictEqual(typeof type, 'string');`

			`await promiseRetry({ times: Number.MAX_SAFE_INTEGER, interval: 10000, debug }, async () => await acquire(type));`
			`}`

			`async function release(type) {`
			`assert.strictEqual(typeof type, 'string');`

			`await promiseRetry({ times: Number.MAX_SAFE_INTEGER, interval: 100, debug, retry: (error) => error.reason === BoxError.CONFLICT }, async () => {`
			`const { version, data } = await read();`
			if (!(type in data)) throw new BoxError(BoxError.BAD_STATE, `Lock ${type} was never acquired`);
			if (data[type] !== gTaskId) throw new BoxError(BoxError.BAD_STATE, `Task ${gTaskId} attempted to release lock ${type} acquired by ${data[type]}`);
			`delete data[type];`
			`await write({ version, data });`
			debug(`release: ${type}`);
			`});`
			`}`

			`async function releaseAll() {`
			`await database.query('DELETE FROM locks');`
			`await database.query('INSERT INTO locks (id, dataJson) VALUES (?, ?)', [ 'platform', JSON.stringify({}) ]);`
			`debug('releaseAll: all locks released');`
			`}`

clean up task locks 2025-07-18 18:11:56 +02:00			`// identify programming errors in tasks that forgot to clean up locks`
remove global lock Currently, the update/apptask/fullbackup/platformstart take a global lock and cannot run in parallel. This causes situations where when a user tries to trigger an apptask, it says "waiting for backup to finish..." etc The solution is to let them run in parallel. We need a lock at the app level as app operations running in parallel would be bad (tm). In addition, the update task needs a lock just for the update part. We also need multi-process locks. Running tasks as processes is core to our "kill" strategy. Various inter process locks were explored: * node's IPC mechanism with process.send(). But this only works for direct node.js children. taskworker is run via sudo and the IPC does not work. * File lock using O_EXCL. Basic ideas to create lock files. While file creation can be done atomically, it becomes complicated to clean up lock files when the tasks crash. We need a way to know what locks were held by the crashing task. flock and friends are not built-into node.js * sqlite/redis were options but introduce additional deps * Settled on MySQL based locking. Initial plan was to have row locks or table locks. Each row is a kind of lock. While implementing, it was found that we need many types of locks (and not just update lock and app locks). For example, we need locks for each task type, so that only one task type is active at a time. * Instead of rows, we can just lock table and have a json blob in it. This hit a road block that LOCK TABLE is per session and our db layer cannot handle this easily! i.e when issing two db.query() it might use two different connections from the pool. We have to expose the connection, release connection etc. * Next idea was atomic blob update of the blob checking if old blob was same. This approach, was finally refined into a version field. Phew! 2024-12-07 14:35:45 +01:00			`async function releaseByTaskId(taskId) {`
			`assert.strictEqual(typeof taskId, 'string');`

			`await promiseRetry({ times: Number.MAX_SAFE_INTEGER, interval: 100, debug, retry: (error) => error.reason === BoxError.CONFLICT }, async () => {`
			`const { version, data } = await read();`

			`for (const type of Object.keys(data)) {`
			`if (data[type] === taskId) {`
			debug(`releaseByTaskId: task ${taskId} forgot to unlock ${type}`);
			`delete data[type];`
			`}`
			`}`

			`await write({ version, data });`

			debug(`releaseByTaskId: ${taskId}`);
			`});`
			`}`