diff --git a/src/cloudron.js b/src/cloudron.js index 0e835a69e..541a9acd4 100644 --- a/src/cloudron.js +++ b/src/cloudron.js @@ -152,16 +152,8 @@ async function runStartupTasks() { // we used to run tasks in parallel but simultaneous nginx reloads was causing issues for (let i = 0; i < tasks.length; i++) { - for (let attempt = 0; attempt < 3; attempt++) { - const [error] = await safe(tasks[i]()); - if (!error) break; // task succeeded - debug(`Startup task at index ${i} failed (attempt ${attempt}): ${error.message}`); - // for some reason, mysql arbitrary restarts making startup tasks fail. this makes the box update stuck - const retry = error.reason === BoxError.DATABASE_ERROR && error.code === 'PROTOCOL_CONNECTION_LOST'; - if (!retry) break; - debug(`Will retry task at index ${i}`); - await delay(3000); - } + const [error] = await safe(tasks[i]()); + if (error) debug(`Startup task at index ${i} failed: ${error.message}`); } } diff --git a/src/platform.js b/src/platform.js index ea45c2594..f9f371740 100644 --- a/src/platform.js +++ b/src/platform.js @@ -10,7 +10,9 @@ exports = module.exports = { const apps = require('./apps.js'), assert = require('assert'), + BoxError = require('./boxerror.js'), debug = require('debug')('box:platform'), + delay = require('delay'), fs = require('fs'), infra = require('./infra_version.js'), locker = require('./locker.js'), @@ -37,9 +39,7 @@ async function start(options) { // short-circuit for the restart case if (_.isEqual(infra, existingInfra)) { debug('platform is uptodate at version %s', infra.version); - - onPlatformReady(false /* !infraChanged */); - + await onPlatformReady(false /* !infraChanged */); return; } @@ -48,15 +48,27 @@ async function start(options) { const error = locker.lock(locker.OP_PLATFORM_START); if (error) throw error; - if (existingInfra.version !== infra.version) await removeAllContainers(); - if (existingInfra.version === 'none') await volumes.mountAll(); // when restoring, mount all volumes - await 
markApps(existingInfra, options); // mark app state before we start addons. this gives the db import logic a chance to mark an app as errored - await services.startServices(existingInfra); - await fs.promises.writeFile(paths.INFRA_VERSION_FILE, JSON.stringify(infra, null, 4)); + for (let attempt = 0; attempt < 5; attempt++) { + try { + if (existingInfra.version !== infra.version) await removeAllContainers(); + if (existingInfra.version === 'none') await volumes.mountAll(); // when restoring, mount all volumes + await markApps(existingInfra, options); // mark app state before we start addons. this gives the db import logic a chance to mark an app as errored + await services.startServices(existingInfra); + await fs.promises.writeFile(paths.INFRA_VERSION_FILE, JSON.stringify(infra, null, 4)); + break; + } catch (error) { + // for some reason, mysql arbitrary restarts making startup tasks fail. this makes the box update stuck + // LOST is when existing connection breaks. REFUSED is when new connection cannot connect at all + const retry = error.reason === BoxError.DATABASE_ERROR && (error.code === 'PROTOCOL_CONNECTION_LOST' || error.code === 'ECONNREFUSED'); + debug(`Failed to start services. retry=${retry} (attempt ${attempt}): ${error.message}`); + if (!retry || attempt === 4) throw error; // refuse to start: a non-retryable error or exhausted retries must not fall through to onPlatformReady + await delay(10000); + } + } locker.unlock(locker.OP_PLATFORM_START); - onPlatformReady(true /* infraChanged */); // background + await onPlatformReady(true /* infraChanged */); } async function stopAllTasks() { @@ -67,7 +79,7 @@ async function onPlatformReady(infraChanged) { debug(`onPlatformReady: platform is ready. infra changed: ${infraChanged}`); exports._isReady = true; - if (infraChanged) await pruneInfraImages(); + if (infraChanged) await safe(pruneInfraImages(), { debug }); // ignore error await apps.schedulePendingTasks(); }