diff --git a/box.js b/box.js index 71ba1841b..d34328e7b 100755 --- a/box.js +++ b/box.js @@ -9,8 +9,7 @@ require('debug').formatArgs = function formatArgs(args) { args[0] = this.namespace + ' ' + args[0]; }; -var appHealthMonitor = require('./src/apphealthmonitor.js'), - async = require('async'), +let async = require('async'), config = require('./src/config.js'), ldap = require('./src/ldap.js'), dockerProxy = require('./src/dockerproxy.js'), @@ -36,8 +35,7 @@ console.log(); async.series([ server.start, ldap.start, - dockerProxy.start, - appHealthMonitor.start, + dockerProxy.start ], function (error) { if (error) { console.error('Error starting server', error); diff --git a/src/apphealthmonitor.js b/src/apphealthmonitor.js index fb7c0987d..a5877f2ac 100644 --- a/src/apphealthmonitor.js +++ b/src/apphealthmonitor.js @@ -12,15 +12,14 @@ var appdb = require('./appdb.js'), util = require('util'); exports = module.exports = { - start: start, - stop: stop + run: run }; var HEALTHCHECK_INTERVAL = 10 * 1000; // every 10 seconds. this needs to be small since the UI makes only healthy apps clickable var UNHEALTHY_THRESHOLD = 10 * 60 * 1000; // 10 minutes var gHealthInfo = { }; // { time, emailSent } -var gRunTimeout = null; -var gDockerEventStream = null; + +const NOOP_CALLBACK = function (error) { if (error) console.error(error); }; function debugApp(app) { assert(typeof app === 'object'); @@ -110,48 +109,23 @@ function checkAppHealth(app, callback) { }); } -function processApps(callback) { - apps.getAll(function (error, result) { - if (error) return callback(error); - - async.each(result, checkAppHealth, function (error) { - if (error) console.error(error); - - var alive = result - .filter(function (a) { return a.installationState === appdb.ISTATE_INSTALLED && a.runState === appdb.RSTATE_RUNNING && a.health === appdb.HEALTH_HEALTHY; }) - .map(function (a) { return (a.location || 'naked_domain') + '|' + a.manifest.id; }).join(', '); - - debug('apps alive: [%s]', alive); - - callback(null); - }); - }); -} - -function run() { - processApps(function (error) { - if (error) console.error(error); - - gRunTimeout = setTimeout(run, HEALTHCHECK_INTERVAL); - }); -} - /* OOM can be tested using stress tool like so: docker run -ti -m 100M cloudron/base:0.10.0 /bin/bash apt-get update && apt-get install stress stress --vm 1 --vm-bytes 200M --vm-hang 0 */ -function processDockerEvents() { - // note that for some reason, the callback is called only on the first event - debug('Listening for docker events'); +function processDockerEvents(interval, callback) { + assert.strictEqual(typeof interval, 'number'); + assert.strictEqual(typeof callback, 'function'); + const OOM_MAIL_LIMIT = 60 * 60 * 1000; // 60 minutes - var lastOomMailTime = new Date(new Date() - OOM_MAIL_LIMIT); + let lastOomMailTime = new Date(new Date() - OOM_MAIL_LIMIT); + const since = ((new Date().getTime() / 1000) - interval).toFixed(0); + const until = ((new Date().getTime() / 1000) - 1).toFixed(0); - docker.getEvents({ filters: JSON.stringify({ event: [ 'oom' ] }) }, function (error, stream) { - if (error) return console.error(error); - - gDockerEventStream = stream; + docker.getEvents({ since: since, until: until, filters: JSON.stringify({ event: [ 'oom' ] }) }, function (error, stream) { + if (error) return callback(error); stream.setEncoding('utf8'); stream.on('data', function (data) { @@ -173,34 +147,48 @@ function processDockerEvents() { }); stream.on('error', function (error) { - console.error('Error reading docker events', error); - gDockerEventStream = null; // TODO: reconnect? + debug('Error reading docker events', error); + callback(); }); - stream.on('end', function () { - console.error('Docker event stream ended'); - gDockerEventStream = null; // TODO: reconnect? + stream.on('end', callback); + + // safety hatch if 'until' doesn't work (there are cases where docker is working with a different time) + setTimeout(stream.destroy.bind(stream), 3000); // https://github.com/apocas/dockerode/issues/179 + }); +} + +function processApp(callback) { + assert.strictEqual(typeof callback, 'function'); + + apps.getAll(function (error, result) { + if (error) return callback(error); + + async.each(result, checkAppHealth, function (error) { + if (error) console.error(error); + + var alive = result + .filter(function (a) { return a.installationState === appdb.ISTATE_INSTALLED && a.runState === appdb.RSTATE_RUNNING && a.health === appdb.HEALTH_HEALTHY; }) + .map(function (a) { return (a.location || 'naked_domain') + '|' + a.manifest.id; }).join(', '); + + debug('apps alive: [%s]', alive); + + callback(null); }); }); } -function start(callback) { - assert.strictEqual(typeof callback, 'function'); +function run(interval, callback) { + assert.strictEqual(typeof interval, 'number'); - debug('Starting apphealthmonitor'); + callback = callback || NOOP_CALLBACK; - processDockerEvents(); + async.series([ + processDockerEvents.bind(null, interval), + processApp + ], function (error) { + if (error) debug(error); - run(); - - callback(); -} - -function stop(callback) { - assert.strictEqual(typeof callback, 'function'); - - clearTimeout(gRunTimeout); - if (gDockerEventStream) gDockerEventStream.end(); - - callback(); + callback(); + }); } diff --git a/src/cron.js b/src/cron.js index e9fff2ccc..9ea4a34f1 100644 --- a/src/cron.js +++ b/src/cron.js @@ -7,7 +7,8 @@ exports = module.exports = { stopJobs: stopJobs }; -var apps = require('./apps.js'), +var appHealthMonitor = require('./apphealthmonitor.js'), + apps = require('./apps.js'), appstore = require('./appstore.js'), assert = require('assert'), backups = require('./backups.js'), @@ -43,7 +44,8 @@ var gJobs = { digestEmail: null, dockerVolumeCleaner: null, dynamicDNS: null, - schedulerSync: null + schedulerSync: null, + appHealthMonitor: null }; var NOOP_CALLBACK = function (error) { if (error) console.error(error); }; @@ -196,6 +198,14 @@ function recreateJobs(tz) { start: true, timeZone: tz }); + + if (gJobs.appHealthMonitor) gJobs.appHealthMonitor.stop(); + gJobs.appHealthMonitor = new CronJob({ + cronTime: '*/10 * * * * *', // every 10 seconds + onTick: appHealthMonitor.run.bind(null, 10), + start: true, + timeZone: tz + }); } function boxAutoupdatePatternChanged(pattern) { diff --git a/src/routes/test/apps-test.js b/src/routes/test/apps-test.js index 03846889d..0bf631668 100644 --- a/src/routes/test/apps-test.js +++ b/src/routes/test/apps-test.js @@ -15,7 +15,6 @@ var accesscontrol = require('../../accesscontrol.js'), clients = require('../../clients.js'), config = require('../../config.js'), constants = require('../../constants.js'), - apphealthmonitor = require('../../apphealthmonitor.js'), database = require('../../database.js'), docker = require('../../docker.js').connection, expect = require('expect.js'), @@ -251,7 +250,6 @@ function stopBox(done) { // db is not cleaned up here since it's too late to call it after server.stop. if called before server.stop taskmanager apptasks are unhappy :/ async.series([ - apphealthmonitor.stop, taskmanager.stopPendingTasks, taskmanager.waitForPendingTasks, appdb._clear, @@ -643,7 +641,6 @@ describe('App installation', function () { async.series([ startBox, - apphealthmonitor.start, function (callback) { apiHockInstance