'use strict';

const appdb = require('./appdb.js'),
    apps = require('./apps.js'),
    assert = require('assert'),
    async = require('async'),
    auditSource = require('./auditsource.js'),
    BoxError = require('./boxerror.js'),
    constants = require('./constants.js'),
    debug = require('debug')('box:apphealthmonitor'),
    docker = require('./docker.js'),
    eventlog = require('./eventlog.js'),
    safe = require('safetydance'),
    superagent = require('superagent');

exports = module.exports = {
    run
};

const HEALTHCHECK_INTERVAL = 10 * 1000; // every 10 seconds. this needs to be small since the UI makes only healthy apps clickable
const UNHEALTHY_THRESHOLD = 20 * 60 * 1000; // 20 minutes

const OOM_EVENT_LIMIT = 60 * 60 * 1000; // 60 minutes
let gStartTime = null; // time when apphealthmonitor was started
let gLastOomMailTime = Date.now() - (5 * 60 * 1000); // pretend we sent email 5 minutes ago

// Records a freshly observed health value for an app.
//   app      - app object; app.health and app.healthTime are read and, after a successful db write, updated in place
//   health   - one of the apps.HEALTH_* strings
//   callback - called with (error); a BoxError.NOT_FOUND from the db is treated as success
function setHealth(app, health, callback) {
    assert.strictEqual(typeof app, 'object');
    assert.strictEqual(typeof health, 'string');
    assert.strictEqual(typeof callback, 'function');

    // app starts out with null health
    // if it became healthy, we update immediately. this is required for ui to say "running" etc
    // if it became unhealthy/error/dead, wait for a threshold before updating db

    const now = new Date(), lastHealth = app.health;
    let healthTime = gStartTime > app.healthTime ? gStartTime : app.healthTime; // on box restart, clamp value to start time

    if (health === apps.HEALTH_HEALTHY) {
        healthTime = now;
        if (lastHealth && lastHealth !== apps.HEALTH_HEALTHY) { // app starts out with null health
            debug(`setHealth: ${app.id} (${app.fqdn}) switched from ${lastHealth} to healthy`);

            // do not send mails for dev apps
            if (!app.debugMode) eventlog.add(eventlog.ACTION_APP_UP, auditSource.HEALTH_MONITOR, { app: app });
        }
    } else if (Math.abs(now - healthTime) > UNHEALTHY_THRESHOLD) {
        // the down event is only raised on the healthy -> not-healthy transition
        if (lastHealth === apps.HEALTH_HEALTHY) {
            debug(`setHealth: marking ${app.id} (${app.fqdn}) as unhealthy since not seen for more than ${UNHEALTHY_THRESHOLD/(60 * 1000)} minutes`);

            // do not send mails for dev apps
            if (!app.debugMode) eventlog.add(eventlog.ACTION_APP_DOWN, auditSource.HEALTH_MONITOR, { app: app });
        }
    } else {
        // still inside the grace period: leave the db and the in-memory app untouched
        debug(`setHealth: ${app.id} (${app.fqdn}) waiting for ${(UNHEALTHY_THRESHOLD - Math.abs(now - healthTime))/1000} to update health`);
        return callback(null);
    }

    appdb.setHealth(app.id, health, healthTime, function (error) {
        if (error && error.reason === BoxError.NOT_FOUND) return callback(null); // app uninstalled?
        if (error) return callback(error);

        app.health = health;
        app.healthTime = healthTime;

        callback(null);
    });
}

// Determines the health of a single app and records it via setHealth.
// Apps that are not installed and running are skipped.
// callback is called with error for fatal errors and not if health check failed
function checkAppHealth(app, callback) {
    assert.strictEqual(typeof app, 'object');
    assert.strictEqual(typeof callback, 'function');

    if (app.installationState !== apps.ISTATE_INSTALLED || app.runState !== apps.RSTATE_RUNNING) {
        return callback(null);
    }

    const manifest = app.manifest;

    docker.inspect(app.containerId, function (error, data) {
        if (error || !data || !data.State) return setHealth(app, apps.HEALTH_ERROR, callback);
        if (data.State.Running !== true) return setHealth(app, apps.HEALTH_DEAD, callback);

        // non-appstore apps may not have healthCheckPath
        if (!manifest.healthCheckPath) return setHealth(app, apps.HEALTH_HEALTHY, callback);

        const healthCheckUrl = `http://${app.containerIp}:${manifest.httpPort}${manifest.healthCheckPath}`;
        superagent
            .get(healthCheckUrl)
            .set('Host', app.fqdn) // required for some apache configs with rewrite rules
            .set('User-Agent', 'Mozilla (CloudronHealth)') // required for some apps (e.g. minio)
            .redirects(0)
            .timeout(HEALTHCHECK_INTERVAL)
            .end(function (error, res) {
                if (error && !error.response) {
                    // no HTTP response at all
                    setHealth(app, apps.HEALTH_UNHEALTHY, callback);
                } else if (res.statusCode > 403) { // 2xx and 3xx are ok. even 401 and 403 are ok for now (for WP sites)
                    setHealth(app, apps.HEALTH_UNHEALTHY, callback);
                } else {
                    setHealth(app, apps.HEALTH_HEALTHY, callback);
                }
            });
    });
}

// Resolves a container id to the thing it belongs to.
// callback is called with (error), with (null, null, { name }) when the container has no appId label,
// and otherwise with whatever apps.get() yields for that appId.
function getContainerInfo(containerId, callback) {
    docker.inspect(containerId, function (error, result) {
        if (error) return callback(error);

        const appId = safe.query(result, 'Config.Labels.appId', null);

        if (!appId) return callback(null, null /* app */, { name: result.Name }); // addon

        apps.get(appId, callback); // don't get by container id as this can be an exec container
    });
}

/*
    OOM can be tested using stress tool like so:
        docker run -ti -m 100M cloudron/base:2.0.0 /bin/bash
        apt-get update && apt-get install stress
        stress --vm 1 --vm-bytes 200M --vm-hang 0
*/
// Reads docker 'oom' events from the last intervalSecs seconds and adds an ACTION_APP_OOM eventlog entry,
// at most once per OOM_EVENT_LIMIT and never for apps in debugMode.
// callback is called with an error if the event stream could not be opened, otherwise without arguments
// once the stream ends or errors. Lookups started by individual events are not awaited.
function processDockerEvents(intervalSecs, callback) {
    assert.strictEqual(typeof intervalSecs, 'number');
    assert.strictEqual(typeof callback, 'function');

    const since = ((new Date().getTime() / 1000) - intervalSecs).toFixed(0);
    const until = ((new Date().getTime() / 1000) - 1).toFixed(0);

    docker.getEvents({ since: since, until: until, filters: JSON.stringify({ event: [ 'oom' ] }) }, function (error, stream) {
        if (error) return callback(error);

        // 'error' and 'end' can both be emitted for the same stream; the caller must only be notified once
        let finished = false;
        function finish() {
            if (finished) return;
            finished = true;
            callback();
        }

        stream.setEncoding('utf8');
        stream.on('data', function (data) {
            let event;
            try {
                event = JSON.parse(data);
            } catch (parseError) {
                // a malformed or partial chunk must not throw out of the event emitter
                debug('Error parsing docker event', parseError);
                return;
            }

            const containerId = String(event.id);

            getContainerInfo(containerId, function (error, app, addon) {
                const program = error ? containerId : (app ? app.fqdn : addon.name);
                const now = Date.now();
                const notifyUser = !(app && app.debugMode) && ((now - gLastOomMailTime) > OOM_EVENT_LIMIT);

                debug('OOM %s notifyUser: %s. lastOomTime: %s (now: %s)', program, notifyUser, gLastOomMailTime, now);

                // do not send mails for dev apps
                if (notifyUser) {
                    // app can be null for addon containers
                    eventlog.add(eventlog.ACTION_APP_OOM, auditSource.HEALTH_MONITOR, { event: event, containerId: containerId, addon: addon || null, app: app || null });

                    gLastOomMailTime = now;
                }
            });
        });

        stream.on('error', function (error) {
            debug('Error reading docker events', error);
            finish();
        });

        stream.on('end', finish);

        // safety hatch if 'until' doesn't work (there are cases where docker is working with a different time)
        setTimeout(stream.destroy.bind(stream), 3000); // https://github.com/apocas/dockerode/issues/179
    });
}

// Runs checkAppHealth on every app and logs a summary.
// callback is called with an error only if the app list could not be fetched; a failure from an
// individual check is logged in the summary line and not propagated.
function processApp(callback) {
    assert.strictEqual(typeof callback, 'function');

    apps.getAll(function (error, allApps) {
        if (error) return callback(error);

        async.each(allApps, checkAppHealth, function (error) {
            const alive = allApps
                .filter(function (a) { return a.installationState === apps.ISTATE_INSTALLED && a.runState === apps.RSTATE_RUNNING && a.health === apps.HEALTH_HEALTHY; });

            // everything that is not installed + running + healthy is counted as "dead" here
            debug(`app health: ${alive.length} alive / ${allApps.length - alive.length} dead.` + (error ? ` ${error.reason}` : ''));

            callback(null);
        });
    });
}

// Entry point: checks the health of all apps, then processes docker OOM events of the last intervalSecs seconds.
// callback is always called without an error; failures are only logged.
// NOTE: when constants.TEST is set this returns immediately and callback is never invoked.
function run(intervalSecs, callback) {
    assert.strictEqual(typeof intervalSecs, 'number');
    assert.strictEqual(typeof callback, 'function');

    if (constants.TEST) return;

    if (!gStartTime) gStartTime = new Date();

    async.series([
        processApp, // this is first because docker.getEvents seems to get 'stuck' sometimes
        processDockerEvents.bind(null, intervalSecs)
    ], function (error) {
        if (error) debug(`run: could not check app health. ${error.message}`);

        callback();
    });
}