Files
cloudron-box/src/apphealthmonitor.js
T
Girish Ramakrishnan f2c8102a6a track last oom time using a global variable
because it was a local variable, we were just sending out oom mails
like crazy

also, fixes an issue that if docker.getEvents gets stuck because
docker does not respond then we do not do any health monitoring.
i guess this can happen if the docker API gets stuck.

(cherry picked from commit a536e9fc4b)
2018-12-16 21:07:14 -08:00

198 lines
7.4 KiB
JavaScript

'use strict';
var appdb = require('./appdb.js'),
apps = require('./apps.js'),
assert = require('assert'),
async = require('async'),
DatabaseError = require('./databaseerror.js'),
debug = require('debug')('box:apphealthmonitor'),
docker = require('./docker.js').connection,
mailer = require('./mailer.js'),
superagent = require('superagent'),
util = require('util');
exports = module.exports = {
run: run
};
const HEALTHCHECK_INTERVAL = 10 * 1000; // every 10 seconds. this needs to be small since the UI makes only healthy apps clickable
const UNHEALTHY_THRESHOLD = 10 * 60 * 1000; // 10 minutes
let gHealthInfo = { }; // { time, emailSent }
const OOM_MAIL_LIMIT = 60 * 60 * 1000; // 60 minutes
let gLastOomMailTime = Date.now() - (5 * 60 * 1000); // pretend we sent email 5 minutes ago
function debugApp(app) {
assert(typeof app === 'object');
debug(app.fqdn + ' ' + app.manifest.id + ' ' + util.format.apply(util, Array.prototype.slice.call(arguments, 1)) + ' - ' + app.id);
}
function setHealth(app, health, callback) {
assert.strictEqual(typeof app, 'object');
assert.strictEqual(typeof health, 'string');
assert.strictEqual(typeof callback, 'function');
var now = new Date();
if (!(app.id in gHealthInfo)) { // add new apps to list
gHealthInfo[app.id] = { time: now, emailSent: false };
}
if (health === appdb.HEALTH_HEALTHY) {
gHealthInfo[app.id].time = now;
} else if (Math.abs(now - gHealthInfo[app.id].time) > UNHEALTHY_THRESHOLD) {
if (gHealthInfo[app.id].emailSent) return callback(null);
debugApp(app, 'marking as unhealthy since not seen for more than %s minutes', UNHEALTHY_THRESHOLD/(60 * 1000));
if (!app.debugMode) mailer.appDied(app); // do not send mails for dev apps
gHealthInfo[app.id].emailSent = true;
} else {
debugApp(app, 'waiting for sometime to update the app health');
return callback(null);
}
appdb.setHealth(app.id, health, function (error) {
if (error && error.reason === DatabaseError.NOT_FOUND) return callback(null); // app uninstalled?
if (error) return callback(error);
app.health = health;
callback(null);
});
}
// callback is called with error for fatal errors and not if health check failed
function checkAppHealth(app, callback) {
assert.strictEqual(typeof app, 'object');
assert.strictEqual(typeof callback, 'function');
if (app.installationState !== appdb.ISTATE_INSTALLED || app.runState !== appdb.RSTATE_RUNNING) {
debugApp(app, 'skipped. istate:%s rstate:%s', app.installationState, app.runState);
return callback(null);
}
var container = docker.getContainer(app.containerId),
manifest = app.manifest;
container.inspect(function (err, data) {
if (err || !data || !data.State) {
debugApp(app, 'Error inspecting container');
return setHealth(app, appdb.HEALTH_ERROR, callback);
}
if (data.State.Running !== true) {
debugApp(app, 'exited');
return setHealth(app, appdb.HEALTH_DEAD, callback);
}
// non-appstore apps may not have healthCheckPath
if (!manifest.healthCheckPath) return setHealth(app, appdb.HEALTH_HEALTHY, callback);
// poll through docker network instead of nginx to bypass any potential oauth proxy
var healthCheckUrl = 'http://127.0.0.1:' + app.httpPort + manifest.healthCheckPath;
superagent
.get(healthCheckUrl)
.set('Host', app.fqdn) // required for some apache configs with rewrite rules
.set('User-Agent', 'Mozilla') // required for some apps (e.g. minio)
.redirects(0)
.timeout(HEALTHCHECK_INTERVAL)
.end(function (error, res) {
if (error && !error.response) {
debugApp(app, 'not alive (network error): %s', error.message);
setHealth(app, appdb.HEALTH_UNHEALTHY, callback);
} else if (res.statusCode >= 400) { // 2xx and 3xx are ok
debugApp(app, 'not alive : %s', error || res.status);
setHealth(app, appdb.HEALTH_UNHEALTHY, callback);
} else {
setHealth(app, appdb.HEALTH_HEALTHY, callback);
}
});
});
}
/*
OOM can be tested using stress tool like so:
docker run -ti -m 100M cloudron/base:0.10.0 /bin/bash
apt-get update && apt-get install stress
stress --vm 1 --vm-bytes 200M --vm-hang 0
*/
function processDockerEvents(intervalSecs, callback) {
assert.strictEqual(typeof intervalSecs, 'number');
assert.strictEqual(typeof callback, 'function');
const since = ((new Date().getTime() / 1000) - intervalSecs).toFixed(0);
const until = ((new Date().getTime() / 1000) - 1).toFixed(0);
docker.getEvents({ since: since, until: until, filters: JSON.stringify({ event: [ 'oom' ] }) }, function (error, stream) {
if (error) return callback(error);
stream.setEncoding('utf8');
stream.on('data', function (data) {
var ev = JSON.parse(data);
appdb.getByContainerId(ev.id, function (error, app) { // this can error for addons
var program = error || !app.appStoreId ? ev.id : app.appStoreId;
var context = JSON.stringify(ev);
var now = Date.now();
if (app) context = context + '\n\n' + JSON.stringify(app, null, 4) + '\n';
const notifyUser = (!app || !app.debugMode) && (now - gLastOomMailTime > OOM_MAIL_LIMIT);
debug('OOM Context: %s. notifyUser: %s. lastOomTime: %s (now: %s)', context, notifyUser, gLastOomMailTime, now);
// do not send mails for dev apps
if (notifyUser) {
mailer.oomEvent(program, context); // app can be null if it's an addon crash
gLastOomMailTime = now;
}
});
});
stream.on('error', function (error) {
debug('Error reading docker events', error);
callback();
});
stream.on('end', callback);
// safety hatch if 'until' doesn't work (there are cases where docker is working with a different time)
setTimeout(stream.destroy.bind(stream), 3000); // https://github.com/apocas/dockerode/issues/179
});
}
function processApp(callback) {
assert.strictEqual(typeof callback, 'function');
apps.getAll(function (error, result) {
if (error) return callback(error);
async.each(result, checkAppHealth, function (error) {
if (error) console.error(error);
var alive = result
.filter(function (a) { return a.installationState === appdb.ISTATE_INSTALLED && a.runState === appdb.RSTATE_RUNNING && a.health === appdb.HEALTH_HEALTHY; })
.map(function (a) { return (a.location || 'naked_domain') + '|' + a.manifest.id; }).join(', ');
debug('apps alive: [%s]', alive);
callback(null);
});
});
}
function run(intervalSecs, callback) {
assert.strictEqual(typeof intervalSecs, 'number');
assert.strictEqual(typeof callback, 'function');
async.series([
processApp, // this is first because docker.getEvents seems to get 'stuck' sometimes
processDockerEvents.bind(null, intervalSecs)
], function (error) {
if (error) debug(error);
callback();
});
}