Files
cloudron-box/src/apphealthmonitor.js
T
Girish Ramakrishnan 86916a94de allow 401 and 403 errors to pass health check
way too many WP sites use some plugin to block health check routes.
maybe some day we will have dynamic health check route settable by user.
2020-11-10 16:50:36 -08:00

200 lines
7.8 KiB
JavaScript

'use strict';
var appdb = require('./appdb.js'),
apps = require('./apps.js'),
assert = require('assert'),
async = require('async'),
auditSource = require('./auditsource.js'),
BoxError = require('./boxerror.js'),
debug = require('debug')('box:apphealthmonitor'),
docker = require('./docker.js'),
eventlog = require('./eventlog.js'),
safe = require('safetydance'),
superagent = require('superagent'),
util = require('util');
exports = module.exports = {
run
};
const HEALTHCHECK_INTERVAL = 10 * 1000; // every 10 seconds. this needs to be small since the UI makes only healthy apps clickable
const UNHEALTHY_THRESHOLD = 10 * 60 * 1000; // 10 minutes
const OOM_EVENT_LIMIT = 60 * 60 * 1000; // 60 minutes
let gLastOomMailTime = Date.now() - (5 * 60 * 1000); // pretend we sent email 5 minutes ago
function debugApp(app) {
assert(typeof app === 'object');
debug(app.fqdn + ' ' + util.format.apply(util, Array.prototype.slice.call(arguments, 1)) + ' - ' + app.id);
}
function setHealth(app, health, callback) {
assert.strictEqual(typeof app, 'object');
assert.strictEqual(typeof health, 'string');
assert.strictEqual(typeof callback, 'function');
let now = new Date(), healthTime = app.healthTime, curHealth = app.health;
if (health === apps.HEALTH_HEALTHY) {
healthTime = now;
if (curHealth && curHealth !== apps.HEALTH_HEALTHY) { // app starts out with null health
debugApp(app, 'app switched from %s to healthy', curHealth);
// do not send mails for dev apps
if (!app.debugMode) eventlog.add(eventlog.ACTION_APP_UP, auditSource.HEALTH_MONITOR, { app: app });
}
} else if (Math.abs(now - healthTime) > UNHEALTHY_THRESHOLD) {
if (curHealth === apps.HEALTH_HEALTHY) {
debugApp(app, 'marking as unhealthy since not seen for more than %s minutes', UNHEALTHY_THRESHOLD/(60 * 1000));
// do not send mails for dev apps
if (!app.debugMode) eventlog.add(eventlog.ACTION_APP_DOWN, auditSource.HEALTH_MONITOR, { app: app });
}
} else {
debugApp(app, 'waiting for %s seconds to update the app health', (UNHEALTHY_THRESHOLD - Math.abs(now - healthTime))/1000);
return callback(null);
}
appdb.setHealth(app.id, health, healthTime, function (error) {
if (error && error.reason === BoxError.NOT_FOUND) return callback(null); // app uninstalled?
if (error) return callback(error);
app.health = health;
callback(null);
});
}
// callback is called with error for fatal errors and not if health check failed
function checkAppHealth(app, callback) {
assert.strictEqual(typeof app, 'object');
assert.strictEqual(typeof callback, 'function');
if (app.installationState !== apps.ISTATE_INSTALLED || app.runState !== apps.RSTATE_RUNNING) {
return callback(null);
}
const manifest = app.manifest;
docker.inspect(app.containerId, function (error, data) {
if (error || !data || !data.State) return setHealth(app, apps.HEALTH_ERROR, callback);
if (data.State.Running !== true) return setHealth(app, apps.HEALTH_DEAD, callback);
// non-appstore apps may not have healthCheckPath
if (!manifest.healthCheckPath) return setHealth(app, apps.HEALTH_HEALTHY, callback);
// poll through docker network instead of nginx to bypass any potential oauth proxy
var healthCheckUrl = 'http://127.0.0.1:' + app.httpPort + manifest.healthCheckPath;
superagent
.get(healthCheckUrl)
.set('Host', app.fqdn) // required for some apache configs with rewrite rules
.set('User-Agent', 'Mozilla (CloudronHealth)') // required for some apps (e.g. minio)
.redirects(0)
.timeout(HEALTHCHECK_INTERVAL)
.end(function (error, res) {
if (error && !error.response) {
setHealth(app, apps.HEALTH_UNHEALTHY, callback);
} else if (res.statusCode >= 403) { // 2xx and 3xx are ok. even 401 and 403 are ok for now (for WP sites)
setHealth(app, apps.HEALTH_UNHEALTHY, callback);
} else {
setHealth(app, apps.HEALTH_HEALTHY, callback);
}
});
});
}
function getContainerInfo(containerId, callback) {
docker.inspect(containerId, function (error, result) {
if (error) return callback(error);
const appId = safe.query(result, 'Config.Labels.appId', null);
if (!appId) return callback(null, null /* app */, { name: result.Name }); // addon
apps.get(appId, callback); // don't get by container id as this can be an exec container
});
}
/*
OOM can be tested using stress tool like so:
docker run -ti -m 100M cloudron/base:0.10.0 /bin/bash
apt-get update && apt-get install stress
stress --vm 1 --vm-bytes 200M --vm-hang 0
*/
function processDockerEvents(intervalSecs, callback) {
assert.strictEqual(typeof intervalSecs, 'number');
assert.strictEqual(typeof callback, 'function');
const since = ((new Date().getTime() / 1000) - intervalSecs).toFixed(0);
const until = ((new Date().getTime() / 1000) - 1).toFixed(0);
docker.getEvents({ since: since, until: until, filters: JSON.stringify({ event: [ 'oom' ] }) }, function (error, stream) {
if (error) return callback(error);
stream.setEncoding('utf8');
stream.on('data', function (data) {
const event = JSON.parse(data);
const containerId = String(event.id);
getContainerInfo(containerId, function (error, app, addon) {
const program = error ? containerId : (app ? app.fqdn : addon.name);
const now = Date.now();
const notifyUser = !(app && app.debugMode) && ((now - gLastOomMailTime) > OOM_EVENT_LIMIT);
debug('OOM %s notifyUser: %s. lastOomTime: %s (now: %s)', program, notifyUser, gLastOomMailTime, now);
// do not send mails for dev apps
if (notifyUser) {
// app can be null for addon containers
eventlog.add(eventlog.ACTION_APP_OOM, auditSource.HEALTH_MONITOR, { event: event, containerId: containerId, addon: addon || null, app: app || null });
gLastOomMailTime = now;
}
});
});
stream.on('error', function (error) {
debug('Error reading docker events', error);
callback();
});
stream.on('end', callback);
// safety hatch if 'until' doesn't work (there are cases where docker is working with a different time)
setTimeout(stream.destroy.bind(stream), 3000); // https://github.com/apocas/dockerode/issues/179
});
}
function processApp(callback) {
assert.strictEqual(typeof callback, 'function');
apps.getAll(function (error, allApps) {
if (error) return callback(error);
async.each(allApps, checkAppHealth, function (error) {
const alive = allApps
.filter(function (a) { return a.installationState === apps.ISTATE_INSTALLED && a.runState === apps.RSTATE_RUNNING && a.health === apps.HEALTH_HEALTHY; });
debug(`app health: ${alive.length} alive / ${allApps.length - alive.length} dead.` + (error ? ` ${error.reason}` : ''));
callback(null);
});
});
}
function run(intervalSecs, callback) {
assert.strictEqual(typeof intervalSecs, 'number');
assert.strictEqual(typeof callback, 'function');
async.series([
processApp, // this is first because docker.getEvents seems to get 'stuck' sometimes
processDockerEvents.bind(null, intervalSecs)
], function (error) {
if (error) debug(`run: could not check app health. ${error.message}`);
callback();
});
}