Files
cloudron-box/src/apphealthmonitor.js
T

209 lines
7.8 KiB
JavaScript
Raw Normal View History

'use strict';
2015-09-14 10:52:11 -07:00
var appdb = require('./appdb.js'),
apps = require('./apps.js'),
assert = require('assert'),
async = require('async'),
2019-03-25 15:07:06 -07:00
auditSource = require('./auditsource.js'),
BoxError = require('./boxerror.js'),
2015-09-14 10:52:11 -07:00
debug = require('debug')('box:apphealthmonitor'),
2019-03-06 11:54:37 -08:00
docker = require('./docker.js'),
2019-01-17 15:31:34 +01:00
eventlog = require('./eventlog.js'),
2019-03-06 11:54:37 -08:00
safe = require('safetydance'),
superagent = require('superagent'),
util = require('util');
// public interface of this module: only the periodic entry point is exposed
exports = module.exports = {
    run
};
2018-12-16 20:30:09 -08:00
// time units, in milliseconds
const ONE_SECOND = 1000;
const ONE_MINUTE = 60 * ONE_SECOND;

// used as the timeout of the health check HTTP request
const HEALTHCHECK_INTERVAL = 10 * ONE_SECOND; // every 10 seconds. this needs to be small since the UI makes only healthy apps clickable
// how long an app may go without a healthy check before a non-healthy state is recorded
const UNHEALTHY_THRESHOLD = 10 * ONE_MINUTE; // 10 minutes
// minimum gap between two OOM events being added to the eventlog
const OOM_EVENT_LIMIT = 60 * ONE_MINUTE; // 60 minutes

let gLastOomMailTime = Date.now() - (5 * ONE_MINUTE); // pretend we sent email 5 minutes ago
2015-09-14 10:52:11 -07:00
// Writes a debug line of the form "<fqdn> <message> - <id>" for the given app.
// Every argument after `app` is handed to util.format (printf-style format string + values).
function debugApp(app) {
    assert(typeof app === 'object');

    const formatArgs = Array.from(arguments).slice(1);
    const message = util.format(...formatArgs);

    debug(`${app.fqdn} ${message} - ${app.id}`);
}
// Records the outcome of a health check for an app.
//  - healthy: the health time is set to now; an APP_UP event is added when the app was
//    previously in a non-healthy (and non-null) state.
//  - not healthy: nothing is stored until the last health time is more than
//    UNHEALTHY_THRESHOLD away; at that point the new state is stored and an APP_DOWN
//    event is added if the app was healthy before.
// callback(error) is only called with an error when storing the health fails for a
// reason other than the app no longer existing.
function setHealth(app, health, callback) {
    assert.strictEqual(typeof app, 'object');
    assert.strictEqual(typeof health, 'string');
    assert.strictEqual(typeof callback, 'function');

    const now = new Date();
    const previousHealth = app.health;
    let healthTime = app.healthTime;

    if (health !== apps.HEALTH_HEALTHY) {
        const elapsed = Math.abs(now - healthTime);

        // still within the grace period: keep the stored state untouched
        if (!(elapsed > UNHEALTHY_THRESHOLD)) {
            debugApp(app, 'waiting for %s seconds to update the app health', (UNHEALTHY_THRESHOLD - elapsed)/1000);
            return callback(null);
        }

        if (previousHealth === apps.HEALTH_HEALTHY) {
            debugApp(app, 'marking as unhealthy since not seen for more than %s minutes', UNHEALTHY_THRESHOLD/(60 * 1000));

            // do not send mails for dev apps
            if (!app.debugMode) eventlog.add(eventlog.ACTION_APP_DOWN, auditSource.HEALTH_MONITOR, { app: app });
        }
    } else {
        healthTime = now;

        // app starts out with null health, which is not reported as a recovery
        if (previousHealth && previousHealth !== apps.HEALTH_HEALTHY) {
            debugApp(app, 'app switched from %s to healthy', previousHealth);

            // do not send mails for dev apps
            if (!app.debugMode) eventlog.add(eventlog.ACTION_APP_UP, auditSource.HEALTH_MONITOR, { app: app });
        }
    }

    appdb.setHealth(app.id, health, healthTime, (error) => {
        if (error) {
            if (error.reason === BoxError.NOT_FOUND) return callback(null); // app uninstalled?
            return callback(error);
        }

        app.health = health; // keep the in-memory object in sync with what was stored

        callback(null);
    });
}
// Determines the health of a single app and records it via setHealth().
// Apps that are not installed and running are skipped. Otherwise the container is
// inspected and, if the manifest has a healthCheckPath, that path is requested over HTTP.
// callback is called with error for fatal errors and not if health check failed
function checkAppHealth(app, callback) {
    assert.strictEqual(typeof app, 'object');
    assert.strictEqual(typeof callback, 'function');

    const eligible = app.installationState === apps.ISTATE_INSTALLED && app.runState === apps.RSTATE_RUNNING;
    if (!eligible) return callback(null);

    const manifest = app.manifest;
    const report = (health) => setHealth(app, health, callback);

    docker.inspect(app.containerId, (inspectError, container) => {
        if (inspectError || !container || !container.State) {
            debugApp(app, 'Error inspecting container');
            return report(apps.HEALTH_ERROR);
        }

        if (container.State.Running !== true) {
            debugApp(app, 'exited');
            return report(apps.HEALTH_DEAD);
        }

        // non-appstore apps may not have healthCheckPath
        if (!manifest.healthCheckPath) return report(apps.HEALTH_HEALTHY);

        // poll through docker network instead of nginx to bypass any potential oauth proxy
        const healthCheckUrl = `http://127.0.0.1:${app.httpPort}${manifest.healthCheckPath}`;

        superagent
            .get(healthCheckUrl)
            .set('Host', app.fqdn) // required for some apache configs with rewrite rules
            .set('User-Agent', 'Mozilla (CloudronHealth)') // required for some apps (e.g. minio)
            .redirects(0)
            .timeout(HEALTHCHECK_INTERVAL)
            .end((requestError, res) => {
                // an error without a response means the request itself failed;
                // with a response, 2xx and 3xx are ok
                const failed = (requestError && !requestError.response) || res.statusCode >= 400;

                report(failed ? apps.HEALTH_UNHEALTHY : apps.HEALTH_HEALTHY);
            });
    });
}
2019-03-06 11:54:37 -08:00
// Resolves a container id to what it belongs to.
// callback(error, app, addon): when the container carries an appId label the result of
// apps.get() is passed through; otherwise app is null and addon is { name: <container name> }.
function getContainerInfo(containerId, callback) {
    docker.inspect(containerId, (inspectError, container) => {
        if (inspectError) return callback(inspectError);

        const ownerAppId = safe.query(container, 'Config.Labels.appId', null);

        if (ownerAppId) {
            // don't get by container id as this can be an exec container
            apps.get(ownerAppId, callback);
            return;
        }

        callback(null, null /* app */, { name: container.Name }); // addon
    });
}
2015-09-14 17:20:30 -07:00
/*
OOM can be tested using stress tool like so:
2017-02-16 09:20:27 -08:00
docker run -ti -m 100M cloudron/base:0.10.0 /bin/bash
2015-09-14 17:20:30 -07:00
apt-get update && apt-get install stress
stress --vm 1 --vm-bytes 200M --vm-hang 0
*/
2018-12-16 20:30:09 -08:00
// Looks for docker 'oom' events in the window [now - intervalSecs, now - 1s] and adds an
// ACTION_APP_OOM entry to the eventlog, at most once per OOM_EVENT_LIMIT and never for
// apps in debug mode.
//
// intervalSecs - how far back (in seconds) to look for events
// callback(error) - called exactly once; error is only set when the event stream could not
//                   be opened. Stream read errors are logged and otherwise ignored.
function processDockerEvents(intervalSecs, callback) {
    assert.strictEqual(typeof intervalSecs, 'number');
    assert.strictEqual(typeof callback, 'function');

    const nowSecs = Date.now() / 1000;
    const since = (nowSecs - intervalSecs).toFixed(0);
    const until = (nowSecs - 1).toFixed(0);

    docker.getEvents({ since: since, until: until, filters: JSON.stringify({ event: [ 'oom' ] }) }, function (error, stream) {
        if (error) return callback(error);

        let finished = false;
        let safetyTimer = null;
        let partial = ''; // trailing text of the previous chunk that did not parse yet

        // 'error', 'end' and 'close' can all fire for the same stream; report completion only once
        function finish() {
            if (finished) return;
            finished = true;

            clearTimeout(safetyTimer);
            if (partial.trim() !== '') debug('Discarding incomplete docker event: %s', partial);

            callback();
        }

        // returns the parsed event object or null if text is not (yet) a complete JSON object
        function parseEvent(text) {
            try {
                const parsed = JSON.parse(text);
                return (parsed && typeof parsed === 'object') ? parsed : null;
            } catch (e) {
                return null;
            }
        }

        function handleOomEvent(event) {
            const containerId = String(event.id);

            getContainerInfo(containerId, function (error, app, addon) {
                // fall back to the container id when neither an app nor an addon could be resolved
                const program = error ? containerId : (app ? app.fqdn : (addon ? addon.name : containerId));
                const now = Date.now();
                const notifyUser = !(app && app.debugMode) && ((now - gLastOomMailTime) > OOM_EVENT_LIMIT);

                debug('OOM %s notifyUser: %s. lastOomTime: %s (now: %s)', program, notifyUser, gLastOomMailTime, now);

                // do not send mails for dev apps
                if (!notifyUser) return;

                // app can be null for addon containers
                eventlog.add(eventlog.ACTION_APP_OOM, auditSource.HEALTH_MONITOR, { event: event, containerId: containerId, addon: addon || null, app: app || null });

                gLastOomMailTime = now;
            });
        }

        stream.setEncoding('utf8');

        stream.on('data', function (chunk) {
            // a chunk is not guaranteed to be exactly one JSON document: it may carry several
            // newline separated events or only a part of one. a bad chunk must not throw here.
            const pieces = (partial + chunk).split('\n');
            partial = '';

            pieces.forEach(function (piece, index) {
                if (piece.trim() === '') return;

                const event = parseEvent(piece);
                if (event) return handleOomEvent(event);

                if (index === pieces.length - 1) {
                    partial = piece; // may be completed by the next chunk
                } else {
                    debug('Ignoring unparseable docker event: %s', piece);
                }
            });
        });

        stream.on('error', function (error) {
            debug('Error reading docker events', error);
            finish();
        });

        stream.on('end', finish);

        // a destroyed stream is not guaranteed to emit 'end', so also complete on 'close'
        stream.on('close', finish);

        // safety hatch if 'until' doesn't work (there are cases where docker is working with a different time)
        safetyTimer = setTimeout(function () { stream.destroy(); }, 3000); // https://github.com/apocas/dockerode/issues/179
    });
}
2018-10-22 11:39:42 -07:00
// Runs checkAppHealth on every app and logs a summary of how many are alive.
// callback(error) gets an error only when the app list cannot be fetched; an error
// reported by the health checks is printed and not passed on.
function processApp(callback) {
    assert.strictEqual(typeof callback, 'function');

    // alive means installed, running and last recorded as healthy
    const isAlive = (a) => a.installationState === apps.ISTATE_INSTALLED && a.runState === apps.RSTATE_RUNNING && a.health === apps.HEALTH_HEALTHY;

    apps.getAll((getAllError, allApps) => {
        if (getAllError) return callback(getAllError);

        async.each(allApps, checkAppHealth, (checkError) => {
            if (checkError) console.error(checkError);

            const aliveCount = allApps.filter(isAlive).length;
            const deadCount = allApps.length - aliveCount;

            debug(`app health: ${aliveCount} alive / ${deadCount} dead`);

            callback(null);
        });
    });
}
2015-09-14 11:02:06 -07:00
2018-12-16 20:30:09 -08:00
// Entry point: checks the health of all apps, then processes docker OOM events of the
// last intervalSecs seconds. Errors of either step are logged; callback is always
// called without arguments.
function run(intervalSecs, callback) {
    assert.strictEqual(typeof intervalSecs, 'number');
    assert.strictEqual(typeof callback, 'function');

    const steps = [
        processApp, // this is first because docker.getEvents seems to get 'stuck' sometimes
        (done) => processDockerEvents(intervalSecs, done)
    ];

    async.series(steps, (error) => {
        if (error) debug(error);

        callback();
    });
}