Files
cloudron-box/src/apphealthmonitor.js
T

182 lines
7.3 KiB
JavaScript
Raw Normal View History

'use strict';
2021-08-20 09:19:44 -07:00
const apps = require('./apps.js'),
assert = require('assert'),
2021-09-30 09:50:30 -07:00
AuditSource = require('./auditsource.js'),
BoxError = require('./boxerror.js'),
2021-06-03 12:20:44 -07:00
constants = require('./constants.js'),
2015-09-14 10:52:11 -07:00
debug = require('debug')('box:apphealthmonitor'),
2019-03-06 11:54:37 -08:00
docker = require('./docker.js'),
2019-01-17 15:31:34 +01:00
eventlog = require('./eventlog.js'),
2019-03-06 11:54:37 -08:00
safe = require('safetydance'),
2021-08-25 19:41:46 -07:00
superagent = require('superagent');
exports = module.exports = {
2020-10-07 14:47:51 -07:00
run
};
2021-03-04 12:03:59 -08:00
// how long (ms) a non-healthy result has to persist before setHealth writes it to the database
const UNHEALTHY_THRESHOLD = 20 * 60 * 1000; // 20 minutes
2018-10-22 11:39:42 -07:00
2021-06-23 17:15:37 -07:00
// minimum gap (ms) between two OOM events added to the eventlog by processDockerEvents
const OOM_EVENT_LIMIT = 60 * 60 * 1000; // will only raise 1 oom event every hour
2021-03-04 12:03:59 -08:00
// set once by the first call to run(); setHealth uses it as a lower bound for app.healthTime
let gStartTime = null; // time when apphealthmonitor was started
2018-12-16 20:30:09 -08:00
// NOTE(review): with OOM_EVENT_LIMIT at one hour, back-dating by only 5 minutes means no OOM event
// is raised during the first 55 minutes after module load - confirm this is intended
let gLastOomMailTime = Date.now() - (5 * 60 * 1000); // pretend we sent email 5 minutes ago
2021-08-25 19:41:46 -07:00
/**
 * Records a freshly observed health state for an app.
 *
 * A healthy result is stored right away (the ui needs it to say "running").
 * Any other result is only stored once it has lasted longer than
 * UNHEALTHY_THRESHOLD; until then the call just logs and returns.
 * Transitions to and from healthy are added to the eventlog, except for apps
 * in debug mode.
 *
 * @param {object} app - app record; its health and healthTime are updated in place after a successful store
 * @param {string} health - one of the apps.HEALTH_* values
 * @returns {Promise<void>}
 * @throws whatever apps.setHealth fails with, except BoxError.NOT_FOUND
 */
async function setHealth(app, health) {
    assert.strictEqual(typeof app, 'object');
    assert.strictEqual(typeof health, 'string');

    const now = new Date();
    const previousHealth = app.health; // null for an app that was never checked
    const wasHealthy = previousHealth === apps.HEALTH_HEALTHY;

    // on box restart, clamp value to start time
    let healthTime = app.healthTime;
    if (gStartTime > app.healthTime) healthTime = gStartTime;

    if (health === apps.HEALTH_HEALTHY) {
        healthTime = now;

        const recovered = previousHealth && !wasHealthy;
        if (recovered) {
            debug(`setHealth: ${app.id} (${app.fqdn}) switched from ${previousHealth} to healthy`);

            // do not send mails for dev apps
            if (!app.debugMode) await eventlog.add(eventlog.ACTION_APP_UP, AuditSource.HEALTH_MONITOR, { app: app });
        }
    } else {
        const sinceLastSeen = Math.abs(now - healthTime);

        // still inside the grace period: leave the stored state alone
        if (!(sinceLastSeen > UNHEALTHY_THRESHOLD)) {
            debug(`setHealth: ${app.id} (${app.fqdn}) waiting for ${(UNHEALTHY_THRESHOLD - sinceLastSeen)/1000} to update health`);
            return;
        }

        if (wasHealthy) {
            debug(`setHealth: marking ${app.id} (${app.fqdn}) as unhealthy since not seen for more than ${UNHEALTHY_THRESHOLD/(60 * 1000)} minutes`);

            // do not send mails for dev apps
            if (!app.debugMode) await eventlog.add(eventlog.ACTION_APP_DOWN, AuditSource.HEALTH_MONITOR, { app: app });
        }
    }

    const [error] = await safe(apps.setHealth(app.id, health, healthTime));
    if (error) {
        if (error.reason === BoxError.NOT_FOUND) return; // app uninstalled?
        throw error;
    }

    app.health = health;
    app.healthTime = healthTime;
}
// rejects only for fatal errors; a failed health check is recorded via setHealth and does not reject
2021-08-25 19:41:46 -07:00
/**
 * Probes one app and records the outcome through setHealth().
 *
 * Apps that are not installed or not running are skipped. Otherwise:
 *  - container cannot be inspected / has no State -> apps.HEALTH_ERROR
 *  - container is not running                    -> apps.HEALTH_DEAD
 *  - manifest has no healthCheckPath              -> apps.HEALTH_HEALTHY
 *  - HTTP probe fails or answers with a status above 403 -> apps.HEALTH_UNHEALTHY
 *  - anything else                                -> apps.HEALTH_HEALTHY
 *
 * A failed probe does not reject; only errors thrown by setHealth() propagate.
 *
 * @param {object} app - app record (reads installationState, runState, manifest, containerId, containerIp, fqdn)
 * @param {object} options
 * @param {number} options.timeout - timeout of the HTTP probe in milliseconds
 * @returns {Promise<void>}
 */
async function checkAppHealth(app, options) {
    assert.strictEqual(typeof app, 'object');
    assert.strictEqual(typeof options, 'object');

    if (app.installationState !== apps.ISTATE_INSTALLED || app.runState !== apps.RSTATE_RUNNING) return;

    const manifest = app.manifest;

    const [error, data] = await safe(docker.inspect(app.containerId));
    if (error || !data || !data.State) return await setHealth(app, apps.HEALTH_ERROR);
    if (data.State.Running !== true) return await setHealth(app, apps.HEALTH_DEAD);

    // non-appstore apps may not have healthCheckPath
    if (!manifest.healthCheckPath) return await setHealth(app, apps.HEALTH_HEALTHY);

    const healthCheckUrl = `http://${app.containerIp}:${manifest.httpPort}${manifest.healthCheckPath}`;
    const [healthCheckError, response] = await safe(superagent
        .get(healthCheckUrl)
        .set('Host', app.fqdn) // required for some apache configs with rewrite rules
        .set('User-Agent', 'Mozilla (CloudronHealth)') // required for some apps (e.g. minio)
        .redirects(0)
        .ok(() => true)
        // run() already hands over milliseconds; scaling by 1000 again made this timeout ~1000x too long
        .timeout(options.timeout));

    // 2xx and 3xx are ok. even 401 and 403 are ok for now (for WP sites)
    const failed = healthCheckError || response.status > 403;
    await setHealth(app, failed ? apps.HEALTH_UNHEALTHY : apps.HEALTH_HEALTHY);
}
2021-08-25 19:41:46 -07:00
/**
 * Works out which app and/or addon a container belongs to.
 *
 * @param {string} containerId - id handed to docker.inspect
 * @returns {Promise<object>} { app } for a container labelled with an appId,
 *   { app, addonName: 'redis' } for a '/redis-<appId>' container,
 *   { addonName } for every other container
 * @throws whatever docker.inspect or apps.get fail with
 */
async function getContainerInfo(containerId) {
    const container = await docker.inspect(containerId);

    // don't get by container id as this can be an exec container
    const labelledAppId = safe.query(container, 'Config.Labels.appId', null);
    if (labelledAppId) return { app: await apps.get(labelledAppId) };

    const redisPrefix = '/redis-';

    // addon . Name has a '/' in the beginning for some reason
    if (!container.Name.startsWith(redisPrefix)) return { addonName: container.Name.slice(1) };

    const redisOwner = await apps.get(container.Name.slice(redisPrefix.length));
    return { app: redisOwner, addonName: 'redis' };
}
2015-09-14 17:20:30 -07:00
/*
OOM can be tested using stress tool like so:
2021-06-23 17:15:37 -07:00
docker run -ti -m 100M cloudron/base:3.0.0 /bin/bash
2015-09-14 17:20:30 -07:00
stress --vm 1 --vm-bytes 200M --vm-hang 0
*/
2021-08-25 19:41:46 -07:00
/**
 * Reads the docker 'oom' events of the last polling interval and adds at most
 * one ACTION_APP_OOM entry per OOM_EVENT_LIMIT to the eventlog.
 *
 * Failures while handling a single event (container lookup, eventlog write)
 * are logged and never escape the stream listener.
 *
 * @param {object} options
 * @param {number} options.intervalSecs - how far back (seconds) to ask docker for events
 * @param {number} options.timeout - milliseconds after which the event stream is destroyed
 * @returns {Promise<void>} resolves once the listeners are attached, not when the stream ends
 * @throws whatever docker.getEvents fails with
 */
async function processDockerEvents(options) {
    assert.strictEqual(typeof options, 'object');

    // one clock read so that 'since' and 'until' describe the same window
    const nowSecs = Date.now() / 1000;
    const since = (nowSecs - options.intervalSecs).toFixed(0);
    const until = (nowSecs - 1).toFixed(0);

    const stream = await docker.getEvents({ since: since, until: until, filters: JSON.stringify({ event: [ 'oom' ] }) });
    stream.setEncoding('utf8');
    stream.on('data', async function (data) {
        // this is actually ldjson; each chunk is parsed as one JSON document and skipped if that yields nothing
        const event = safe.JSON.parse(data);
        if (!event) return;

        const containerId = String(event.id);

        // the lookup may fail (info is then unusable) or may find no app; fall back to the container id
        const [error, info] = await safe(getContainerInfo(containerId));
        const app = (!error && info?.app) || null;
        const addonName = (!error && info?.addonName) || null;
        const program = addonName || app?.fqdn || containerId;

        const now = Date.now();

        // do not send mails for dev apps
        const notifyUser = !app?.debugMode && ((now - gLastOomMailTime) > OOM_EVENT_LIMIT);

        debug(`OOM ${program} notifyUser: ${notifyUser}. lastOomTime: ${gLastOomMailTime} (now: ${now})`);

        if (!notifyUser) return;

        // nothing awaits this listener, so a rejection here would be an unhandled rejection
        const [addError] = await safe(eventlog.add(eventlog.ACTION_APP_OOM, AuditSource.HEALTH_MONITOR, { event, containerId, addonName, app }));
        if (addError) {
            debug(`Error adding OOM event of ${program} to eventlog`, addError);
            return;
        }

        gLastOomMailTime = now;
    });

    stream.on('error', function (error) {
        debug('Error reading docker events', error);
    });

    stream.on('end', function () {
        // debug('Event stream ended');
    });

    // safety hatch if 'until' doesn't work (there are cases where docker is working with a different time)
    setTimeout(stream.destroy.bind(stream), options.timeout); // https://github.com/apocas/dockerode/issues/179
}
2021-08-25 19:41:46 -07:00
/**
 * Runs checkAppHealth() for every app in parallel and logs a summary.
 *
 * A rejected health check does not abort the others; it is logged instead of
 * being dropped silently.
 *
 * @param {object} options - handed unchanged to checkAppHealth()
 * @returns {Promise<void>}
 * @throws whatever apps.list fails with
 */
async function processApp(options) {
    assert.strictEqual(typeof options, 'object');

    const allApps = await apps.list();

    // start healthcheck in parallel and wait for all promises to finish
    const outcomes = await Promise.allSettled(allApps.map((app) => checkAppHealth(app, options)));

    outcomes.forEach((outcome, idx) => {
        if (outcome.status !== 'rejected') return;

        const failedApp = allApps[idx];
        debug(`processApp: health check of ${failedApp.id} (${failedApp.fqdn}) failed`, outcome.reason);
    });

    const alive = allApps
        .filter(function (a) { return a.installationState === apps.ISTATE_INSTALLED && a.runState === apps.RSTATE_RUNNING && a.health === apps.HEALTH_HEALTHY; });

    debug(`app health: ${alive.length} alive / ${allApps.length - alive.length} dead.`);
}
2015-09-14 11:02:06 -07:00
2021-08-25 19:41:46 -07:00
/**
 * One pass of the health monitor: checks all apps, then looks for docker OOM events.
 * Does nothing when constants.TEST is set.
 *
 * @param {number} intervalSecs - seconds between two invocations; the last 3 seconds are set aside for reading docker events
 * @returns {Promise<void>}
 */
async function run(intervalSecs) {
    assert.strictEqual(typeof intervalSecs, 'number');

    if (constants.TEST) return;

    // remember when monitoring began; only the first pass sets it
    gStartTime = gStartTime || new Date();

    const eventsBudgetSecs = 3;
    const healthCheckOptions = { timeout: (intervalSecs - eventsBudgetSecs) * 1000 };
    const dockerEventOptions = { intervalSecs, timeout: eventsBudgetSecs * 1000 };

    await processApp(healthCheckOptions);
    await processDockerEvents(dockerEventOptions);
}