docker.js and services.js: async'ify
This commit is contained in:
+78
-105
@@ -2,7 +2,6 @@
|
||||
|
||||
const apps = require('./apps.js'),
|
||||
assert = require('assert'),
|
||||
async = require('async'),
|
||||
auditSource = require('./auditsource.js'),
|
||||
BoxError = require('./boxerror.js'),
|
||||
constants = require('./constants.js'),
|
||||
@@ -10,24 +9,21 @@ const apps = require('./apps.js'),
|
||||
docker = require('./docker.js'),
|
||||
eventlog = require('./eventlog.js'),
|
||||
safe = require('safetydance'),
|
||||
superagent = require('superagent'),
|
||||
util = require('util');
|
||||
superagent = require('superagent');
|
||||
|
||||
exports = module.exports = {
|
||||
run
|
||||
};
|
||||
|
||||
const HEALTHCHECK_INTERVAL = 10 * 1000; // every 10 seconds. this needs to be small since the UI makes only healthy apps clickable
|
||||
const UNHEALTHY_THRESHOLD = 20 * 60 * 1000; // 20 minutes
|
||||
|
||||
const OOM_EVENT_LIMIT = 60 * 60 * 1000; // will only raise 1 oom event every hour
|
||||
let gStartTime = null; // time when apphealthmonitor was started
|
||||
let gLastOomMailTime = Date.now() - (5 * 60 * 1000); // pretend we sent email 5 minutes ago
|
||||
|
||||
function setHealth(app, health, callback) {
|
||||
async function setHealth(app, health) {
|
||||
assert.strictEqual(typeof app, 'object');
|
||||
assert.strictEqual(typeof health, 'string');
|
||||
assert.strictEqual(typeof callback, 'function');
|
||||
|
||||
// app starts out with null health
|
||||
// if it became healthy, we update immediately. this is required for ui to say "running" etc
|
||||
@@ -53,68 +49,60 @@ function setHealth(app, health, callback) {
|
||||
}
|
||||
} else {
|
||||
debug(`setHealth: ${app.id} (${app.fqdn}) waiting for ${(UNHEALTHY_THRESHOLD - Math.abs(now - healthTime))/1000} to update health`);
|
||||
return callback(null);
|
||||
return;
|
||||
}
|
||||
|
||||
util.callbackify(apps.setHealth)(app.id, health, healthTime, function (error) {
|
||||
if (error && error.reason === BoxError.NOT_FOUND) return callback(null); // app uninstalled?
|
||||
if (error) return callback(error);
|
||||
const [error] = await safe(apps.setHealth(app.id, health, healthTime));
|
||||
if (error && error.reason === BoxError.NOT_FOUND) return; // app uninstalled?
|
||||
if (error) throw error;
|
||||
|
||||
app.health = health;
|
||||
app.healthTime = healthTime;
|
||||
|
||||
callback(null);
|
||||
});
|
||||
app.health = health;
|
||||
app.healthTime = healthTime;
|
||||
}
|
||||
|
||||
|
||||
// callback is called with error for fatal errors and not if health check failed
|
||||
function checkAppHealth(app, callback) {
|
||||
async function checkAppHealth(app, options) {
|
||||
assert.strictEqual(typeof app, 'object');
|
||||
assert.strictEqual(typeof callback, 'function');
|
||||
assert.strictEqual(typeof options, 'object');
|
||||
|
||||
if (app.installationState !== apps.ISTATE_INSTALLED || app.runState !== apps.RSTATE_RUNNING) {
|
||||
return callback(null);
|
||||
}
|
||||
if (app.installationState !== apps.ISTATE_INSTALLED || app.runState !== apps.RSTATE_RUNNING) return;
|
||||
|
||||
const manifest = app.manifest;
|
||||
|
||||
docker.inspect(app.containerId, function (error, data) {
|
||||
if (error || !data || !data.State) return setHealth(app, apps.HEALTH_ERROR, callback);
|
||||
if (data.State.Running !== true) return setHealth(app, apps.HEALTH_DEAD, callback);
|
||||
const [error, data] = await safe(docker.inspect(app.containerId));
|
||||
if (error || !data || !data.State) return await setHealth(app, apps.HEALTH_ERROR);
|
||||
if (data.State.Running !== true) return await setHealth(app, apps.HEALTH_DEAD);
|
||||
|
||||
// non-appstore apps may not have healthCheckPath
|
||||
if (!manifest.healthCheckPath) return setHealth(app, apps.HEALTH_HEALTHY, callback);
|
||||
// non-appstore apps may not have healthCheckPath
|
||||
if (!manifest.healthCheckPath) return await setHealth(app, apps.HEALTH_HEALTHY);
|
||||
|
||||
const healthCheckUrl = `http://${app.containerIp}:${manifest.httpPort}${manifest.healthCheckPath}`;
|
||||
superagent
|
||||
.get(healthCheckUrl)
|
||||
.set('Host', app.fqdn) // required for some apache configs with rewrite rules
|
||||
.set('User-Agent', 'Mozilla (CloudronHealth)') // required for some apps (e.g. minio)
|
||||
.redirects(0)
|
||||
.timeout(HEALTHCHECK_INTERVAL)
|
||||
.end(function (error, res) {
|
||||
if (error && !error.response) {
|
||||
setHealth(app, apps.HEALTH_UNHEALTHY, callback);
|
||||
} else if (res.statusCode > 403) { // 2xx and 3xx are ok. even 401 and 403 are ok for now (for WP sites)
|
||||
setHealth(app, apps.HEALTH_UNHEALTHY, callback);
|
||||
} else {
|
||||
setHealth(app, apps.HEALTH_HEALTHY, callback);
|
||||
}
|
||||
});
|
||||
});
|
||||
const healthCheckUrl = `http://${app.containerIp}:${manifest.httpPort}${manifest.healthCheckPath}`;
|
||||
const [healthCheckError, response] = await safe(superagent
|
||||
.get(healthCheckUrl)
|
||||
.set('Host', app.fqdn) // required for some apache configs with rewrite rules
|
||||
.set('User-Agent', 'Mozilla (CloudronHealth)') // required for some apps (e.g. minio)
|
||||
.redirects(0)
|
||||
.ok(() => true)
|
||||
.timeout(options.timeout * 1000));
|
||||
|
||||
if (healthCheckError) {
|
||||
await setHealth(app, apps.HEALTH_UNHEALTHY);
|
||||
} else if (response.status > 403) { // 2xx and 3xx are ok. even 401 and 403 are ok for now (for WP sites)
|
||||
await setHealth(app, apps.HEALTH_UNHEALTHY);
|
||||
} else {
|
||||
await setHealth(app, apps.HEALTH_HEALTHY);
|
||||
}
|
||||
}
|
||||
|
||||
function getContainerInfo(containerId, callback) {
|
||||
docker.inspect(containerId, function (error, result) {
|
||||
if (error) return callback(error);
|
||||
async function getContainerInfo(containerId) {
|
||||
const result = await docker.inspect(containerId);
|
||||
|
||||
const appId = safe.query(result, 'Config.Labels.appId', null);
|
||||
const appId = safe.query(result, 'Config.Labels.appId', null);
|
||||
|
||||
if (!appId) return callback(null, null /* app */, { name: result.Name.slice(1) }); // addon . Name has a '/' in the beginning for some reason
|
||||
if (!appId) return { app: null, addon: result.Name.slice(1) }; // addon . Name has a '/' in the beginning for some reason
|
||||
|
||||
util.callbackify(apps.get)(appId, callback); // don't get by container id as this can be an exec container
|
||||
});
|
||||
return await apps.get(appId); // don't get by container id as this can be an exec container
|
||||
}
|
||||
|
||||
/*
|
||||
@@ -122,83 +110,68 @@ function getContainerInfo(containerId, callback) {
|
||||
docker run -ti -m 100M cloudron/base:3.0.0 /bin/bash
|
||||
stress --vm 1 --vm-bytes 200M --vm-hang 0
|
||||
*/
|
||||
function processDockerEvents(intervalSecs, callback) {
|
||||
assert.strictEqual(typeof intervalSecs, 'number');
|
||||
assert.strictEqual(typeof callback, 'function');
|
||||
async function processDockerEvents(options) {
|
||||
assert.strictEqual(typeof options, 'object');
|
||||
|
||||
const since = ((new Date().getTime() / 1000) - intervalSecs).toFixed(0);
|
||||
const since = ((new Date().getTime() / 1000) - options.intervalSecs).toFixed(0);
|
||||
const until = ((new Date().getTime() / 1000) - 1).toFixed(0);
|
||||
|
||||
docker.getEvents({ since: since, until: until, filters: JSON.stringify({ event: [ 'oom' ] }) }, function (error, stream) {
|
||||
if (error) return callback(error);
|
||||
const stream = await docker.getEvents({ since: since, until: until, filters: JSON.stringify({ event: [ 'oom' ] }) });
|
||||
stream.setEncoding('utf8');
|
||||
stream.on('data', async function (data) {
|
||||
const event = JSON.parse(data);
|
||||
const containerId = String(event.id);
|
||||
|
||||
stream.setEncoding('utf8');
|
||||
stream.on('data', function (data) {
|
||||
const event = JSON.parse(data);
|
||||
const containerId = String(event.id);
|
||||
const [error, info] = await safe(getContainerInfo(containerId));
|
||||
const program = error ? containerId : (info.app ? info.app.fqdn : info.addon.name);
|
||||
const now = Date.now();
|
||||
const notifyUser = !(info.app && info.app.debugMode) && ((now - gLastOomMailTime) > OOM_EVENT_LIMIT);
|
||||
|
||||
getContainerInfo(containerId, function (error, app, addon) {
|
||||
const program = error ? containerId : (app ? app.fqdn : addon.name);
|
||||
const now = Date.now();
|
||||
const notifyUser = !(app && app.debugMode) && ((now - gLastOomMailTime) > OOM_EVENT_LIMIT);
|
||||
debug(`OOM ${program} notifyUser: ${notifyUser}. lastOomTime: ${gLastOomMailTime} (now: ${now})`);
|
||||
|
||||
debug(`OOM ${program} notifyUser: ${notifyUser}. lastOomTime: ${gLastOomMailTime} (now: ${now})`);
|
||||
// do not send mails for dev apps
|
||||
if (notifyUser) {
|
||||
// app can be null for addon containers
|
||||
await eventlog.add(eventlog.ACTION_APP_OOM, auditSource.HEALTH_MONITOR, { event, containerId, addon: info.addon || null, app: info.app || null });
|
||||
|
||||
// do not send mails for dev apps
|
||||
if (notifyUser) {
|
||||
// app can be null for addon containers
|
||||
eventlog.add(eventlog.ACTION_APP_OOM, auditSource.HEALTH_MONITOR, { event, containerId, addon: addon || null, app: app || null });
|
||||
|
||||
gLastOomMailTime = now;
|
||||
}
|
||||
});
|
||||
});
|
||||
|
||||
stream.on('error', function (error) {
|
||||
debug('Error reading docker events', error);
|
||||
callback();
|
||||
});
|
||||
|
||||
stream.on('end', callback);
|
||||
|
||||
// safety hatch if 'until' doesn't work (there are cases where docker is working with a different time)
|
||||
setTimeout(stream.destroy.bind(stream), 3000); // https://github.com/apocas/dockerode/issues/179
|
||||
gLastOomMailTime = now;
|
||||
}
|
||||
});
|
||||
|
||||
stream.on('error', function (error) {
|
||||
debug('Error reading docker events', error);
|
||||
});
|
||||
|
||||
stream.on('end', function () {
|
||||
// debug('Event stream ended');
|
||||
});
|
||||
|
||||
// safety hatch if 'until' doesn't work (there are cases where docker is working with a different time)
|
||||
setTimeout(stream.destroy.bind(stream), options.timeout); // https://github.com/apocas/dockerode/issues/179
|
||||
}
|
||||
|
||||
function processApp(callback) {
|
||||
assert.strictEqual(typeof callback, 'function');
|
||||
async function processApp(options) {
|
||||
assert.strictEqual(typeof options, 'object');
|
||||
|
||||
const appsList = util.callbackify(apps.list);
|
||||
const allApps = await apps.list();
|
||||
|
||||
appsList(function (error, allApps) {
|
||||
if (error) return callback(error);
|
||||
const healthChecks = allApps.map((app) => checkAppHealth(app, options)); // start healthcheck in parallel
|
||||
|
||||
async.each(allApps, checkAppHealth, function (error) {
|
||||
const alive = allApps
|
||||
.filter(function (a) { return a.installationState === apps.ISTATE_INSTALLED && a.runState === apps.RSTATE_RUNNING && a.health === apps.HEALTH_HEALTHY; });
|
||||
await Promise.allSettled(healthChecks); // wait for all promises to finish
|
||||
|
||||
debug(`app health: ${alive.length} alive / ${allApps.length - alive.length} dead.` + (error ? ` ${error.reason}` : ''));
|
||||
const alive = allApps
|
||||
.filter(function (a) { return a.installationState === apps.ISTATE_INSTALLED && a.runState === apps.RSTATE_RUNNING && a.health === apps.HEALTH_HEALTHY; });
|
||||
|
||||
callback(null);
|
||||
});
|
||||
});
|
||||
debug(`app health: ${alive.length} alive / ${allApps.length - alive.length} dead.`);
|
||||
}
|
||||
|
||||
function run(intervalSecs, callback) {
|
||||
async function run(intervalSecs) {
|
||||
assert.strictEqual(typeof intervalSecs, 'number');
|
||||
assert.strictEqual(typeof callback, 'function');
|
||||
|
||||
if (constants.TEST) return;
|
||||
|
||||
if (!gStartTime) gStartTime = new Date();
|
||||
|
||||
async.series([
|
||||
processApp, // this is first because docker.getEvents seems to get 'stuck' sometimes
|
||||
processDockerEvents.bind(null, intervalSecs)
|
||||
], function (error) {
|
||||
if (error) debug(`run: could not check app health. ${error.message}`);
|
||||
|
||||
callback();
|
||||
});
|
||||
await processApp({ timeout: (intervalSecs - 3) * 1000 });
|
||||
await processDockerEvents({ intervalSecs, timeout: 3000 });
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user