'use strict';

const apps = require('./apps.js'),
    assert = require('assert'),
    AuditSource = require('./auditsource.js'),
    BoxError = require('./boxerror.js'),
    constants = require('./constants.js'),
    debug = require('debug')('box:apphealthmonitor'),
    docker = require('./docker.js'),
    eventlog = require('./eventlog.js'),
    safe = require('safetydance'),
    superagent = require('superagent');

exports = module.exports = {
    run
};

const UNHEALTHY_THRESHOLD = 20 * 60 * 1000; // 20 minutes

const OOM_EVENT_LIMIT = 60 * 60 * 1000; // will only raise 1 oom event every hour
let gStartTime = null; // time when apphealthmonitor was started
let gLastOomMailTime = Date.now() - (5 * 60 * 1000); // pretend we sent email 5 minutes ago

// Records a new health value for the app, both in the database (apps.setHealth) and on the passed app object.
//   app    - app object; reads id, fqdn, health, healthTime and debugMode, updates health and healthTime
//   health - one of the apps.HEALTH_* strings
// Resolves without persisting anything while a non-healthy app is still inside UNHEALTHY_THRESHOLD.
// Rejects with any apps.setHealth error other than BoxError.NOT_FOUND.
async function setHealth(app, health) {
    assert.strictEqual(typeof app, 'object');
    assert.strictEqual(typeof health, 'string');

    // app starts out with null health
    // if it became healthy, we update immediately. this is required for ui to say "running" etc
    // if it became unhealthy/error/dead, wait for a threshold before updating db

    const now = new Date(), lastHealth = app.health;
    let healthTime = gStartTime > app.healthTime ? gStartTime : app.healthTime; // on box restart, clamp value to start time

    if (health === apps.HEALTH_HEALTHY) {
        healthTime = now;
        if (lastHealth && lastHealth !== apps.HEALTH_HEALTHY) { // app starts out with null health
            debug(`setHealth: ${app.id} (${app.fqdn}) switched from ${lastHealth} to healthy`);

            // do not send mails for dev apps
            if (!app.debugMode) await eventlog.add(eventlog.ACTION_APP_UP, AuditSource.HEALTH_MONITOR, { app: app });
        }
    } else if (Math.abs(now - healthTime) > UNHEALTHY_THRESHOLD) {
        // healthTime is left untouched here, so it keeps pointing at the last time the app was seen healthy
        if (lastHealth === apps.HEALTH_HEALTHY) {
            debug(`setHealth: marking ${app.id} (${app.fqdn}) as unhealthy since not seen for more than ${UNHEALTHY_THRESHOLD/(60 * 1000)} minutes`);

            // do not send mails for dev apps
            if (!app.debugMode) await eventlog.add(eventlog.ACTION_APP_DOWN, AuditSource.HEALTH_MONITOR, { app: app });
        }
    } else {
        debug(`setHealth: ${app.id} (${app.fqdn}) waiting for ${(UNHEALTHY_THRESHOLD - Math.abs(now - healthTime))/1000} to update health`);
        return;
    }

    const [error] = await safe(apps.setHealth(app.id, health, healthTime));
    if (error && error.reason === BoxError.NOT_FOUND) return; // app uninstalled?
    if (error) throw error;

    app.health = health;
    app.healthTime = healthTime;
}

// Determines the health of a single app and records it via setHealth().
//   app     - app object; only apps that are installed and running are checked
//   options - { timeout }: HTTP health check timeout in milliseconds
// A failed health check does not reject; the returned promise rejects only when setHealth() does.
async function checkAppHealth(app, options) {
    assert.strictEqual(typeof app, 'object');
    assert.strictEqual(typeof options, 'object');

    if (app.installationState !== apps.ISTATE_INSTALLED || app.runState !== apps.RSTATE_RUNNING) return;

    const manifest = app.manifest;

    const [error, data] = await safe(docker.inspect(app.containerId));
    if (error || !data || !data.State) return await setHealth(app, apps.HEALTH_ERROR);
    if (data.State.Running !== true) return await setHealth(app, apps.HEALTH_DEAD);

    // non-appstore apps may not have healthCheckPath
    if (!manifest.healthCheckPath) return await setHealth(app, apps.HEALTH_HEALTHY);

    const healthCheckUrl = `http://${app.containerIp}:${manifest.httpPort}${manifest.healthCheckPath}`;

    // options.timeout is already in milliseconds (run() passes (intervalSecs - 3) * 1000), so it is handed
    // to superagent unchanged. scaling it by 1000 again made the timeout ~1000x longer than the check interval
    const [healthCheckError, response] = await safe(superagent
        .get(healthCheckUrl)
        .set('Host', app.fqdn) // required for some apache configs with rewrite rules
        .set('User-Agent', 'Mozilla (CloudronHealth)') // required for some apps (e.g. minio)
        .redirects(0)
        .ok(() => true)
        .timeout(options.timeout));

    if (healthCheckError) {
        await setHealth(app, apps.HEALTH_UNHEALTHY);
    } else if (response.status > 403) { // 2xx and 3xx are ok. even 401 and 403 are ok for now (for WP sites)
        await setHealth(app, apps.HEALTH_UNHEALTHY);
    } else {
        await setHealth(app, apps.HEALTH_HEALTHY);
    }
}

// Maps a docker container id to the app and/or addon it belongs to.
// Resolves to { app }, { app, addonName: 'redis' } or { addonName }.
// Rejects if docker.inspect or apps.get rejects.
async function getContainerInfo(containerId) {
    const result = await docker.inspect(containerId);

    const appId = safe.query(result, 'Config.Labels.appId', null);

    if (appId) return { app: await apps.get(appId) }; // don't get by container id as this can be an exec container

    if (result.Name.startsWith('/redis-')) {
        // the remainder of the container name after '/redis-' is used as the app id
        return { app: await apps.get(result.Name.slice('/redis-'.length)), addonName: 'redis' };
    } else {
        return { addonName: result.Name.slice(1) }; // addon. Name has a '/' in the beginning for some reason
    }
}

/*
    OOM can be tested using stress tool like so:
        docker run -ti -m 100M cloudron/base:3.0.0 /bin/bash
        stress --vm 1 --vm-bytes 200M --vm-hang 0
*/
// Reads docker 'oom' events of the last options.intervalSecs seconds and adds an ACTION_APP_OOM eventlog
// entry, at most once every OOM_EVENT_LIMIT and never for apps in debug mode.
//   options - { intervalSecs, timeout }: timeout is in milliseconds, after which the event stream is destroyed
// The returned promise resolves once the listeners are attached; events are processed afterwards.
async function processDockerEvents(options) {
    assert.strictEqual(typeof options, 'object');

    const since = ((new Date().getTime() / 1000) - options.intervalSecs).toFixed(0);
    const until = ((new Date().getTime() / 1000) - 1).toFixed(0);

    const stream = await docker.getEvents({ since: since, until: until, filters: JSON.stringify({ event: [ 'oom' ] }) });
    stream.setEncoding('utf8');
    stream.on('data', async function (data) { // this is actually ldjson, we only process the first line for now
        const event = safe.JSON.parse(data);
        if (!event) return;
        const containerId = String(event.id);

        // info is undefined when the lookup failed and info.app may be missing. nothing awaits this listener,
        // so a throw in here becomes an unhandled rejection. hence all accesses below are null-safe
        const [error, info] = await safe(getContainerInfo(containerId));
        const program = error ? containerId : (info?.addonName || info?.app?.fqdn || containerId);
        const now = Date.now();

        // do not send mails for dev apps
        const notifyUser = !info?.app?.debugMode && ((now - gLastOomMailTime) > OOM_EVENT_LIMIT);

        debug(`OOM ${program} notifyUser: ${notifyUser}. lastOomTime: ${gLastOomMailTime} (now: ${now})`);

        if (!notifyUser) return;

        const [addError] = await safe(eventlog.add(eventlog.ACTION_APP_OOM, AuditSource.HEALTH_MONITOR, { event, containerId, addonName: info?.addonName || null, app: info?.app || null }));
        if (addError) return debug(`Error adding OOM event for ${program}`, addError); // keep gLastOomMailTime, so the next oom event is tried again

        gLastOomMailTime = now;
    });

    stream.on('error', function (error) {
        debug('Error reading docker events', error);
    });

    stream.on('end', function () {
        // debug('Event stream ended');
    });

    // safety hatch if 'until' doesn't work (there are cases where docker is working with a different time)
    setTimeout(stream.destroy.bind(stream), options.timeout); // https://github.com/apocas/dockerode/issues/179
}

// Runs the health check of all apps in parallel and logs a summary.
//   options - passed through to checkAppHealth ({ timeout } in milliseconds)
// Failures of individual checks are ignored (Promise.allSettled); rejects only if apps.list() rejects.
async function processApp(options) {
    assert.strictEqual(typeof options, 'object');

    const allApps = await apps.list();

    const healthChecks = allApps.map((app) => checkAppHealth(app, options)); // start healthcheck in parallel

    await Promise.allSettled(healthChecks); // wait for all promises to finish

    // checkAppHealth/setHealth update app.health on these very objects
    const alive = allApps
        .filter(function (a) { return a.installationState === apps.ISTATE_INSTALLED && a.runState === apps.RSTATE_RUNNING && a.health === apps.HEALTH_HEALTHY; });

    // note: "dead" counts every app that is not installed + running + healthy
    debug(`app health: ${alive.length} alive / ${allApps.length - alive.length} dead.`);
}

// Entry point: one health monitor pass. Does nothing when constants.TEST is set.
//   intervalSecs - number of seconds between invocations; the health checks get all but 3 seconds of it
//                  and the docker event stream is cut off after 3 seconds
async function run(intervalSecs) {
    assert.strictEqual(typeof intervalSecs, 'number');

    if (constants.TEST) return;

    if (!gStartTime) gStartTime = new Date();

    await processApp({ timeout: (intervalSecs - 3) * 1000 });
    await processDockerEvents({ intervalSecs, timeout: 3000 });
}