Better OOM notification messages

This commit is contained in:
Girish Ramakrishnan
2019-03-06 11:54:37 -08:00
parent 916ca87db4
commit b5a4121574
5 changed files with 72 additions and 36 deletions
+24 -13
View File
@@ -6,8 +6,9 @@ var appdb = require('./appdb.js'),
async = require('async'),
DatabaseError = require('./databaseerror.js'),
debug = require('debug')('box:apphealthmonitor'),
docker = require('./docker.js').connection,
docker = require('./docker.js'),
eventlog = require('./eventlog.js'),
safe = require('safetydance'),
superagent = require('superagent'),
util = require('util');
@@ -77,11 +78,10 @@ function checkAppHealth(app, callback) {
return callback(null);
}
var container = docker.getContainer(app.containerId),
manifest = app.manifest;
const manifest = app.manifest;
container.inspect(function (err, data) {
if (err || !data || !data.State) {
docker.inspect(app.containerId, function (error, data) {
if (error || !data || !data.State) {
debugApp(app, 'Error inspecting container');
return setHealth(app, appdb.HEALTH_ERROR, callback);
}
@@ -116,6 +116,18 @@ function checkAppHealth(app, callback) {
});
}
// Resolves a container id to either the app that owns it or, for containers
// without an appId label, a minimal addon descriptor.
//
// callback(error)                      - the container could not be inspected
// callback(null, null, { name })       - no appId label on the container (addon)
// callback(error, app)                 - whatever apps.get() reports for the labelled appId
function getContainerInfo(containerId, callback) {
    docker.inspect(containerId, function (error, result) {
        if (error) return callback(error);

        const appId = safe.query(result, 'Config.Labels.appId', null);

        if (!appId) { // addon
            // Docker reports container names with a leading '/' (e.g. '/mysql'); strip it so that
            // notification titles and mail subjects show the bare name. Fall back to the container id
            // when the inspect data carries no usable name, so callers always get a string.
            const rawName = safe.query(result, 'Name', null);
            const name = (typeof rawName === 'string' && rawName.length > 0) ? rawName.replace(/^\//, '') : containerId;

            return callback(null, null /* app */, { name: name });
        }

        apps.get(appId, callback); // don't get by container id as this can be an exec container
    });
}
/*
OOM can be tested using stress tool like so:
docker run -ti -m 100M cloudron/base:0.10.0 /bin/bash
@@ -134,21 +146,20 @@ function processDockerEvents(intervalSecs, callback) {
stream.setEncoding('utf8');
stream.on('data', function (data) {
var ev = JSON.parse(data);
var containerId = ev.id;
appdb.getByContainerId(containerId, function (error, app) { // this can error for addons
var program = error || !app.id ? containerId : `app-${app.id}`;
var now = Date.now();
const event = JSON.parse(data);
const containerId = String(event.id);
getContainerInfo(containerId, function (error, app, addon) {
const program = error ? containerId : (app ? app.fqdn : addon.name);
const now = Date.now();
const notifyUser = (!app || !app.debugMode) && (now - gLastOomMailTime > OOM_MAIL_LIMIT);
debug('OOM %s notifyUser: %s. lastOomTime: %s (now: %s)', program, notifyUser, gLastOomMailTime, now, ev);
debug('OOM %s notifyUser: %s. lastOomTime: %s (now: %s)', program, notifyUser, gLastOomMailTime, now);
// do not send mails for dev apps
if (notifyUser) {
// app can be null for addon containers
eventlog.add(eventlog.ACTION_APP_OOM, AUDIT_SOURCE, { ev: ev, containerId: containerId, app: app || null });
eventlog.add(eventlog.ACTION_APP_OOM, AUDIT_SOURCE, { event: event, containerId: containerId, addon: addon || null, app: app || null });
gLastOomMailTime = now;
}
+14
View File
@@ -22,6 +22,7 @@ exports = module.exports = {
getContainerIdByIp: getContainerIdByIp,
inspect: inspect,
inspectByName: inspect,
getEvents: getEvents,
memoryUsage: memoryUsage,
execContainer: execContainer,
createVolume: createVolume,
@@ -474,6 +475,19 @@ function inspect(containerId, callback) {
});
}
// Requests an event stream from the docker connection.
//
// options  - object handed through unchanged to the connection's getEvents()
// callback - called as callback(null, stream) on success, or with a
//            DockerError (INTERNAL_ERROR) wrapping the underlying error
function getEvents(options, callback) {
    assert.strictEqual(typeof options, 'object');
    assert.strictEqual(typeof callback, 'function');

    const connection = exports.connection;

    connection.getEvents(options, (dockerError, eventStream) => {
        if (!dockerError) {
            callback(null, eventStream);
            return;
        }

        return callback(new DockerError(DockerError.INTERNAL_ERROR, dockerError));
    });
}
function memoryUsage(containerId, callback) {
assert.strictEqual(typeof containerId, 'string');
assert.strictEqual(typeof callback, 'function');
+7 -8
View File
@@ -2,22 +2,21 @@
Dear <%= cloudronName %> Admin,
<%= program %> exited unexpectedly using too much memory!
<%= program %> has been restarted now as it ran out of memory.
The app has been restarted now. Should this message appear repeatedly or
undefined behavior is observed, give the app more memory.
This can be done in the advanced settings in the app configuration dialog
in your Cloudron's web interface.
Should this message appear repeatedly or undefined behavior is observed, give the app more memory.
Please see some excerpt of the logs below.
* To increase an app's memory limit - https://cloudron.io/documentation/apps/#increasing-the-memory-limit-of-an-app
* To increase a service's memory limit - https://cloudron.io/documentation/troubleshooting/#services
Out of memory event:
-------------------------------------
<%- context %>
<%- event %>
-------------------------------------
Powered by https://cloudron.io
Sent at: <%= new Date().toUTCString() %>
+4 -4
View File
@@ -481,10 +481,10 @@ function certificateRenewalError(domain, message) {
});
}
function oomEvent(mailTo, program, context) {
function oomEvent(mailTo, program, event) {
assert.strictEqual(typeof mailTo, 'string');
assert.strictEqual(typeof program, 'string');
assert.strictEqual(typeof context, 'string');
assert.strictEqual(typeof event, 'object');
getMailConfig(function (error, mailConfig) {
if (error) return debug('Error getting mail details:', error);
@@ -492,8 +492,8 @@ function oomEvent(mailTo, program, context) {
var mailOptions = {
from: mailConfig.notificationFrom,
to: mailTo,
subject: util.format('[%s] %s exited unexpectedly', mailConfig.cloudronName, program),
text: render('oom_event.ejs', { cloudronName: mailConfig.cloudronName, program: program, context: context, format: 'text' })
subject: util.format('[%s] %s was restarted (OOM)', mailConfig.cloudronName, program),
text: render('oom_event.ejs', { cloudronName: mailConfig.cloudronName, program: program, event: event, format: 'text' })
};
sendMails([ mailOptions ]);
+23 -11
View File
@@ -180,23 +180,35 @@ function adminChanged(performedBy, eventId, user, callback) {
}, callback);
}
function oomEvent(eventId, program, context, callback) {
function oomEvent(eventId, app, addon, containerId, event, callback) {
assert.strictEqual(typeof eventId, 'string');
assert.strictEqual(typeof program, 'string');
assert.strictEqual(typeof context, 'object');
assert.strictEqual(typeof app, 'object');
assert.strictEqual(typeof addon, 'object');
assert.strictEqual(typeof containerId, 'string');
assert.strictEqual(typeof callback, 'function');
let title, message, program;
if (app) {
program = app.fqdn;
title = `The application ${app.fqdn} (${app.manifest.title}) ran out of memory.`;
message = 'The application has been restarted automatically. If you see this notification often, consider increasing the [memory limit](https://cloudron.io/documentation/apps/#increasing-the-memory-limit-of-an-app)';
} else if (addon) {
program = addon.name;
title = `The ${addon.name} service ran out of memory`;
message = 'The service has been restarted automatically. If you see this notification often, consider increasing the [memory limit](https://cloudron.io/documentation/troubleshooting/#services)';
} else {
program = containerId;
title = `The container ${containerId} ran out of memory`;
message = 'The container has been restarted automatically. Consider increasing the [memory limit](https://docs.docker.com/v17.09/edge/engine/reference/commandline/update/#update-a-containers-kernel-memory-constraints)';
}
// also send us a notification mail
if (config.provider() === 'caas') mailer.oomEvent('support@cloudron.io', program, JSON.stringify(context, null, 4));
if (config.provider() === 'caas') mailer.oomEvent('support@cloudron.io', program, event);
actionForAllAdmins([], function (admin, done) {
mailer.oomEvent(admin.email, program, JSON.stringify(context, null, 4));
mailer.oomEvent(admin.email, program, event);
var message;
if (context.app) message = `The application ${context.app.manifest.title} with id ${context.app.id} ran out of memory.`;
else message = `The container with id ${context.details.id} ran out of memory`;
add(admin.id, eventId, 'Process died out-of-memory', message, done);
add(admin.id, eventId, title, message, done);
}, callback);
}
@@ -351,7 +363,7 @@ function onEvent(id, action, source, data, callback) {
case eventlog.ACTION_USER_ADD: return userAdded(source.userId, id, data.user, callback);
case eventlog.ACTION_USER_REMOVE: return userRemoved(source.userId, id, data.user, callback);
case eventlog.ACTION_USER_UPDATE: return data.adminStatusChanged ? adminChanged(source.userId, id, data.user, callback) : callback();
case eventlog.ACTION_APP_OOM: return oomEvent(id, data.app ? data.app.id : data.containerId, { app: data.app, details: data }, callback);
case eventlog.ACTION_APP_OOM: return oomEvent(id, data.app, data.addon, data.containerId, data.event, callback);
case eventlog.ACTION_APP_DOWN: return appDied(id, data.app, callback);
case eventlog.ACTION_APP_UP: return appUp(id, data.app, callback);
case eventlog.ACTION_APP_TASK_CRASH: return apptaskCrash(id, data.appId, data.crashLogFile, callback);