Better OOM notification messages
This commit is contained in:
+24
-13
@@ -6,8 +6,9 @@ var appdb = require('./appdb.js'),
|
||||
async = require('async'),
|
||||
DatabaseError = require('./databaseerror.js'),
|
||||
debug = require('debug')('box:apphealthmonitor'),
|
||||
docker = require('./docker.js').connection,
|
||||
docker = require('./docker.js'),
|
||||
eventlog = require('./eventlog.js'),
|
||||
safe = require('safetydance'),
|
||||
superagent = require('superagent'),
|
||||
util = require('util');
|
||||
|
||||
@@ -77,11 +78,10 @@ function checkAppHealth(app, callback) {
|
||||
return callback(null);
|
||||
}
|
||||
|
||||
var container = docker.getContainer(app.containerId),
|
||||
manifest = app.manifest;
|
||||
const manifest = app.manifest;
|
||||
|
||||
container.inspect(function (err, data) {
|
||||
if (err || !data || !data.State) {
|
||||
docker.inspect(app.containerId, function (error, data) {
|
||||
if (error || !data || !data.State) {
|
||||
debugApp(app, 'Error inspecting container');
|
||||
return setHealth(app, appdb.HEALTH_ERROR, callback);
|
||||
}
|
||||
@@ -116,6 +116,18 @@ function checkAppHealth(app, callback) {
|
||||
});
|
||||
}
|
||||
|
||||
/**
 * Resolve a docker container id to the app or addon (service) it belongs to.
 *
 * The container is inspected and its `Config.Labels.appId` label is looked up:
 *  - when the label is present, the app is fetched via `apps.get` and the
 *    callback receives whatever `apps.get` reports (error, app);
 *  - when it is absent, the container is treated as an addon and the callback
 *    receives `(null, null, { name })`.
 *
 * @param {string} containerId - id of the container to look up
 * @param {function} callback - called as (error, app, addon)
 */
function getContainerInfo(containerId, callback) {
    docker.inspect(containerId, function (error, result) {
        if (error) return callback(error);

        const appId = safe.query(result, 'Config.Labels.appId', null);

        if (!appId) {
            // Docker reports container names with a leading '/' (e.g. "/mysql"); strip it so
            // that user-facing messages read "mysql". Fall back to the id if there is no name.
            const rawName = String(safe.query(result, 'Name', '') || containerId);

            return callback(null, null /* app */, { name: rawName.replace(/^\//, '') }); // addon
        }

        apps.get(appId, callback); // don't get by container id as this can be an exec container
    });
}
|
||||
|
||||
/*
|
||||
OOM can be tested using stress tool like so:
|
||||
docker run -ti -m 100M cloudron/base:0.10.0 /bin/bash
|
||||
@@ -134,21 +146,20 @@ function processDockerEvents(intervalSecs, callback) {
|
||||
|
||||
stream.setEncoding('utf8');
|
||||
stream.on('data', function (data) {
|
||||
var ev = JSON.parse(data);
|
||||
var containerId = ev.id;
|
||||
|
||||
appdb.getByContainerId(containerId, function (error, app) { // this can error for addons
|
||||
var program = error || !app.id ? containerId : `app-${app.id}`;
|
||||
var now = Date.now();
|
||||
const event = JSON.parse(data);
|
||||
const containerId = String(event.id);
|
||||
|
||||
getContainerInfo(containerId, function (error, app, addon) {
|
||||
const program = error ? containerId : (app ? app.fqdn : addon.name);
|
||||
const now = Date.now();
|
||||
const notifyUser = (!app || !app.debugMode) && (now - gLastOomMailTime > OOM_MAIL_LIMIT);
|
||||
|
||||
debug('OOM %s notifyUser: %s. lastOomTime: %s (now: %s)', program, notifyUser, gLastOomMailTime, now, ev);
|
||||
debug('OOM %s notifyUser: %s. lastOomTime: %s (now: %s)', program, notifyUser, gLastOomMailTime, now);
|
||||
|
||||
// do not send mails for dev apps
|
||||
if (notifyUser) {
|
||||
// app can be null for addon containers
|
||||
eventlog.add(eventlog.ACTION_APP_OOM, AUDIT_SOURCE, { ev: ev, containerId: containerId, app: app || null });
|
||||
eventlog.add(eventlog.ACTION_APP_OOM, AUDIT_SOURCE, { event: event, containerId: containerId, addon: addon || null, app: app || null });
|
||||
|
||||
gLastOomMailTime = now;
|
||||
}
|
||||
|
||||
@@ -22,6 +22,7 @@ exports = module.exports = {
|
||||
getContainerIdByIp: getContainerIdByIp,
|
||||
inspect: inspect,
|
||||
inspectByName: inspect,
|
||||
getEvents: getEvents,
|
||||
memoryUsage: memoryUsage,
|
||||
execContainer: execContainer,
|
||||
createVolume: createVolume,
|
||||
@@ -474,6 +475,19 @@ function inspect(containerId, callback) {
|
||||
});
|
||||
}
|
||||
|
||||
/**
 * Request an event stream from the docker connection.
 *
 * @param {object} options - passed through unchanged to the connection's getEvents()
 * @param {function} callback - called as (error, stream); a failure reported by the
 *                              connection is wrapped in a DockerError with INTERNAL_ERROR
 */
function getEvents(options, callback) {
    assert.strictEqual(typeof options, 'object');
    assert.strictEqual(typeof callback, 'function');

    const onEvents = (error, stream) => {
        if (error) return callback(new DockerError(DockerError.INTERNAL_ERROR, error));

        callback(null, stream);
    };

    exports.connection.getEvents(options, onEvents);
}
|
||||
|
||||
function memoryUsage(containerId, callback) {
|
||||
assert.strictEqual(typeof containerId, 'string');
|
||||
assert.strictEqual(typeof callback, 'function');
|
||||
|
||||
@@ -2,22 +2,21 @@
|
||||
|
||||
Dear <%= cloudronName %> Admin,
|
||||
|
||||
<%= program %> exited unexpectedly using too much memory!
|
||||
<%= program %> has been restarted now as it ran out of memory.
|
||||
|
||||
The app has been restarted now. Should this message appear repeatedly or
|
||||
undefined behavior is observed, give the app more memory.
|
||||
This can be done in the advanced settings in the app configuration dialog
|
||||
in your Cloudron's web interface.
|
||||
Should this message appear repeatedly or undefined behavior is observed, give the app more memory.
|
||||
|
||||
Please see some excerpt of the logs below.
|
||||
* To increase an app's memory limit - https://cloudron.io/documentation/apps/#increasing-the-memory-limit-of-an-app
|
||||
* To increase a service's memory limit - https://cloudron.io/documentation/troubleshooting/#services
|
||||
|
||||
Out of memory event:
|
||||
|
||||
-------------------------------------
|
||||
|
||||
<%# The mailer passes `event` as an object; serialize it so the mail does not show "[object Object]" -%>
<%- typeof event === 'string' ? event : JSON.stringify(event, null, 4) %>
|
||||
|
||||
-------------------------------------
|
||||
|
||||
|
||||
Powered by https://cloudron.io
|
||||
|
||||
Sent at: <%= new Date().toUTCString() %>
|
||||
|
||||
+4
-4
@@ -481,10 +481,10 @@ function certificateRenewalError(domain, message) {
|
||||
});
|
||||
}
|
||||
|
||||
function oomEvent(mailTo, program, context) {
|
||||
function oomEvent(mailTo, program, event) {
|
||||
assert.strictEqual(typeof mailTo, 'string');
|
||||
assert.strictEqual(typeof program, 'string');
|
||||
assert.strictEqual(typeof context, 'string');
|
||||
assert.strictEqual(typeof event, 'object');
|
||||
|
||||
getMailConfig(function (error, mailConfig) {
|
||||
if (error) return debug('Error getting mail details:', error);
|
||||
@@ -492,8 +492,8 @@ function oomEvent(mailTo, program, context) {
|
||||
var mailOptions = {
|
||||
from: mailConfig.notificationFrom,
|
||||
to: mailTo,
|
||||
subject: util.format('[%s] %s exited unexpectedly', mailConfig.cloudronName, program),
|
||||
text: render('oom_event.ejs', { cloudronName: mailConfig.cloudronName, program: program, context: context, format: 'text' })
|
||||
subject: util.format('[%s] %s was restarted (OOM)', mailConfig.cloudronName, program),
|
||||
text: render('oom_event.ejs', { cloudronName: mailConfig.cloudronName, program: program, event: event, format: 'text' })
|
||||
};
|
||||
|
||||
sendMails([ mailOptions ]);
|
||||
|
||||
+23
-11
@@ -180,23 +180,35 @@ function adminChanged(performedBy, eventId, user, callback) {
|
||||
}, callback);
|
||||
}
|
||||
|
||||
/**
 * Notify every admin, by mail and by in-app notification, that a container ran out of memory.
 *
 * The wording depends on what the container belongs to:
 *  - an app (`app` is set): identified by its fqdn and manifest title;
 *  - an addon/service (`addon` is set): identified by the addon name;
 *  - neither: identified by the raw container id.
 *
 * When the provider is 'caas', a mail is additionally sent to support@cloudron.io.
 *
 * @param {string} eventId - id of the eventlog entry that triggered this notification
 * @param {?object} app - the app owning the container, or null
 * @param {?object} addon - the addon descriptor ({ name }), or null
 * @param {string} containerId - id of the container that ran out of memory
 * @param {object} event - the OOM event, forwarded as-is to the mailer
 * @param {function} callback - passed to actionForAllAdmins as its completion callback
 */
function oomEvent(eventId, app, addon, containerId, event, callback) {
    assert.strictEqual(typeof eventId, 'string');
    assert.strictEqual(typeof app, 'object');
    assert.strictEqual(typeof addon, 'object');
    assert.strictEqual(typeof containerId, 'string');
    assert.strictEqual(typeof event, 'object'); // mailer.oomEvent asserts the same; fail early and in one place
    assert.strictEqual(typeof callback, 'function');

    let title, message, program;
    if (app) {
        program = app.fqdn;
        title = `The application ${app.fqdn} (${app.manifest.title}) ran out of memory.`;
        message = 'The application has been restarted automatically. If you see this notification often, consider increasing the [memory limit](https://cloudron.io/documentation/apps/#increasing-the-memory-limit-of-an-app)';
    } else if (addon) {
        program = addon.name;
        title = `The ${addon.name} service ran out of memory`;
        message = 'The service has been restarted automatically. If you see this notification often, consider increasing the [memory limit](https://cloudron.io/documentation/troubleshooting/#services)';
    } else {
        program = containerId;
        title = `The container ${containerId} ran out of memory`;
        message = 'The container has been restarted automatically. Consider increasing the [memory limit](https://docs.docker.com/v17.09/edge/engine/reference/commandline/update/#update-a-containers-kernel-memory-constraints)';
    }

    // also send us a notification mail
    if (config.provider() === 'caas') mailer.oomEvent('support@cloudron.io', program, event);

    actionForAllAdmins([], function (admin, done) {
        mailer.oomEvent(admin.email, program, event);

        add(admin.id, eventId, title, message, done);
    }, callback);
}
|
||||
|
||||
@@ -351,7 +363,7 @@ function onEvent(id, action, source, data, callback) {
|
||||
case eventlog.ACTION_USER_ADD: return userAdded(source.userId, id, data.user, callback);
|
||||
case eventlog.ACTION_USER_REMOVE: return userRemoved(source.userId, id, data.user, callback);
|
||||
case eventlog.ACTION_USER_UPDATE: return data.adminStatusChanged ? adminChanged(source.userId, id, data.user, callback) : callback();
|
||||
case eventlog.ACTION_APP_OOM: return oomEvent(id, data.app ? data.app.id : data.containerId, { app: data.app, details: data }, callback);
|
||||
case eventlog.ACTION_APP_OOM: return oomEvent(id, data.app, data.addon, data.containerId, data.event, callback);
|
||||
case eventlog.ACTION_APP_DOWN: return appDied(id, data.app, callback);
|
||||
case eventlog.ACTION_APP_UP: return appUp(id, data.app, callback);
|
||||
case eventlog.ACTION_APP_TASK_CRASH: return apptaskCrash(id, data.appId, data.crashLogFile, callback);
|
||||
|
||||
Reference in New Issue
Block a user