diff --git a/src/metrics.js b/src/metrics.js
index 61d971c98..b4cb8602a 100644
--- a/src/metrics.js
+++ b/src/metrics.js
@@ -5,6 +5,7 @@
 exports = module.exports = {
     getSystemStream,
     getContainer,
+    getContainerStream,
 
     sendToGraphite
 };
@@ -23,7 +24,28 @@ const apps = require('./apps.js'),
     services = require('./services.js'),
     superagent = require('./superagent.js');
 
-async function getDockerMetrics() {
+async function readContainerMetric(name) {
+    assert.strictEqual(typeof name, 'string');
+
+    const stats = await docker.getStats(name);
+    if (Object.keys(stats.memory_stats).length === 0) return null; // the container is stopped. better not to inspect and check State since a race is possible
+
+    const networkRead = stats.networks ? stats.networks.eth0.rx_bytes : 0; // in host mode (turn), networks is missing
+    const networkWrite = stats.networks ? stats.networks.eth0.tx_bytes : 0; // in host mode (turn), networks is missing
+
+    const memoryUsed = stats.memory_stats.usage;
+    const memoryMax = stats.memory_stats.limit;
+
+    const blkioStats = stats.blkio_stats.io_service_bytes_recursive;
+    const blockRead = blkioStats.filter(entry => entry.op === 'read').reduce((sum, entry) => sum + entry.value, 0);
+    const blockWrite = blkioStats.filter(entry => entry.op === 'write').reduce((sum, entry) => sum + entry.value, 0);
+
+    const cpuUsageMsecs = stats.cpu_stats.cpu_usage.total_usage / 1000; // convert to msecs (to match system metrics)
+
+    return { networkRead, networkWrite, blockRead, blockWrite, memoryUsed, memoryMax, cpuUsageMsecs };
+}
+
+async function readContainerMetrics() {
     const allAddons = [ 'turn', 'mail', 'mongodb', 'mysql', 'postgresql' ];
     const containerNames = allAddons;
 
@@ -34,28 +56,14 @@
     const metrics = {};
 
     for (const containerName of containerNames) {
-        const stats = await docker.getStats(containerName);
-        if (Object.keys(stats.memory_stats).length === 0) continue; // the container is stopped. better not to inspect and check State since a race is possible
-
-        const networkRead = stats.networks ? stats.networks.eth0.rx_bytes : 0; // in host mode (turn), networks is missing
-        const networkWrite = stats.networks ? stats.networks.eth0.tx_bytes : 0; // in host mode (turn), networks is missing
-
-        const memUsed = stats.memory_stats.usage;
-        const memMax = stats.memory_stats.limit;
-
-        const blkioStats = stats.blkio_stats.io_service_bytes_recursive;
-        const blockRead = blkioStats.filter(entry => entry.op === 'read').reduce((sum, entry) => sum + entry.value, 0);
-        const blockWrite = blkioStats.filter(entry => entry.op === 'write').reduce((sum, entry) => sum + entry.value, 0);
-
-        const cpuUsage = stats.cpu_stats.cpu_usage.total_usage / 1000; // convert to msecs (to match system metrics)
-
-        metrics[containerName] = { networkRead, networkWrite, blockRead, blockWrite, memUsed, memMax, cpuUsage };
+        const stats = await readContainerMetric(containerName);
+        if (stats) metrics[containerName] = stats;
     }
 
     return metrics;
 }
 
-async function getMemoryMetrics() {
+async function readMemoryMetrics() {
     const output = await fs.promises.readFile('/proc/meminfo', { encoding: 'utf8' });
 
     const totalMemoryMatch = output.match(/^MemTotal:\s+(\d+)/m);
@@ -80,7 +88,7 @@
     };
 }
 
-async function getCpuMetrics() {
+async function readCpuMetrics() {
     const cpus = os.cpus();
     const userMsecs = cpus.map(c => c.times.user).reduce((p, c) => p+c);
     const sysMsecs = cpus.map(c => c.times.sys).reduce((p, c) => p+c);
@@ -93,24 +101,24 @@ async function sendToGraphite() {
 
     const graphiteMetrics = [];
 
-    const memoryMetrics = await getMemoryMetrics();
+    const memoryMetrics = await readMemoryMetrics();
     graphiteMetrics.push({ path: `cloudron.system.memory-used`, value: memoryMetrics.memoryUsed });
     graphiteMetrics.push({ path: `cloudron.system.swap-used`, value: memoryMetrics.swapUsed });
 
-    const cpuMetrics = await getCpuMetrics();
+    const cpuMetrics = await readCpuMetrics();
     graphiteMetrics.push({ path: `cloudron.system.cpu-user`, value: cpuMetrics.userMsecs });
     graphiteMetrics.push({ path: `cloudron.system.cpu-sys`, value: cpuMetrics.sysMsecs });
 
-    const dockerMetrics = await getDockerMetrics();
+    const dockerMetrics = await readContainerMetrics();
     for (const [name, value] of Object.entries(dockerMetrics)) {
         graphiteMetrics.push(
             { path: `cloudron.container-${name}.network-read`, value: value.networkRead },
             { path: `cloudron.container-${name}.network-write`, value: value.networkWrite },
             { path: `cloudron.container-${name}.blockio-read`, value: value.blockRead },
             { path: `cloudron.container-${name}.blockio-write`, value: value.blockWrite },
-            { path: `cloudron.container-${name}.mem-used`, value: value.memUsed },
-            { path: `cloudron.container-${name}.mem-max`, value: value.memMax },
-            { path: `cloudron.container-${name}.cpu-usage`, value: value.cpuUsage },
+            { path: `cloudron.container-${name}.memory-used`, value: value.memoryUsed },
+            { path: `cloudron.container-${name}.memory-max`, value: value.memoryMax },
+            { path: `cloudron.container-${name}.cpu-usage`, value: value.cpuUsageMsecs },
         );
     }
 
@@ -165,11 +173,15 @@ async function getContainer(name, options) {
         // (cpu usage msecs) / (cpus * 1000) is the percent but over all cpus. times 100 is the percent.
         // but the y-scale is cpus times 100. so, we only need to scale by 0.1
         `scale(perSecond(cloudron.container-${name}.cpu-usage),0.1)`,
-        `summarize(cloudron.container-${name}.mem-used, "${intervalSecs}s", "avg")`,
-        `summarize(cloudron.container-${name}.blockio-read, "${intervalSecs}s", "sum")`,
-        `summarize(cloudron.container-${name}.blockio-write, "${intervalSecs}s", "sum")`,
-        `summarize(cloudron.container-${name}.network-read, "${intervalSecs}s", "sum")`,
-        `summarize(cloudron.container-${name}.network-write, "${intervalSecs}s", "sum")`,
+        `summarize(cloudron.container-${name}.memory-used, "${intervalSecs}s", "avg")`,
+
+        // get the rate in interval window
+        `summarize(perSecond(cloudron.container-${name}.blockio-read), "${intervalSecs}s", "avg")`,
+        `summarize(perSecond(cloudron.container-${name}.blockio-write), "${intervalSecs}s", "avg")`,
+        `summarize(perSecond(cloudron.container-${name}.network-read), "${intervalSecs}s", "avg")`,
+        `summarize(perSecond(cloudron.container-${name}.network-write), "${intervalSecs}s", "avg")`,
+
+        // just get the max in interval window for absolute numbers
        `summarize(cloudron.container-${name}.blockio-read, "${intervalSecs}s", "max")`,
         `summarize(cloudron.container-${name}.blockio-write, "${intervalSecs}s", "max")`,
         `summarize(cloudron.container-${name}.network-read, "${intervalSecs}s", "max")`,
@@ -292,8 +304,11 @@ async function getSystemStream(options) {
     });
 
     intervalId = setInterval(async () => {
-        const memoryMetrics = await getMemoryMetrics();
-        const cpuMetrics = await getCpuMetrics();
+        const [memoryResult, cpuResult] = await Promise.allSettled([ readMemoryMetrics(), readCpuMetrics() ]); // never throws
+        if (memoryResult.status !== 'fulfilled' || cpuResult.status !== 'fulfilled') return metricsStream.destroy(memoryResult.reason || cpuResult.reason);
+
+        const memoryMetrics = memoryResult.value;
+        const cpuMetrics = cpuResult.value;
 
         const cpuPercent = oldCpuMetrics ? (cpuMetrics.userMsecs + cpuMetrics.sysMsecs - oldCpuMetrics.userMsecs - oldCpuMetrics.sysMsecs) * 0.1 / (INTERVAL_MSECS/1000) : null;
         oldCpuMetrics = cpuMetrics;
@@ -308,3 +323,54 @@
 
     return metricsStream;
 }
+
+async function getContainerStream(name, options) {
+    assert.strictEqual(typeof name, 'string');
+    assert.strictEqual(typeof options, 'object');
+
+    const INTERVAL_MSECS = options.intervalMsecs || 5000;
+    let intervalId = null, oldMetrics = null;
+
+    const metricsStream = new Readable({
+        read(/*size*/) { /* ignored, we push via interval */ },
+        destroy(error, callback) {
+            clearInterval(intervalId);
+            callback(error);
+        }
+    });
+
+    intervalId = setInterval(async () => {
+        const [error, metrics] = await safe(readContainerMetric(name));
+        if (error || !metrics) return metricsStream.destroy(error); // metrics is null when the container is stopped
+
+        const { networkRead, networkWrite, blockRead, blockWrite, memoryUsed, cpuUsageMsecs } = metrics;
+
+        const cpuPercent = oldMetrics ? (cpuUsageMsecs - oldMetrics.cpuUsageMsecs) * 0.1 / (INTERVAL_MSECS/1000) : null;
+        const blockReadRate = oldMetrics ? (blockRead - oldMetrics.blockRead) / (INTERVAL_MSECS/1000) : null;
+        const blockWriteRate = oldMetrics ? (blockWrite - oldMetrics.blockWrite) / (INTERVAL_MSECS/1000) : null;
+        const networkReadRate = oldMetrics ? (networkRead - oldMetrics.networkRead) / (INTERVAL_MSECS/1000) : null;
+        const networkWriteRate = oldMetrics ? (networkWrite - oldMetrics.networkWrite) / (INTERVAL_MSECS/1000) : null;
+
+        oldMetrics = metrics;
+
+        const now = Date.now() / 1000;
+        metricsStream.push(JSON.stringify({
+            cpu: [ cpuPercent, now ],
+            memory: [ memoryUsed, now ],
+
+            blockRead: [ blockReadRate, now ],
+            blockWrite: [ blockWriteRate, now ],
+
+            networkRead: [ networkReadRate, now ],
+            networkWrite: [ networkWriteRate, now ],
+
+            blockReadTotal: [ blockRead, now ],
+            blockWriteTotal: [ blockWrite, now ],
+            networkReadTotal: [ networkRead, now ],
+            networkWriteTotal: [ networkWrite, now ],
+            cpuCount: os.cpus().length
+        }));
+    }, INTERVAL_MSECS);
+
+    return metricsStream;
+}
diff --git a/src/routes/apps.js b/src/routes/apps.js
index f697e852e..e8e5350ae 100644
--- a/src/routes/apps.js
+++ b/src/routes/apps.js
@@ -66,6 +66,7 @@
 exports = module.exports = {
     downloadBackup,
     getMetrics,
+    getMetricStream,
 
     load
 };
@@ -1080,3 +1081,30 @@
 
     next(new HttpSuccess(200, result));
 }
+
+async function getMetricStream(req, res, next) {
+    if (req.headers.accept !== 'text/event-stream') return next(new HttpError(400, 'This API call requires EventStream'));
+
+    const intervalMsecs = typeof req.query.intervalMsecs !== 'undefined' ? parseInt(req.query.intervalMsecs, 10) : 5000;
+    if (!intervalMsecs || intervalMsecs < 100) return next(new HttpError(400, 'intervalMsecs query param must be at least 100'));
+
+    const [error, metricStream] = await safe(metrics.getContainerStream(req.resources.app.id, { intervalMsecs }));
+    if (error) return next(BoxError.toHttpError(error));
+
+    res.writeHead(200, {
+        'Content-Type': 'text/event-stream',
+        'Cache-Control': 'no-cache',
+        'Connection': 'keep-alive',
+        'X-Accel-Buffering': 'no', // disable nginx buffering
+        'Access-Control-Allow-Origin': '*'
+    });
+    res.write('retry: 3000\n');
+    res.on('close', () => metricStream.destroy());
+    metricStream.on('data', function (data) {
+        const obj = JSON.parse(data);
+        const sse = `data: ${JSON.stringify(obj)}\n\n`;
+        res.write(sse);
+    });
+    metricStream.on('end', res.end.bind(res));
+    metricStream.on('error', res.end.bind(res, null));
+}
diff --git a/src/server.js b/src/server.js
index 7a3ae6bbe..f3f2a4b77 100644
--- a/src/server.js
+++ b/src/server.js
@@ -300,6 +300,7 @@ async function initializeExpressSync() {
     router.get ('/api/v1/apps/:id/eventlog', token, routes.apps.load, authorizeOperator, routes.apps.listEventlog);
     router.get ('/api/v1/apps/:id/task', token, routes.apps.load, authorizeOperator, routes.apps.getTask);
     router.get ('/api/v1/apps/:id/metrics', token, routes.apps.load, authorizeOperator, routes.apps.getMetrics);
+    router.get ('/api/v1/apps/:id/metricstream', token, routes.apps.load, authorizeOperator, routes.apps.getMetricStream);
     router.post('/api/v1/apps/:id/clone', json, token, routes.apps.load, authorizeAdmin, routes.apps.clone);
     router.use ('/api/v1/apps/:id/files/*filepath', token, routes.apps.load, authorizeOperator, routes.filemanager.proxy('app'));
     router.post('/api/v1/apps/:id/exec', json, token, routes.apps.load, authorizeOperator, routes.apps.createExec);