'use strict';

exports = module.exports = {
    getSystem,
    getSystemStream,
    getContainer,
    getContainerStream,
    sendToGraphite
};

const apps = require('./apps.js'),
    assert = require('assert'),
    BoxError = require('./boxerror.js'),
    constants = require('./constants.js'),
    debug = require('debug')('box:metrics'),
    docker = require('./docker.js'),
    fs = require('fs'),
    net = require('net'),
    os = require('os'),
    path = require('path'),
    { Readable } = require('stream'),
    safe = require('safetydance'),
    services = require('./services.js'),
    superagent = require('./superagent.js');

// Flattens a raw docker stats object into the metrics record used by this module:
// { ts, networkRead, networkWrite, blockRead, blockWrite, memoryUsed, memoryMax, cpuUsageMsecs, systemUsageMsecs }
function translateContainerStatsSync(stats) {
    assert.strictEqual(typeof stats, 'object');

    // in host mode (turn), networks is missing. also tolerate a container without an eth0 entry
    const networkRead = stats.networks?.eth0?.rx_bytes ?? 0;
    const networkWrite = stats.networks?.eth0?.tx_bytes ?? 0;

    const memoryUsed = stats.memory_stats.usage;
    const memoryMax = stats.memory_stats.limit;

    // the list can be null when docker has not recorded any block io for the container
    const blkioStats = stats.blkio_stats.io_service_bytes_recursive ?? [];
    // compare the op case-insensitively, the capitalization ('Read' vs 'read') depends on the cgroup version of the host
    const sumBytes = (op) => blkioStats
        .filter((entry) => String(entry.op).toLowerCase() === op)
        .reduce((sum, entry) => sum + entry.value, 0);
    const blockRead = sumBytes('read');
    const blockWrite = sumBytes('write');

    const cpuUsageMsecs = stats.cpu_stats.cpu_usage.total_usage / 1e6; // convert from nano to msecs (to match system metrics)
    const systemUsageMsecs = stats.cpu_stats.system_cpu_usage / 1e6;

    return { ts: new Date(stats.read), networkRead, networkWrite, blockRead, blockWrite, memoryUsed, memoryMax, cpuUsageMsecs, systemUsageMsecs };
}

// Collects one stats sample for the addon containers, every app container and every per-app redis container.
// Returns an object mapping the container name to its translated metrics. Missing or stopped containers are left out.
async function readContainerMetrics() {
    const containerNames = [ 'turn', 'mail', 'mongodb', 'mysql', 'postgresql' ];

    for (const app of await apps.list()) {
        // NOTE(review): metrics get keyed by containerId here, while getSystem() queries graphite by app.id - confirm these are the same value
        if (app.containerId) containerNames.push(app.containerId); // containerId can be null if app is installing
        if (app.manifest.addons?.redis && app.enableRedis) containerNames.push(`redis-${app.id}`);
    }

    const metrics = {};
    for (const containerName of containerNames) {
        const [error, stats] = await safe(docker.getStats(containerName, { stream: false }));
        // the container is missing or stopped. better not to inspect and check State since a race is possible
        if (error || !stats?.memory_stats || Object.keys(stats.memory_stats).length === 0) continue;

        metrics[containerName] = translateContainerStatsSync(stats);
    }

    return metrics;
}

// Reads /proc/meminfo and returns { memoryUsed, swapUsed } in bytes.
// memoryUsed is MemTotal - MemFree - Buffers - Cached. Throws BoxError.EXTERNAL_ERROR when a field is missing.
async function readMemoryMetrics() {
    const output = await fs.promises.readFile('/proc/meminfo', { encoding: 'utf8' });

    // the values are multiplied by 1024 to get bytes. null means that the field was not found
    const readBytes = (field) => {
        const match = output.match(new RegExp(`^${field}:\\s+(\\d+)`, 'm'));
        return match ? parseInt(match[1], 10) * 1024 : null;
    };

    const memory = [ 'MemTotal', 'MemFree', 'Buffers', 'Cached' ].map(readBytes);
    if (memory.includes(null)) throw new BoxError(BoxError.EXTERNAL_ERROR, 'Could not find memory used');
    const [memoryTotal, memoryFree, buffers, cached] = memory;
    const memoryUsed = memoryTotal - memoryFree - buffers - cached;

    const swap = [ 'SwapTotal', 'SwapFree' ].map(readBytes);
    if (swap.includes(null)) throw new BoxError(BoxError.EXTERNAL_ERROR, 'Could not find swap used');
    const [swapTotal, swapFree] = swap;
    const swapUsed = swapTotal - swapFree;

    return { memoryUsed, swapUsed };
}

// Returns { userMsecs, sysMsecs } summed over all cpus
async function readCpuMetrics() {
    const cpus = os.cpus();

    // the initial value keeps reduce from throwing should the cpu list ever be empty
    const userMsecs = cpus.reduce((total, cpu) => total + cpu.times.user, 0);
    const sysMsecs = cpus.reduce((total, cpu) => total + cpu.times.sys, 0);

    return { userMsecs, sysMsecs }; // these values are the times spent since system start
}

// Returns { blockRead, blockWrite } in bytes for the block device that backs the root mount.
// Throws BoxError.EXTERNAL_ERROR when the root mount or its /proc/diskstats row cannot be found.
async function readDiskMetrics() {
    const mounts = await fs.promises.readFile('/proc/mounts', { encoding: 'utf8' });
    const rootLine = mounts.split('\n').find(line => line.split(' ')[1] === '/');
    if (!rootLine) throw new BoxError(BoxError.EXTERNAL_ERROR, 'Root mount not found');

    const devicePath = rootLine.split(' ')[0]; // e.g., "/dev/sda1"
    const base = path.basename(devicePath); // remove /dev/
    const match = base.match(/^(.*?)(p?[0-9]+)?$/); // strip the partition suffix: sda1 -> sda, nvme0n1p1 -> nvme0n1
    const blockDevice = match ? match[1] : base;
    if (!blockDevice) throw new BoxError(BoxError.EXTERNAL_ERROR, 'Could not find root block device name');

    const diskstats = await fs.promises.readFile('/proc/diskstats', { encoding: 'utf8' });
    const rows = diskstats.split('\n').map(line => line.trim().split(/\s+/));
    // the device name is the third column. stripping the suffix mangles names like nvme0n1 or md0, so fall back to the full name
    const parts = rows.find(row => row[2] === blockDevice) || rows.find(row => row[2] === base);
    if (!parts) throw new BoxError(BoxError.EXTERNAL_ERROR, 'Could not get disk stats');

    const sectorsRead = parseInt(parts[5], 10); // field 6 . one sector is 512 bytes
    const sectorsWrite = parseInt(parts[9], 10); // field 10

    const blockRead = sectorsRead * 512;
    const blockWrite = sectorsWrite * 512;

    return { blockRead, blockWrite };
}

// Returns { networkRead, networkWrite } byte counters of the interface that carries the default route.
// Throws BoxError.EXTERNAL_ERROR when no default route is listed in /proc/net/route.
async function readNetworkMetrics() {
    const contents = await fs.promises.readFile('/proc/net/route', { encoding: 'utf8' });
    const routes = contents.trim().split('\n').slice(1).map(line => line.split(/\s+/)); // skip header

    const defaultRoute = routes.find(([, destination]) => destination === '00000000'); // default route
    const defaultIface = defaultRoute ? defaultRoute[0] : null;
    if (!defaultIface) throw new BoxError(BoxError.EXTERNAL_ERROR, 'Could not detect default interface');

    const [rx, tx] = await Promise.all([
        fs.promises.readFile(`/sys/class/net/${defaultIface}/statistics/rx_bytes`, { encoding: 'utf8' }),
        fs.promises.readFile(`/sys/class/net/${defaultIface}/statistics/tx_bytes`, { encoding: 'utf8' })
    ]);

    return {
        networkRead: parseInt(rx.trim(), 10),
        networkWrite: parseInt(tx.trim(), 10)
    };
}

// Returns { memoryUsed, swapUsed, userMsecs, sysMsecs, blockRead, blockWrite, networkRead, networkWrite }
async function readSystemMetrics() {
    // the readers do not depend on each other, so run them concurrently
    const [memoryMetrics, cpuMetrics, diskMetrics, networkMetrics] = await Promise.all([
        readMemoryMetrics(),
        readCpuMetrics(),
        readDiskMetrics(),
        readNetworkMetrics()
    ]);

    return { ...memoryMetrics, ...cpuMetrics, ...diskMetrics, ...networkMetrics };
}

// Samples the system and container metrics once and writes them to graphite as "path value timestamp" lines.
// Reading the system metrics can throw. Socket errors are only logged and the returned promise still resolves.
async function sendToGraphite() {
    // debug('sendStatsToGraphite: collecting stats');

    const result = await readSystemMetrics();

    const graphiteMetrics = [
        { path: 'cloudron.system.memory-used', value: result.memoryUsed },
        { path: 'cloudron.system.swap-used', value: result.swapUsed },
        { path: 'cloudron.system.cpu-user', value: result.userMsecs },
        { path: 'cloudron.system.cpu-sys', value: result.sysMsecs },
        { path: 'cloudron.system.blockio-read', value: result.blockRead },
        { path: 'cloudron.system.blockio-write', value: result.blockWrite },
        { path: 'cloudron.system.network-read', value: result.networkRead },
        { path: 'cloudron.system.network-write', value: result.networkWrite }
    ];

    const dockerMetrics = await readContainerMetrics();
    for (const [name, value] of Object.entries(dockerMetrics)) {
        graphiteMetrics.push(
            { path: `cloudron.container-${name}.network-read`, value: value.networkRead },
            { path: `cloudron.container-${name}.network-write`, value: value.networkWrite },
            { path: `cloudron.container-${name}.blockio-read`, value: value.blockRead },
            { path: `cloudron.container-${name}.blockio-write`, value: value.blockWrite },
            { path: `cloudron.container-${name}.memory-used`, value: value.memoryUsed },
            { path: `cloudron.container-${name}.memory-max`, value: value.memoryMax },
            { path: `cloudron.container-${name}.cpu-usage`, value: value.cpuUsageMsecs },
        );
    }

    return new Promise((resolve) => {
        const client = new net.Socket();

        client.connect(constants.GRAPHITE_PORT, '127.0.0.1', () => {
            debug('connected to graphite');

            const now = Math.floor(Date.now() / 1000);
            for (const metric of graphiteMetrics) {
                if (!Number.isFinite(metric.value)) continue; // do not send "undefined" or "NaN" as a value
                client.write(`${metric.path} ${metric.value} ${now}\n`);
            }
            client.end();
        });

        client.on('error', (error) => {
            debug(error);
            resolve();
        });

        client.on('end', () => {
            debug('sent to graphite');
            resolve();
        });
    });
}

// for testing locally: curl
//   'http://${graphite-ip}:8000/graphite-web/render?format=json&from=-1min&target=absolute(collectd.localhost.du-docker.capacity-usage)'
// the datapoint is (value, timestamp) https://graphite.readthedocs.io/en/latest/

// Returns the render url of the graphite container.
// Throws BoxError.INACTIVE when the container does not exist or has no IP in the cloudron network.
async function getGraphiteUrl() {
    const [error, result] = await safe(docker.inspect('graphite'));
    // callers use the return value as a url, so a missing container has to be an error and not a status object
    if (error && error.reason === BoxError.NOT_FOUND) throw new BoxError(BoxError.INACTIVE, 'graphite service is not running');
    if (error) throw error;

    const ip = safe.query(result, 'NetworkSettings.Networks.cloudron.IPAddress', null);
    if (!ip) throw new BoxError(BoxError.INACTIVE, 'Error getting IP of graphite service');

    return `http://${ip}:8000/graphite-web/render`;
}

// Runs one render query per target, one after the other, and returns the datapoints of each target in the same order.
// A target without any series yields []. Throws BoxError.NETWORK_ERROR / BoxError.EXTERNAL_ERROR on failed requests.
async function queryGraphiteTargets(graphiteUrl, targets, options) {
    const { fromSecs, noNullPoints } = options;

    const results = [];
    for (const target of targets) {
        const query = {
            target,
            format: 'json',
            from: `-${fromSecs}s`,
            until: 'now',
            noNullPoints: !!noNullPoints
        };

        const [error, response] = await safe(superagent.get(graphiteUrl).query(query).timeout(30 * 1000).ok(() => true));
        if (error) throw new BoxError(BoxError.NETWORK_ERROR, error);
        if (response.status !== 200) throw new BoxError(BoxError.EXTERNAL_ERROR, `Unknown error with ${target}: ${response.status} ${response.text}`);

        results.push(response.body?.[0]?.datapoints || []);
    }

    return results;
}

// datapoints is [[value, ts], ...]. returns the value of the first datapoint or 0 when there is none (or it is null)
function firstDatapointValue(datapoints) {
    return datapoints[0]?.[0] || 0;
}

// Returns the graphite history of a single container. options is { fromSecs, intervalSecs, noNullPoints }.
// cpu, memory and the *Rate fields are datapoints [[value, ts], ...], the *Total fields are plain numbers.
async function getContainer(name, options) {
    assert.strictEqual(typeof name, 'string');
    assert.strictEqual(typeof options, 'object');

    const { intervalSecs } = options;
    const graphiteUrl = await getGraphiteUrl();
    const prefix = `cloudron.container-${name}`;

    const targets = [
        // perSecond is nonNegativeDerivative over time . this value is the cpu usage in msecs .
        // (cpu usage msecs) / (cpus * 1000) is the percent but over all cpus. times 100 is the percent.
        // but the y-scale is cpus times 100. so, we only need to scale by 0.1
        `scale(perSecond(${prefix}.cpu-usage),0.1)`,
        `summarize(${prefix}.memory-used, "${intervalSecs}s", "avg")`,
        // get the rate in interval window
        `summarize(perSecond(${prefix}.blockio-read), "${intervalSecs}s", "avg")`,
        `summarize(perSecond(${prefix}.blockio-write), "${intervalSecs}s", "avg")`,
        `summarize(perSecond(${prefix}.network-read), "${intervalSecs}s", "avg")`,
        `summarize(perSecond(${prefix}.network-write), "${intervalSecs}s", "avg")`,
        // just get the max in interval window for absolute numbers
        `summarize(${prefix}.blockio-read, "${intervalSecs}s", "max")`,
        `summarize(${prefix}.blockio-write, "${intervalSecs}s", "max")`,
        `summarize(${prefix}.network-read, "${intervalSecs}s", "max")`,
        `summarize(${prefix}.network-write, "${intervalSecs}s", "max")`,
    ];

    // results are datapoints[[value, ts], [value, ts], ...];
    const [cpu, memory, blockReadRate, blockWriteRate, networkReadRate, networkWriteRate,
        blockReadMax, blockWriteMax, networkReadMax, networkWriteMax] = await queryGraphiteTargets(graphiteUrl, targets, options);

    return {
        cpu,
        memory,
        blockReadRate,
        blockWriteRate,
        networkReadRate,
        networkWriteRate,
        blockReadTotal: firstDatapointValue(blockReadMax),
        blockWriteTotal: firstDatapointValue(blockWriteMax),
        networkReadTotal: firstDatapointValue(networkReadMax),
        networkWriteTotal: firstDatapointValue(networkWriteMax),
    };
}

// Returns the graphite history of the host. options is { fromSecs, intervalSecs, noNullPoints }.
// Same shape as getContainer() plus a swap series.
async function readSystemFromGraphite(options) {
    assert.strictEqual(typeof options, 'object');

    const { intervalSecs } = options;
    const graphiteUrl = await getGraphiteUrl();

    // example: curl 'http://172.18.30.5:8000/graphite-web/render?target=cloudron.system.cpu-user&target=cloudron.system.cpu-sys&format=json&from=-1min&until=now&noNullPoints=false' | python3 -m json.tool
    const targets = [
        // perSecond is nonNegativeDerivative over time . this value is the cpu usage in msecs .
        // (cpu usage msecs) / (cpus * 1000) is the percent but over all cpus. times 100 is the percent.
        // but the y-scale is cpus times 100. so, we only need to scale by 0.1
        'scale(perSecond(sumSeries(cloudron.system.cpu-user,cloudron.system.cpu-sys)),0.1)',
        `summarize(cloudron.system.memory-used, "${intervalSecs}s", "avg")`,
        `summarize(cloudron.system.swap-used, "${intervalSecs}s", "avg")`,
        // get the rate in interval window
        `summarize(perSecond(cloudron.system.blockio-read), "${intervalSecs}s", "avg")`,
        `summarize(perSecond(cloudron.system.blockio-write), "${intervalSecs}s", "avg")`,
        `summarize(perSecond(cloudron.system.network-read), "${intervalSecs}s", "avg")`,
        `summarize(perSecond(cloudron.system.network-write), "${intervalSecs}s", "avg")`,
        // just get the max in interval window for absolute numbers
        `summarize(cloudron.system.blockio-read, "${intervalSecs}s", "max")`,
        `summarize(cloudron.system.blockio-write, "${intervalSecs}s", "max")`,
        `summarize(cloudron.system.network-read, "${intervalSecs}s", "max")`,
        `summarize(cloudron.system.network-write, "${intervalSecs}s", "max")`,
    ];

    const [cpu, memory, swap, blockReadRate, blockWriteRate, networkReadRate, networkWriteRate,
        blockReadMax, blockWriteMax, networkReadMax, networkWriteMax] = await queryGraphiteTargets(graphiteUrl, targets, options);

    return {
        cpu,
        memory,
        swap,
        blockReadRate,
        blockWriteRate,
        networkReadRate,
        networkWriteRate,
        blockReadTotal: firstDatapointValue(blockReadMax),
        blockWriteTotal: firstDatapointValue(blockWriteMax),
        networkReadTotal: firstDatapointValue(networkReadMax),
        networkWriteTotal: firstDatapointValue(networkWriteMax),
    };
}

// CPU: stress --cpu 2 --timeout 60
// Returns the system history together with the history of every app (keyed by app id) and service (keyed by service id)
async function getSystem(options) {
    assert.strictEqual(typeof options, 'object');

    const systemStats = await readSystemFromGraphite(options);

    const appStats = {};
    for (const app of await apps.list()) {
        appStats[app.id] = await getContainer(app.id, options);
    }

    const serviceStats = {};
    for (const serviceId of await services.listServices()) {
        serviceStats[serviceId] = await getContainer(serviceId, options);
    }

    return {
        ...systemStats, // { cpu, memory, swap, block{Read,Write}{Rate,Total}, network{Read,Write}{Rate,Total} }
        apps: appStats,
        services: serviceStats,
    };
}

// Returns a Readable that emits one JSON string with the live system metrics every options.intervalMsecs (default 5000).
// The rates of the very first sample are null. The stream is destroyed with the error when reading the metrics fails.
async function getSystemStream(options) {
    assert.strictEqual(typeof options, 'object');

    const intervalMsecs = options.intervalMsecs || 5000;
    let intervalId = null;
    let oldMetrics = null;
    let oldMsecs = null;

    const metricsStream = new Readable({
        read(/*size*/) { /* ignored, we push via interval */ },
        destroy(error, callback) {
            clearInterval(intervalId);
            callback(error);
        }
    });

    intervalId = setInterval(async () => {
        const [error, metrics] = await safe(readSystemMetrics());
        if (metricsStream.destroyed) return; // the consumer went away while this sample was being read
        if (error) return metricsStream.destroy(error);

        // use the time that really elapsed. the timer can fire late and reading the metrics takes time as well
        const nowMsecs = Date.now();
        const gapMsecs = oldMetrics ? nowMsecs - oldMsecs : 0;
        const hasGap = gapMsecs > 0;
        const perSecond = (current, previous) => (current - previous) / (gapMsecs / 1000);

        const cpuPercent = hasGap ? (metrics.userMsecs + metrics.sysMsecs - oldMetrics.userMsecs - oldMetrics.sysMsecs) * 100 / gapMsecs : null;
        const blockReadRate = hasGap ? perSecond(metrics.blockRead, oldMetrics.blockRead) : null;
        const blockWriteRate = hasGap ? perSecond(metrics.blockWrite, oldMetrics.blockWrite) : null;
        const networkReadRate = hasGap ? perSecond(metrics.networkRead, oldMetrics.networkRead) : null;
        const networkWriteRate = hasGap ? perSecond(metrics.networkWrite, oldMetrics.networkWrite) : null;

        oldMetrics = metrics;
        oldMsecs = nowMsecs;

        const nowSecs = nowMsecs / 1000; // to match graphite return value
        metricsStream.push(JSON.stringify({
            cpu: [ cpuPercent, nowSecs ],
            memory: [ metrics.memoryUsed, nowSecs ],
            swap: [ metrics.swapUsed, nowSecs ],

            blockReadRate: [ blockReadRate, nowSecs ],
            blockWriteRate: [ blockWriteRate, nowSecs ],
            blockReadTotal: metrics.blockRead,
            blockWriteTotal: metrics.blockWrite,

            networkReadRate: [ networkReadRate, nowSecs ],
            networkWriteRate: [ networkWriteRate, nowSecs ],
            networkReadTotal: metrics.networkRead,
            networkWriteTotal: metrics.networkWrite,
        }));
    }, intervalMsecs);

    return metricsStream;
}

// Returns a Readable that emits one JSON string per stats sample that docker streams for the container.
// The rates of the very first sample are null. Throws when the stats stream of the container cannot be opened.
async function getContainerStream(name, options) {
    assert.strictEqual(typeof name, 'string');
    assert.strictEqual(typeof options, 'object');

    // open the docker stream first, the destroy handler below needs it
    const [error, statsStream] = await safe(docker.getStats(name, { stream: true }));
    if (error) throw new Error(`Container stopped or missing: ${error.message}`);

    const metricsStream = new Readable({
        read(/*size*/) { /* ignored, we push whenever docker delivers a sample */ },
        destroy(error, callback) {
            statsStream.destroy(); // double destroy is a no-op
            callback(error);
        }
    });

    let oldMetrics = null;
    let pending = ''; // data received so far that did not form a complete JSON object yet

    const tryParse = (text) => {
        try {
            return JSON.parse(text);
        } catch {
            return undefined;
        }
    };

    const pushSample = (stats) => {
        const metrics = translateContainerStatsSync(stats);
        const { ts, networkRead, networkWrite, blockRead, blockWrite, memoryUsed, cpuUsageMsecs } = metrics;

        const gap = oldMetrics ? (ts - oldMetrics.ts) : null;
        const hasGap = gap > 0; // false for the first sample and for samples with an identical or invalid timestamp
        const perSecond = (current, previous) => (current - previous) / (gap / 1000);

        const cpuPercent = hasGap ? (cpuUsageMsecs - oldMetrics.cpuUsageMsecs) * 100 / gap : null;
        const blockReadRate = hasGap ? perSecond(blockRead, oldMetrics.blockRead) : null;
        const blockWriteRate = hasGap ? perSecond(blockWrite, oldMetrics.blockWrite) : null;
        const networkReadRate = hasGap ? perSecond(networkRead, oldMetrics.networkRead) : null;
        const networkWriteRate = hasGap ? perSecond(networkWrite, oldMetrics.networkWrite) : null;

        oldMetrics = metrics;

        const nowSecs = ts.getTime() / 1000; // to match graphite return value
        metricsStream.push(JSON.stringify({
            cpu: [ cpuPercent, nowSecs ],
            memory: [ memoryUsed, nowSecs ],

            blockReadRate: [ blockReadRate, nowSecs ],
            blockWriteRate: [ blockWriteRate, nowSecs ],
            blockReadTotal: metrics.blockRead,
            blockWriteTotal: metrics.blockWrite,

            networkReadRate: [ networkReadRate, nowSecs ],
            networkWriteRate: [ networkWriteRate, nowSecs ],
            networkReadTotal: metrics.networkRead,
            networkWriteTotal: metrics.networkWrite,
        }));
    };

    statsStream.on('error', (error) => metricsStream.destroy(error)); // double destroy is a no-op

    statsStream.on('data', (data) => {
        // a chunk is not guaranteed to hold exactly one JSON object, so collect the data and handle it line by line.
        // an exception in here would be uncaught, so every failure destroys the stream instead
        try {
            pending += data.toString('utf8');
            const lines = pending.split('\n');
            pending = lines.pop(); // the piece after the last newline may be incomplete

            for (const line of lines) {
                if (line.trim() === '') continue;
                const stats = tryParse(line);
                if (stats === undefined) throw new BoxError(BoxError.EXTERNAL_ERROR, `Could not parse stats of ${name}`);
                pushSample(stats);
            }

            // an object that arrived without a trailing newline is handled right away
            if (pending.trim() !== '') {
                const stats = tryParse(pending);
                if (stats !== undefined) {
                    pending = '';
                    pushSample(stats);
                }
            }
        } catch (error) {
            metricsStream.destroy(error);
        }
    });

    return metricsStream;
}