diff --git a/CHANGES b/CHANGES index b5dd59864..93ea8b1f3 100644 --- a/CHANGES +++ b/CHANGES @@ -2943,4 +2943,5 @@ * sshfs: implement rm via ssh * multiple docker registries * mail: rename delivered -> sent and received -> saved in event log +* graphs: replace collectd with custom collector diff --git a/scripts/init-ubuntu.sh b/scripts/init-ubuntu.sh index 022a4a4d7..e2c8d0c0d 100755 --- a/scripts/init-ubuntu.sh +++ b/scripts/init-ubuntu.sh @@ -133,27 +133,6 @@ apt-get -y --no-upgrade --no-install-recommends install grub2-common sed -e 's/^GRUB_CMDLINE_LINUX="\(.*\)"$/GRUB_CMDLINE_LINUX="\1 cgroup_enable=memory swapaccount=1 panic_on_oops=1 panic=5"/' -i /etc/default/grub update-grub -echo "==> Install collectd" -# without this, libnotify4 will install gnome-shell -apt-get install -y libnotify4 libcurl3-gnutls --no-install-recommends -# https://bugs.launchpad.net/ubuntu/+source/collectd/+bug/1872281 -if [[ "${ubuntu_version}" == "22.04" ]]; then - readonly launchpad="https://launchpad.net/ubuntu/+source/collectd/5.12.0-9/+build/23189375/+files" - cd /tmp && wget -q "${launchpad}/collectd_5.12.0-9_amd64.deb" "${launchpad}/collectd-utils_5.12.0-9_amd64.deb" "${launchpad}/collectd-core_5.12.0-9_amd64.deb" "${launchpad}/libcollectdclient1_5.12.0-9_amd64.deb" - cd /tmp && apt install -y --no-install-recommends ./libcollectdclient1_5.12.0-9_amd64.deb ./collectd-core_5.12.0-9_amd64.deb ./collectd_5.12.0-9_amd64.deb ./collectd-utils_5.12.0-9_amd64.deb && rm -f /tmp/collectd_*.deb - echo -e "\nLD_PRELOAD=/usr/lib/python3.10/config-3.10-x86_64-linux-gnu/libpython3.10.so" >> /etc/default/collectd -else - if ! apt-get install -y --no-install-recommends collectd collectd-utils; then - # FQDNLookup is true in default debian config. The box code has a custom collectd.conf that fixes this - echo "Failed to install collectd, continuing anyway. 
Presumably because of http://mailman.verplant.org/pipermail/collectd/2015-March/006491.html" - fi - - if [[ "${ubuntu_version}" == "20.04" ]]; then - echo -e "\nLD_PRELOAD=/usr/lib/python3.8/config-3.8-x86_64-linux-gnu/libpython3.8.so" >> /etc/default/collectd - fi -fi -sed -e 's/^FQDNLookup true/FQDNLookup false/' -i /etc/collectd/collectd.conf - # some hosts like atlantic install ntp which conflicts with timedatectl. https://serverfault.com/questions/1024770/ubuntu-20-04-time-sync-problems-and-possibly-incorrect-status-information echo "==> Configuring host" sed -e 's/^#NTP=/NTP=0.ubuntu.pool.ntp.org 1.ubuntu.pool.ntp.org 2.ubuntu.pool.ntp.org 3.ubuntu.pool.ntp.org/' -i /etc/systemd/timesyncd.conf diff --git a/setup/start.sh b/setup/start.sh index 04404c7bb..bea433ac0 100755 --- a/setup/start.sh +++ b/setup/start.sh @@ -62,9 +62,9 @@ mkdir -p "${BOX_DATA_DIR}" "${APPS_DATA_DIR}" "${MAIL_DATA_DIR}" # keep these in sync with paths.js log "Ensuring directories" -mkdir -p "${PLATFORM_DATA_DIR}/"{graphite,mysql,postgresql,mongodb,redis,tls,collectd,logrotate.d,acme,backup,update,firewall,sshfs,cifs,oidc,diskusage} +mkdir -p "${PLATFORM_DATA_DIR}/"{graphite,mysql,postgresql,mongodb,redis,tls,logrotate.d,acme,backup,update,firewall,sshfs,cifs,oidc,diskusage} mkdir -p "${PLATFORM_DATA_DIR}/addons/mail/"{banner,dkim} -mkdir -p "${PLATFORM_DATA_DIR}/logs/"{backup,updater,tasks,collectd} +mkdir -p "${PLATFORM_DATA_DIR}/logs/"{backup,updater,tasks} mkdir -p "${PLATFORM_DATA_DIR}/sftp/ssh" # sftp keys # ensure backups folder exists and is writeable @@ -126,11 +126,11 @@ log "Configuring sudoers" rm -f /etc/sudoers.d/${USER} /etc/sudoers.d/cloudron cp "${script_dir}/start/sudoers" /etc/sudoers.d/cloudron -log "Configuring collectd" -rm -rf /etc/collectd /var/log/collectd.log "${PLATFORM_DATA_DIR}/collectd/collectd.conf.d" -ln -sfF "${PLATFORM_DATA_DIR}/collectd" /etc/collectd -cp "${script_dir}/start/collectd/collectd.conf" "${PLATFORM_DATA_DIR}/collectd/collectd.conf" 
-systemctl restart collectd +# can be removed after 9.0 +log "Unconfiguring collectd" +rm -rf "${PLATFORM_DATA_DIR}/collectd" +systemctl disable collectd || true +systemctl stop collectd || true log "Configuring logrotate" if ! grep -q "^include ${PLATFORM_DATA_DIR}/logrotate.d" /etc/logrotate.conf; then @@ -218,7 +218,7 @@ log "Changing ownership" # note, change ownership after db migrate. this allow db migrate to move files around as root and then we can fix it up here # be careful of what is chown'ed here. subdirs like mysql,redis etc are owned by the containers and will stop working if perms change chown -R "${USER}" /etc/cloudron -chown "${USER}:${USER}" -R "${PLATFORM_DATA_DIR}/"{nginx,collectd,addons,acme,backup,logs,update,sftp,firewall,sshfs,cifs,tls,oidc,diskusage} +chown "${USER}:${USER}" -R "${PLATFORM_DATA_DIR}/"{nginx,addons,acme,backup,logs,update,sftp,firewall,sshfs,cifs,tls,oidc,diskusage} chown "${USER}:${USER}" "${PLATFORM_DATA_DIR}/INFRA_VERSION" 2>/dev/null || true chown "${USER}:${USER}" "${PLATFORM_DATA_DIR}" chown "${USER}:${USER}" "${APPS_DATA_DIR}" diff --git a/setup/start/collectd/collectd.conf b/setup/start/collectd/collectd.conf deleted file mode 100644 index 708a09daa..000000000 --- a/setup/start/collectd/collectd.conf +++ /dev/null @@ -1,229 +0,0 @@ -# Config file for collectd(1). -# -# Some plugins need additional configuration and are disabled by default. -# Please read collectd.conf(5) for details. -# -# You should also read /usr/share/doc/collectd-core/README.Debian.plugins -# before enabling any more plugins. - -############################################################################## -# Global # -#----------------------------------------------------------------------------# -# Global settings for the daemon. 
# -############################################################################## - -Hostname "localhost" -FQDNLookup false -#BaseDir "/var/lib/collectd" -#PluginDir "/usr/lib/collectd" -#TypesDB "/usr/share/collectd/types.db" "/etc/collectd/my_types.db" - -#----------------------------------------------------------------------------# -# When enabled, plugins are loaded automatically with the default options # -# when an appropriate block is encountered. # -# Disabled by default. # -#----------------------------------------------------------------------------# -#AutoLoadPlugin false - -#----------------------------------------------------------------------------# -# Interval at which to query values. This may be overwritten on a per-plugin # -# base by using the 'Interval' option of the LoadPlugin block: # -# # -# Interval 60 # -# # -#----------------------------------------------------------------------------# -# IMPORTANT: changing this value requires a change in whisper schema as well -Interval 20 - -#Timeout 2 -#ReadThreads 5 -#WriteThreads 5 - -# Limit the size of the write queue. Default is no limit. Setting up a limit -# is recommended for servers handling a high volume of traffic. -#WriteQueueLimitHigh 1000000 -#WriteQueueLimitLow 800000 - -############################################################################## -# Logging # -#----------------------------------------------------------------------------# -# Plugins which provide logging functions should be loaded first, so log # -# messages generated when loading or configuring other plugins can be # -# accessed. 
# -############################################################################## - -LoadPlugin logfile -#LoadPlugin syslog - - - LogLevel "info" - File "/home/yellowtent/platformdata/logs/collectd/collectd.log" - Timestamp true - PrintSeverity false - - -# -# LogLevel info -# - -############################################################################## -# LoadPlugin section # -#----------------------------------------------------------------------------# -# Specify what features to activate. # -############################################################################## - -LoadPlugin aggregation -#LoadPlugin amqp -#LoadPlugin apache -#LoadPlugin apcups -#LoadPlugin ascent -#LoadPlugin battery -#LoadPlugin bind -#LoadPlugin cgroups -#LoadPlugin conntrack -#LoadPlugin contextswitch -LoadPlugin cpu -#LoadPlugin cpufreq -#LoadPlugin csv -#LoadPlugin curl -#LoadPlugin curl_json -#LoadPlugin curl_xml -#LoadPlugin dbi -#LoadPlugin df -#LoadPlugin disk -#LoadPlugin dns -#LoadPlugin email -#LoadPlugin entropy -#LoadPlugin ethstat -#LoadPlugin exec -#LoadPlugin filecount -#LoadPlugin fscache -#LoadPlugin gmond -#LoadPlugin hddtemp -LoadPlugin interface -#LoadPlugin ipmi -#LoadPlugin iptables -#LoadPlugin ipvs -#LoadPlugin irq -#LoadPlugin java -#LoadPlugin libvirt -LoadPlugin load -#LoadPlugin lvm -#LoadPlugin madwifi -#LoadPlugin mbmon -#LoadPlugin md -#LoadPlugin memcachec -#LoadPlugin memcached -LoadPlugin memory -#LoadPlugin modbus -#LoadPlugin multimeter -#LoadPlugin mysql -#LoadPlugin netlink -#LoadPlugin network -#LoadPlugin nfs -#LoadPlugin nginx -#LoadPlugin notify_desktop -#LoadPlugin notify_email -#LoadPlugin ntpd -#LoadPlugin numa -#LoadPlugin nut -#LoadPlugin olsrd -#LoadPlugin openvpn -# -# Globals true -# -#LoadPlugin pinba -#LoadPlugin ping -#LoadPlugin postgresql -#LoadPlugin powerdns -#LoadPlugin processes -#LoadPlugin protocols - - Globals true - -#LoadPlugin rrdcached -#LoadPlugin rrdtool -#LoadPlugin sensors -#LoadPlugin serial -#LoadPlugin snmp 
-#LoadPlugin statsd -LoadPlugin swap -#LoadPlugin table -#LoadPlugin tail -#LoadPlugin tail_csv -#LoadPlugin tcpconns -#LoadPlugin teamspeak2 -#LoadPlugin ted -#LoadPlugin thermal -#LoadPlugin tokyotyrant -#LoadPlugin unixsock -#LoadPlugin uptime -#LoadPlugin users -#LoadPlugin uuid -#LoadPlugin varnish -#LoadPlugin vmem -#LoadPlugin vserver -#LoadPlugin wireless - - FlushInterval 20 - -#LoadPlugin write_http -#LoadPlugin write_riemann - -############################################################################## -# Plugin configuration # -#----------------------------------------------------------------------------# -# In this section configuration stubs for each plugin are provided. A desc- # -# ription of those options is available in the collectd.conf(5) manual page. # -############################################################################## - - - - Plugin "cpu" - Type "cpu" - - GroupBy "Host" - GroupBy "TypeInstance" - - CalculateNum false - CalculateSum true - CalculateAverage false - CalculateMinimum false - CalculateMaximum false - CalculateStddev false - - - - - Interface "eth0" - IgnoreSelected false - - - - ReportByDevice false - ReportBytes true - - - - # https://blog.dbrgn.ch/2017/3/10/write-a-collectd-python-plugin/ - ModulePath "/home/yellowtent/box/setup/start/collectd/" - LogTraces false # enable this to get traces in /var/log/collectd.log - Interactive false - - Import "df" - Import "docker-stats" - - - - - Host "127.0.0.1" - Port "2003" - Protocol "tcp" - LogSendErrors true - Prefix "collectd." 
- StoreRates true - AlwaysAppendDS false - EscapeCharacter "_" - - - diff --git a/setup/start/collectd/df.py b/setup/start/collectd/df.py deleted file mode 100644 index e044c26ec..000000000 --- a/setup/start/collectd/df.py +++ /dev/null @@ -1,38 +0,0 @@ -import collectd,os,subprocess - -# https://blog.dbrgn.ch/2017/3/10/write-a-collectd-python-plugin/ - -disks = [] - -def init(): - global disks - lines = [s.split() for s in subprocess.check_output(["df", "--type=ext4", "--output=source,target,size,used,avail"]).decode('utf-8').splitlines()] - disks = lines[1:] # strip header - collectd.info('custom df plugin initialized with %s' % disks) - -def read(): - for d in disks: - device = d[0] - if 'devicemapper' in d[1] or not device.startswith('/dev/'): continue - instance = device[len('/dev/'):].replace('/', '_').replace('.', '_') # see #348 - - try: - st = os.statvfs(d[1]) # handle disk removal - except: - continue - - # type comes from https://github.com/collectd/collectd/blob/master/src/types.db - val = collectd.Values(type='df_complex', plugin='df', plugin_instance=instance) - - free = st.f_bavail * st.f_frsize # bavail is for non-root user. 
bfree is total - val.dispatch(values=[free], type_instance='free') - - reserved = (st.f_bfree - st.f_bavail) * st.f_frsize # root took these - val.dispatch(values=[reserved], type_instance='reserved') - - used = (st.f_blocks - st.f_bfree) * st.f_frsize - val.dispatch(values=[used], type_instance='used') - -collectd.register_init(init) -# see Interval setting in collectd.conf for polling interval -collectd.register_read(read) diff --git a/setup/start/collectd/docker-stats.py b/setup/start/collectd/docker-stats.py deleted file mode 100644 index 80b13c4a1..000000000 --- a/setup/start/collectd/docker-stats.py +++ /dev/null @@ -1,69 +0,0 @@ -import collectd,os,subprocess,json,re - -# https://blog.dbrgn.ch/2017/3/10/write-a-collectd-python-plugin/ - -def parseSiSize(size): - units = {"B": 1, "KB": 10**3, "MB": 10**6, "GB": 10**9, "TB": 10**12} - number, unit, _ = re.split('([a-zA-Z]+)', size.upper()) - return int(float(number)*units[unit]) - -def parseBinarySize(size): - units = {"B": 1, "KIB": 2**10, "MIB": 2**20, "GIB": 2**30, "TIB": 2**40} - number, unit, _ = re.split('([a-zA-Z]+)', size.upper()) - return int(float(number)*units[unit]) - -def init(): - collectd.info('custom docker-status plugin initialized') - -def read(): - try: - lines = subprocess.check_output('docker stats --format "{{ json . }}" --no-stream --no-trunc', shell=True).decode('utf-8').strip().split("\n") - except Exception as e: - collectd.info('\terror getting docker stats: %s' % (str(e))) - return 0 - - # Sample line - # {"BlockIO":"430kB / 676kB","CPUPerc":"0.00%","Container":"7eae5e6f4f11","ID":"7eae5e6f4f11","MemPerc":"59.15%","MemUsage":"45.55MiB / 77MiB","Name":"1062eef3-ec96-4d81-9f02-15b7dd81ccb9","NetIO":"1.5MB / 3.48MB","PIDs":"5"} - - for line in lines: - stat = json.loads(line) - containerName = stat["Name"] # same as app id - - # currently we only collect data for apps main containers. 
Those have the app id as the Name which is 36 long - if len(containerName) != 36: - continue - - networkData = stat["NetIO"].split("/") - networkRead = parseSiSize(networkData[0].strip()) - networkWrite = parseSiSize(networkData[1].strip()) - - blockData = stat["BlockIO"].split("/") - blockRead = parseSiSize(blockData[0].strip()) - blockWrite = parseSiSize(blockData[1].strip()) - - memUsageData = stat["MemUsage"].split("/") - memUsed = parseBinarySize(memUsageData[0].strip()) - memMax = parseBinarySize(memUsageData[1].strip()) - - cpuPercData = stat["CPUPerc"].strip("%") - cpuPerc = float(cpuPercData) - - # type comes from https://github.com/collectd/collectd/blob/master/src/types.db and https://collectd.org/wiki/index.php/Data_source - val = collectd.Values(type='gauge', plugin='docker-stats', plugin_instance=containerName) - val.dispatch(values=[networkRead], type_instance='network-read') - val.dispatch(values=[networkWrite], type_instance='network-write') - val.dispatch(values=[blockRead], type_instance='blockio-read') - val.dispatch(values=[blockWrite], type_instance='blockio-write') - val.dispatch(values=[memUsed], type_instance='mem-used') - val.dispatch(values=[memMax], type_instance='mem-max') - val.dispatch(values=[cpuPerc], type_instance='cpu-perc') - - val = collectd.Values(type='counter', plugin='docker-stats', plugin_instance=containerName) - val.dispatch(values=[networkRead], type_instance='network-read') - val.dispatch(values=[networkWrite], type_instance='network-write') - val.dispatch(values=[blockRead], type_instance='blockio-read') - val.dispatch(values=[blockWrite], type_instance='blockio-write') - -collectd.register_init(init) -# see Interval setting in collectd.conf for polling interval -collectd.register_read(read) diff --git a/src/collectd.js b/src/collectd.js new file mode 100755 index 000000000..208954cd3 --- /dev/null +++ b/src/collectd.js @@ -0,0 +1,146 @@ +'use strict'; + +exports = module.exports = { + sendMetricsToGraphite +}; + 
+const BoxError = require('./boxerror.js'), + constants = require('./constants.js'), + debug = require('debug')('box:collectd'), + execSync = require('child_process').execSync, + net = require('net'), + os = require('os'); + +function parseSiSize(size) { + const units = { + "B": 1, + "KB": Math.pow(10, 3), + "MB": Math.pow(10, 6), + "GB": Math.pow(10, 9), + "TB": Math.pow(10, 12) + }; + + const match = size.toUpperCase().match(/^(\d+(\.\d+)?)\s*(\D+)$/); + if (!match) { + throw new Error("Invalid size format"); + } + + const number = parseFloat(match[1]); + const unit = match[3]; + + return Math.floor(number * units[unit]); +} + +function parseBinarySize(size) { + const units = { + "B": 1, + "KIB": Math.pow(2, 10), + "MIB": Math.pow(2, 20), + "GIB": Math.pow(2, 30), + "TIB": Math.pow(2, 40) + }; + + const match = size.toUpperCase().match(/^(\d+(\.\d+)?)\s*(\D+)$/); + if (!match) { + throw new Error("Invalid size format"); + } + + const number = parseFloat(match[1]); + const unit = match[3]; + + return Math.floor(number * units[unit]); +} + +async function getDockerMetrics() { + const lines = execSync('docker stats --format "{{ json . 
}}" --no-stream --no-trunc', { encoding: 'utf8' }).trim().split('\n'); + + const metrics = []; + + for (const line of lines) { + const stat = JSON.parse(line); + const name = stat.Name; // appid or addon name + + const networkData = stat.NetIO.split('/'); + const networkRead = parseSiSize(networkData[0].trim()); + const networkWrite = parseSiSize(networkData[1].trim()); + + const blockData = stat.BlockIO.split('/'); + const blockRead = parseSiSize(blockData[0].trim()); + const blockWrite = parseSiSize(blockData[1].trim()); + + const memUsageData = stat.MemUsage.split('/'); + const memUsed = parseBinarySize(memUsageData[0].trim()); + const memMax = parseBinarySize(memUsageData[1].trim()); + + const cpuPercData = stat.CPUPerc.trim().replace('%', ''); + const cpuPerc = parseFloat(cpuPercData); + + metrics.push( + { path: `cloudron.container-${name}.network-read`, value: networkRead }, + { path: `cloudron.container-${name}.network-write`, value: networkWrite }, + { path: `cloudron.container-${name}.blockio-read`, value: blockRead }, + { path: `cloudron.container-${name}.blockio-write`, value: blockWrite }, + { path: `cloudron.container-${name}.mem-used`, value: memUsed }, + { path: `cloudron.container-${name}.mem-max`, value: memMax }, + { path: `cloudron.container-${name}.cpu-perc`, value: cpuPerc }, + ); + } + + return metrics; +} + +async function getMemoryMetrics() { + // we can also read /proc/meminfo but complicated to match the 'used' output of free + const output = execSync('free --bytes --wide', { encoding: 'utf8' }).trim(); // --line is not in older ubuntu + const memoryRe = /Mem:\s+(?<total>\d+)\s+(?<used>\d+)\s+(?<free>\d+)\s+(?<shared>\d+)\s+(?<buffers>\d+)\s+(?<cache>\d+)\s+(?<available>\d+)/; + + const memory = output.match(memoryRe); + if (!memory) throw new BoxError(BoxError.EXTERNAL_ERROR, 'Could not find memory used'); + + return [ + { path: `cloudron.system.memory-used`, value: memory.groups.used }, + ]; +} + +async function getCpuMetrics() { + const cpus = os.cpus(); + const userMillis = cpus.map(c =>
c.times.user).reduce((p, c) => p+c); + const systemMillis = cpus.map(c => c.times.sys).reduce((p, c) => p+c); + + // stores percent + return [ + { path: `cloudron.system.cpu-user`, value: userMillis/1000 }, + { path: `cloudron.system.cpu-system`, value: systemMillis/1000 }, + ]; +} + +async function sendMetricsToGraphite() { + debug('sendStatsToGraphite: collecting stats'); + const dockerMetrics = await getDockerMetrics(); + const memoryMetrics = await getMemoryMetrics(); + const cpuMetrics = await getCpuMetrics(); + + return new Promise((resolve, reject) => { + const client = new net.Socket(); + client.connect(constants.GRAPHITE_PORT, '127.0.0.1', () => { + debug('connected to graphite'); + + const now = Math.floor(Date.now() / 1000); + + for (const metric of [...dockerMetrics, ...memoryMetrics, ...cpuMetrics]) { + client.write(`${metric.path} ${metric.value} ${now}\n`); + } + client.end(); + }); + + client.on('error', (error) => { + debug(error); + resolve(); + }); + + client.on('end', () => { + debug('sent to graphite'); + resolve(); + }); + }); +} diff --git a/src/constants.js b/src/constants.js index 67f62ce82..3eba32d1c 100644 --- a/src/constants.js +++ b/src/constants.js @@ -25,6 +25,7 @@ exports = module.exports = { DASHBOARD_SUBDOMAIN: 'my', PORT: CLOUDRON ? 
3000 : 5454, + GRAPHITE_PORT: 2003, INTERNAL_SMTP_PORT: 2525, // this value comes from the mail container AUTHWALL_PORT: 3001, LDAP_PORT: 3002, diff --git a/src/cron.js b/src/cron.js index 2d264b90c..f80600803 100644 --- a/src/cron.js +++ b/src/cron.js @@ -26,6 +26,7 @@ const appHealthMonitor = require('./apphealthmonitor.js'), AuditSource = require('./auditsource.js'), backups = require('./backups.js'), cloudron = require('./cloudron.js'), + collectd = require('./collectd.js'), constants = require('./constants.js'), { CronJob } = require('cron'), debug = require('debug')('box:cron'), @@ -64,7 +65,8 @@ const gJobs = { appHealthMonitor: null, diskUsage: null, externalLdapSyncer: null, - checkDomainConfigs: null + checkDomainConfigs: null, + collectStats: null }; // cron format @@ -190,6 +192,12 @@ async function startJobs() { start: true }); + gJobs.collectStats = CronJob.from({ + cronTime: '*/20 * * * * *', // every 20 seconds. if you change this, change carbon config + onTick: async () => await safe(collectd.sendMetricsToGraphite(), { debug }), + start: true + }); + await handleBackupPolicyChanged(await backups.getPolicy()); await handleAutoupdatePatternChanged(await updater.getAutoupdatePattern()); await handleDynamicDnsChanged(await network.getDynamicDns()); diff --git a/src/graphs.js b/src/graphs.js index 8bd16bf64..9e718a6ce 100644 --- a/src/graphs.js +++ b/src/graphs.js @@ -35,19 +35,17 @@ async function getContainerStats(name, fromMinutes, noNullPoints) { const timeBucketSize = fromMinutes > (24 * 60) ? (6*60) : 5; const graphiteUrl = await getGraphiteUrl(); - // https://collectd.org/wiki/index.php/Data_source . the gauge is point in time value. 
counter is the change of value const targets = [ - `summarize(collectd.localhost.docker-stats-${name}.gauge-cpu-perc, "${timeBucketSize}min", "avg")`, - `summarize(collectd.localhost.docker-stats-${name}.gauge-mem-used, "${timeBucketSize}min", "avg")`, - // `summarize(collectd.localhost.docker-stats-${name}.gauge-mem-max, "${timeBucketSize}min", "avg")`, - `summarize(collectd.localhost.docker-stats-${name}.counter-blockio-read, "${timeBucketSize}min", "sum")`, - `summarize(collectd.localhost.docker-stats-${name}.counter-blockio-write, "${timeBucketSize}min", "sum")`, - `summarize(collectd.localhost.docker-stats-${name}.counter-network-read, "${timeBucketSize}min", "sum")`, - `summarize(collectd.localhost.docker-stats-${name}.counter-network-write, "${timeBucketSize}min", "sum")`, - `summarize(collectd.localhost.docker-stats-${name}.gauge-blockio-read, "${fromMinutes}min", "max")`, - `summarize(collectd.localhost.docker-stats-${name}.gauge-blockio-write, "${fromMinutes}min", "max")`, - `summarize(collectd.localhost.docker-stats-${name}.gauge-network-read, "${fromMinutes}min", "max")`, - `summarize(collectd.localhost.docker-stats-${name}.gauge-network-write, "${fromMinutes}min", "max")`, + `summarize(cloudron.container-${name}.cpu-perc, "${timeBucketSize}min", "avg")`, + `summarize(cloudron.container-${name}.mem-used, "${timeBucketSize}min", "avg")`, + `summarize(cloudron.container-${name}.blockio-read, "${timeBucketSize}min", "sum")`, + `summarize(cloudron.container-${name}.blockio-write, "${timeBucketSize}min", "sum")`, + `summarize(cloudron.container-${name}.network-read, "${timeBucketSize}min", "sum")`, + `summarize(cloudron.container-${name}.network-write, "${timeBucketSize}min", "sum")`, + `summarize(cloudron.container-${name}.blockio-read, "${fromMinutes}min", "max")`, + `summarize(cloudron.container-${name}.blockio-write, "${fromMinutes}min", "max")`, + `summarize(cloudron.container-${name}.network-read, "${fromMinutes}min", "max")`, + 
`summarize(cloudron.container-${name}.network-write, "${fromMinutes}min", "max")`, ]; const results = []; @@ -61,9 +59,7 @@ async function getContainerStats(name, fromMinutes, noNullPoints) { noNullPoints: !!noNullPoints }; - // the retry() is needed because there is a node/fetch bug that a closed socket is reused when making a request to the same endpoint many times - // https://github.com/nodejs/undici/issues/3492 - const [error, response] = await safe(superagent.get(graphiteUrl).query(query).timeout(30 * 1000).retry(3).ok(() => true)); + const [error, response] = await safe(superagent.get(graphiteUrl).query(query).timeout(30 * 1000).ok(() => true)); if (error) throw new BoxError(BoxError.NETWORK_ERROR, error); if (response.status !== 200) throw new BoxError(BoxError.EXTERNAL_ERROR, `Unknown error with ${target}: ${response.status} ${response.text}`); @@ -86,42 +82,63 @@ async function getContainerStats(name, fromMinutes, noNullPoints) { }; } -async function getSystem(fromMinutes, noNullPoints) { +async function getSystemStats(fromMinutes, noNullPoints) { assert.strictEqual(typeof fromMinutes, 'number'); assert.strictEqual(typeof noNullPoints, 'boolean'); const timeBucketSize = fromMinutes > (24 * 60) ? 
(6*60) : 5; const graphiteUrl = await getGraphiteUrl(); - const cpuQuery = `summarize(sum(collectd.localhost.aggregation-cpu-sum.cpu-system, collectd.localhost.aggregation-cpu-sum.cpu-user), "${timeBucketSize}min", "avg")`; - const memoryQuery = `summarize(collectd.localhost.memory.memory-used, "${timeBucketSize}min", "avg")`; + const targets = [ + `summarize(sum(cloudron.system.cpu-system, cloudron.system.cpu-user), "${timeBucketSize}min", "avg")`, + `summarize(cloudron.system.memory-used, "${timeBucketSize}min", "avg")` + ]; - const query = { - target: [ cpuQuery, memoryQuery ], - format: 'json', - from: `-${fromMinutes}min`, - until: 'now' - }; + const results = []; - const [memCpuError, memCpuResponse] = await safe(superagent.get(graphiteUrl).query(query).timeout(30 * 1000).ok(() => true)); - if (memCpuError) throw new BoxError(BoxError.NETWORK_ERROR, memCpuError); - if (memCpuResponse.status !== 200) throw new BoxError(BoxError.EXTERNAL_ERROR, `Unknown error: ${memCpuResponse.status} ${memCpuResponse.text}`); + for (const target of targets) { + const query = { + target: target, + format: 'json', + from: `-${fromMinutes}min`, + until: 'now', + noNullPoints: !!noNullPoints + }; - const appResponses = {}; - for (const app of await apps.list()) { - appResponses[app.id] = await getContainerStats(app.id, fromMinutes, noNullPoints); - } + const [error, response] = await safe(superagent.get(graphiteUrl).query(query).timeout(30 * 1000).ok(() => true)); + if (error) throw new BoxError(BoxError.NETWORK_ERROR, error); + if (response.status !== 200) throw new BoxError(BoxError.EXTERNAL_ERROR, `Unknown error with ${target}: ${response.status} ${response.text}`); - const serviceResponses = {}; - for (const serviceId of await services.listServices()) { - serviceResponses[serviceId] = await getContainerStats(serviceId, fromMinutes, noNullPoints); + results.push(response.body[0] && response.body[0].datapoints ? 
response.body[0].datapoints : []); } return { - cpu: memCpuResponse.body[0] && memCpuResponse.body[0].datapoints ? memCpuResponse.body[0].datapoints : [], - memory: memCpuResponse.body[1] && memCpuResponse.body[1].datapoints ? memCpuResponse.body[1].datapoints : [], - apps: appResponses, - services: serviceResponses, + cpu: results[0], + memory: results[1] + }; +} + +async function getSystem(fromMinutes, noNullPoints) { + assert.strictEqual(typeof fromMinutes, 'number'); + assert.strictEqual(typeof noNullPoints, 'boolean'); + + const systemStats = await getSystemStats(fromMinutes, noNullPoints); + + const appStats = {}; + for (const app of await apps.list()) { + appStats[app.id] = await getContainerStats(app.id, fromMinutes, noNullPoints); + } + + const serviceStats = {}; + for (const serviceId of await services.listServices()) { + serviceStats[serviceId] = await getContainerStats(serviceId, fromMinutes, noNullPoints); + } + + return { + cpu: systemStats.cpu, + memory: systemStats.memory, + apps: appStats, + services: serviceStats, cpuCount: os.cpus().length }; } diff --git a/src/infra_version.js b/src/infra_version.js index 637747cfb..41dc437cb 100644 --- a/src/infra_version.js +++ b/src/infra_version.js @@ -12,7 +12,7 @@ exports = module.exports = { // docker inspect --format='{{index .RepoDigests 0}}' $IMAGE to get the sha256 . note this has registry in it because manifest id is registry specific! 
'images': { // 'base': 'registry.docker.com/cloudron/base:5.0.0@sha256:04fd70dbd8ad6149c19de39e35718e024417c3e01dc9c6637eaf4a41ec4e596c', - 'graphite': 'registry.docker.com/cloudron/graphite:3.5.0@sha256:ee7c9dc49a6507cb3e3cee25495b2044908feb91dac5df87a9633dea38fdeb8a', + 'graphite': 'registry.docker.com/cloudron/graphite:3.5.1@sha256:5383f694245f25a386140268b490a41aa0ba6fb0024d92852546e40c8458681f', 'mail': 'registry.docker.com/cloudron/mail:3.16.4@sha256:468239e1f7a9dc2cdf66750e66b83f1c561048fdd88ce7110fac89a5f7fb8777', 'mongodb': 'registry.docker.com/cloudron/mongodb:6.1.1@sha256:dd5db2d17c1232e1f8ef5528c86c20b70ba654b199d94de44d22afb17aac00aa', 'mysql': 'registry.docker.com/cloudron/mysql:3.5.2@sha256:5cf52069a5ffb126afcaf6cdf91dba7e2c719efe48669e46b616979ef825e25b',