collectd plugin ordering matters. The write_graphite plugin establishes a TCP connection, but there is a race between that connection coming up and the df/du values being reported. du is especially problematic since we only compute it every 12 hours. So instead, we cache the values and report them every 20 seconds; on the carbon side, it will just retain one value every 12 hours (since that is the whisper retention period). There is also FlushInterval, which I am not 100% sure has any effect. By default, the write_graphite plugin waits for 1428 bytes to accumulate. (https://manpages.debian.org/unstable/collectd-core/collectd.conf.5.en.html) https://github.com/collectd/collectd/issues/2672 https://github.com/collectd/collectd/pull/1044 I found this syntax hidden deep inside https://www.cisco.com/c/en/us/td/docs/net_mgmt/virtual_topology_system/2_6_3/user_guide/Cisco_VTS_2_6_3_User_Guide/Cisco_VTS_2_6_1_User_Guide_chapter_01111.pdf
106 lines
4.0 KiB
Python
106 lines
4.0 KiB
Python
import collectd,os,subprocess,sys,re,time
|
|
|
|
# https://www.programcreek.com/python/example/106897/collectd.register_read
|
|
|
|
# Directories to measure; configure() appends one { name, dir, exclude } dict per Path block
PATHS = []

# there is a pattern in carbon/storage-schemas.conf which stores values every 12h for a year
INTERVAL = 12 * 60 * 60  # seconds, i.e. twice a day. change values in docker-graphite if you change this

# We used to pass INTERVAL as a parameter to register_read. However, collectd's write_graphite
# takes a bit to load (tcp connection) and drops the du data, which meant waiting another
# INTERVAL secs for du data. Instead, we just cache the values for INTERVAL.
CACHE = {}      # cache key (a directory, or 'docker') -> last computed size
CACHE_TIME = 0  # time.time() of the last cache refresh; 0 makes the first read() recompute
|
|
|
|
def du(pathinfo):
    """Return the disk usage, in bytes, of the directory described by *pathinfo*.

    pathinfo is a dict with the keys 'dir' (the directory to measure) and
    'exclude' (a pattern passed to du's --exclude; an empty string means
    nothing is excluded).

    Returns an int. Any failure (du missing, non-zero exit status, the
    1800 second timeout expiring, unparsable output) is logged and reported
    as 0 rather than raised.
    """
    dirname = pathinfo['dir']

    # -B1 makes du print block sizes and not apparent sizes (to match df which also uses block sizes)
    cmd = ['timeout', '1800', 'du', '-DsB1']
    if pathinfo['exclude']:
        cmd += ['--exclude', pathinfo['exclude']]
    # The command is run as an argument list, without a shell, so quotes, spaces,
    # '$' or backticks in the configured values are taken literally. The '--' keeps
    # a directory name starting with '-' from being parsed as an option.
    cmd += ['--', dirname]

    collectd.info('computing size with command: %s' % ' '.join(cmd))
    try:
        # du prints "<size>\t<path>"; the first field is the size in bytes
        size = int(subprocess.check_output(cmd).split()[0])
        collectd.info('\tsize of %s is %s (time: %i)' % (dirname, size, int(time.time())))
        return size
    except Exception as e:
        # deliberate best-effort: a failed measurement must not break the read callback
        collectd.info('\terror getting the size of %s: %s' % (dirname, str(e)))
        return 0
|
|
|
|
def parseSize(size):
    """Convert a human-readable decimal size such as '1.5GB' or '10.2kB' to bytes.

    Units are powers of 1000 (kB, MB, ...), not 1024, and are matched
    case-insensitively. A value without a unit is taken to be in bytes.
    Anything following the unit is ignored.

    Returns the size as an int, rounded to the nearest byte.

    Raises:
        ValueError: if the numeric part is missing or not a valid number.
        KeyError: if the unit is not one of the known decimal units.
    """
    units = {
        "B": 1, "KB": 10**3, "MB": 10**6, "GB": 10**9, "TB": 10**12,
        "PB": 10**15, "EB": 10**18, "ZB": 10**21, "YB": 10**24,
    }
    # group 1: everything before the first letter (the number); group 2: the unit letters
    number, unit = re.match(r'([^a-zA-Z]*)([a-zA-Z]*)', size.strip()).groups()
    multiplier = units[unit.upper() or "B"]
    # round instead of truncating: float("1.001") * 1000 is 1000.9999999999999
    return int(round(float(number) * multiplier))
|
|
|
|
def dockerSize():
    """Return the size, in bytes, reported on the first row of `docker system df`.

    The log messages below treat that first row as the docker images size.

    Returns an int. Any failure (docker missing or exiting non-zero, the
    1800 second timeout expiring, empty or unparsable output) is logged and
    reported as 0 rather than raised.
    """
    # use --format '{{json .}}' to dump the string. '{{if eq .Type "Images"}}{{.Size}}{{end}}' still creates newlines
    # https://godoc.org/github.com/docker/go-units#HumanSize is used. so it's 1000 (KB) and not 1024 (KiB)
    cmd = ['timeout', '1800', 'docker', 'system', 'df', '--format', '{{.Size}}']
    try:
        # No shell pipeline here: a failing docker command now raises with its own
        # exit status instead of being hidden behind the exit status of `head`.
        output = subprocess.check_output(cmd).decode('utf-8')
        size = output.splitlines()[0].strip()  # equivalent of `| head -n1`
        size_in_bytes = parseSize(size)
        collectd.info('size of docker images is %s (%s) (time: %i)' % (size, size_in_bytes, int(time.time())))
        return size_in_bytes
    except Exception as e:
        # deliberate best-effort: a failed measurement must not break the read callback
        collectd.info('error getting docker images size : %s' % str(e))
        return 0
|
|
|
|
# configure is called for each module block. this is called before init
|
|
def configure(config):
    """Config callback: collect the directories to monitor into PATHS.

    Every child of *config* whose key is 'Path' contributes one entry. Its own
    children 'Instance', 'Dir' and 'Exclude' fill the entry's 'name', 'dir'
    and 'exclude' fields; fields that are not given stay ''. Children of
    *config* with any other key are logged and skipped.
    """
    # config option inside a Path block -> field of the PATHS entry
    fields = {'Instance': 'name', 'Dir': 'dir', 'Exclude': 'exclude'}

    for child in config.children:
        if child.key != 'Path':
            # the original referenced an undefined name `key` here, which raised NameError
            collectd.info('du plugin: Unknown config key "%s"' % child.key)
            continue

        pathinfo = {'name': '', 'dir': '', 'exclude': ''}
        for node in child.children:
            if node.key in fields:
                pathinfo[fields[node.key]] = node.values[0]

        PATHS.append(pathinfo)
        collectd.info('du plugin: monitoring %s' % pathinfo['dir'])
|
|
|
|
def init():
    """Init callback: log the configured paths and the Python version in use."""
    startup_details = (PATHS, sys.version)
    collectd.info('custom du plugin initialized with %s %s' % startup_details)
|
|
|
|
def read():
    """Read callback: dispatch the size of every configured path and of docker.

    Sizes are recomputed when the last refresh is INTERVAL seconds or more in
    the past (or when a key has no cached value yet); otherwise the cached
    value is dispatched again. Each size is sent as a 'capacity' value of
    plugin 'du' with type instance 'usage'.
    """
    global CACHE, CACHE_TIME

    # the cache is usable while it is younger than INTERVAL (12 hours)
    cache_is_fresh = INTERVAL > (time.time() - CACHE_TIME)
    if not cache_is_fresh:
        CACHE_TIME = time.time()

    def cached_or_measured(key, measure):
        # reuse the cached size when allowed, otherwise measure and remember it
        if not (cache_is_fresh and key in CACHE):
            CACHE[key] = measure()
        return CACHE[key]

    def publish(instance, usage):
        # type comes from https://github.com/collectd/collectd/blob/master/src/types.db
        metric = collectd.Values(type='capacity', plugin='du', plugin_instance=instance)
        metric.dispatch(values=[usage], type_instance='usage')

    for entry in PATHS:
        usage = cached_or_measured(entry['dir'], lambda: du(entry))
        publish(entry['name'], usage)

    publish('docker', cached_or_measured('docker', dockerSize))
|
|
|
|
# Hook the plugin into collectd. The registrations are listed in the order the
# callbacks run: configure (once per module block, before init), then init,
# then read. read is registered without an interval argument; it caches its
# values for INTERVAL itself.
collectd.register_config(configure)
collectd.register_init(init)
collectd.register_read(read)
|