Files
cloudron-box/scripts/cloudron-support
2024-06-13 18:51:11 +02:00

583 lines
21 KiB
Bash
Executable File

#!/bin/bash
# Strict mode: stop on a failing command (-e), on use of an unset variable (-u)
# and when any stage of a pipeline fails (pipefail).
set -eu -o pipefail

# Refuse to run for anyone but root (effective uid 0).
if (( EUID != 0 )); then
    echo "This script should be run as root. Run with sudo"
    exit 1
fi
# Terminal color escape sequences. They are stored unexpanded (single quotes)
# and only take effect when printed through 'echo -e' / 'printf %b'.
readonly \
    RED='\033[31m' \
    GREEN='\033[32m' \
    YELLOW='\033[33m' \
    DONE='\033[m'

# Paste service that receives the diagnostics report.
readonly PASTEBIN="https://paste.cloudron.io"

# Separator placed around section titles in the diagnostics report.
readonly LINE="\n========================================================\n"

# Usage text. It embeds ${PASTEBIN}, so it has to be declared after it.
readonly HELP_MESSAGE="
Cloudron Support and Diagnostics Tool
Options:
--disable-dnssec Disable DNSSEC
--enable-remote-access Enable SSH Remote Access for the Cloudron support team
--recreate-containers Deletes all existing containers and recreates them without loss of data
--recreate-docker Deletes docker storage (containers and images) and recreates it without loss of data
--send-diagnostics Collects server diagnostics and uploads it to ${PASTEBIN}
--troubleshoot Dashboard down? Run tests to identify the potential problem
--owner-login Login as owner
--use-external-dns Forwards all DNS requests to Google (8.8.8.8) and Cloudflare (1.1.1.1) DNS servers
--help Show this message
"
# Print "[OK]" (in green), a tab and the message given as $1 on stdout.
# Backslash escapes inside the message are expanded.
function success() {
    printf '%b\n' "[${GREEN}OK${DONE}]\t${1}"
}
# Print a tab followed by the message given as $1 on stdout, so that it lines
# up with the text of the tagged [OK]/[WARN]/[FAIL] messages.
# Backslash escapes inside the message are expanded.
function info() {
    printf '%b\n' "\t${1}"
}
# Print "[WARN]" (in yellow), a tab and the message given as $1 on stdout.
# Backslash escapes inside the message are expanded.
function warn() {
    printf '%b\n' "[${YELLOW}WARN${DONE}]\t${1}"
}
# Print "[FAIL]" (in red), a tab and the message given as $1 on stderr.
# Backslash escapes inside the message are expanded. Does not exit by itself.
function fail() {
    printf '%b\n' "[${RED}FAIL${DONE}]\t${1}" >&2
}
# Authorize the Cloudron support team's SSH public key by appending it to
# /home/cloudron-support/.ssh/authorized_keys, unless the key is already
# listed there. When the key is added, the file is made private (mode 600)
# and handed to the 'cloudron-support' user.
function enable_remote_access() {
    local -r cloudron_support_public_key="ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAIGWS+930b8QdzbchGljt3KSljH9wRhYvht8srrtQHdzg support@cloudron.io"
    local -r ssh_user="cloudron-support"
    local -r keys_file="/home/cloudron-support/.ssh/authorized_keys"

    echo -n "Enabling Remote Access for the Cloudron support team..."
    mkdir -p "$(dirname "${keys_file}")" # .ssh does not exist sometimes
    touch "${keys_file}" # required for concat to work

    # -F: the key is a literal string, not a regex (its '.' would otherwise
    # match any character); '--' protects against a pattern starting with '-'
    if ! grep -qF -- "${cloudron_support_public_key}" "${keys_file}"; then
        # leading newline in case the file does not end with one
        echo -e "\n${cloudron_support_public_key}" >> "${keys_file}"
        chmod 600 "${keys_file}"
        chown "${ssh_user}" "${keys_file}"
    fi
    echo "Done"
}
# Check that a systemd service has been up long enough to be considered stable.
# Arguments: $1 - name of the systemd unit
# Returns:   0 as soon as the unit's ActiveEnterTimestamp is more than 10
#            seconds in the past; 1 if that is still not the case after
#            three checks (with an 11 second pause after each failed check).
function wait_systemd_service() {
    local -r service="$1"
    # keep the scratch variables out of the global namespace
    local attempt ts start now up_time

    for attempt in {1..3}; do
        ts=$(systemctl show "${service}" -p ActiveEnterTimestamp | sed 's/ActiveEnterTimestamp=//g')
        start=$(date '+%s' --date="${ts}")
        now=$(date '+%s')
        up_time=$(( now - start ))
        (( up_time > 10 )) && return 0
        info "Service '${service}' just started $up_time secs ago, checking health again in 10s"
        sleep 11
    done
    return 1
}
# Verify that the host's MySQL service is active and stays up. One restart is
# attempted when it is down. Exits the script with status 1 when MySQL cannot
# be brought up or wait_systemd_service reports that it keeps restarting.
function check_host_mysql() {
    local -r log_hint="please investigate the error by inspecting /var/log/mysql/error.log"

    if ! systemctl is-active -q mysql; then
        info "MySQL is down. Trying to restart MySQL ..."
        systemctl restart mysql
        systemctl is-active -q mysql || { fail "MySQL is still down, ${log_hint}"; exit 1; }
    fi

    wait_systemd_service mysql || { fail "MySQL keeps restarting, ${log_hint}"; exit 1; }

    success "MySQL is running"
}
# Verify that the 'box' service is active and stays up. When it is down, the
# setup script /home/yellowtent/box/setup/start.sh is re-run and the service
# is stopped and started again. Exits the script with status 1 when the
# service cannot be brought up or wait_systemd_service reports that it keeps
# restarting. The version shown in messages is read from the VERSION file and
# falls back to '<unknown>' when that file is missing or unreadable.
function check_box() {
    local -r version_file="/home/yellowtent/box/VERSION"
    local -r box_log="/home/yellowtent/platformdata/logs/box.log"
    local version='<unknown>'

    # explicit if/else instead of 'A && B || C'; keep the fallback if cat fails
    if [[ -f "${version_file}" ]]; then
        version=$(cat "${version_file}") || version='<unknown>'
    fi

    if ! systemctl is-active -q box; then
        info "box v${version} is down. re-running migration script and restarting it ..."
        /home/yellowtent/box/setup/start.sh
        systemctl stop box # a restart sometimes doesn't restart, no idea
        systemctl start box
        if ! systemctl is-active -q box; then
            fail "box service is still down, please investigate the error by inspecting ${box_log}"
            exit 1
        fi
    fi

    if ! wait_systemd_service box; then
        fail "box service keeps restarting, please investigate the error by inspecting ${box_log}"
        exit 1
    fi

    success "box v${version} is running"
}
# Set up a temporary password for the owner account and print how to log in.
# The first active user with role 'owner' (ordered by creationTime) is looked
# up in box.users, a random 12 character password is generated with pwgen and
# stored as {"<username>":"<password>"} in the 'ghosts_config' row of
# box.settings. Exits the script with status 1 when no owner is found, when no
# password could be generated, or when the setting could not be stored.
function owner_login() {
    check_host_mysql >/dev/null

    local owner_username owner_password dashboard_domain

    # declaration and assignment are separate so a failure is not masked by 'local'
    owner_username=$(mysql -NB -uroot -ppassword -e "SELECT username FROM box.users WHERE role='owner' AND username IS NOT NULL AND active=1 ORDER BY creationTime LIMIT 1" 2>/dev/null) || true
    if [[ -z "${owner_username}" ]]; then
        fail "Could not find an active owner account in the database"
        exit 1
    fi

    # never store an empty password when pwgen is missing or fails
    if ! owner_password=$(pwgen -1s 12) || [[ -z "${owner_password}" ]]; then
        fail "Could not generate a password. Is 'pwgen' installed?"
        exit 1
    fi

    dashboard_domain=$(mysql -NB -uroot -ppassword -e "SELECT value FROM box.settings WHERE name='dashboard_domain'" 2>/dev/null) || true

    local -r ghosts_config="{\"${owner_username}\":\"${owner_password}\"}"
    if ! mysql -NB -uroot -ppassword -e "INSERT INTO box.settings (name, value) VALUES ('ghosts_config', '${ghosts_config}') ON DUPLICATE KEY UPDATE name='ghosts_config', value='${ghosts_config}'" 2>/dev/null; then
        fail "Could not store the temporary password in the database"
        exit 1
    fi

    echo "Login at https://my.${dashboard_domain} as ${owner_username} / ${owner_password} . This password may only be used once."
}
# Collect system information into /tmp/cloudron-support.log, upload that file
# to ${PASTEBIN} and print the resulting link for the user to email to support.
# Collection is best-effort: a collector that fails (for example
# 'systemctl status', which returns non-zero when a listed unit is not
# running) must not abort the report, because this command is typically run
# when something is already broken. Exits the script with status 1 only when
# the upload does not yield a paste key.
# NOTE(review): the log path is fixed and lives in /tmp; it is kept as is
# because the file is left behind for manual inspection.
function send_diagnostics() {
    local -r log="/tmp/cloudron-support.log"
    local cloudron_version dashboard_domain has_ipv6 paste_key

    echo -n "Generating Cloudron Support stats..."
    rm -rf "${log}"

    # stdout and stderr of everything in this group end up in the log
    {
        echo -e "${LINE}Linux${LINE}"
        uname -nar || true

        echo -e "${LINE}Ubuntu${LINE}"
        lsb_release -a || true

        echo -e "${LINE}Cloudron${LINE}"
        cloudron_version=$(cat /home/yellowtent/box/VERSION || true)
        echo -e "Cloudron version: ${cloudron_version}"
        dashboard_domain=$(mysql -NB -uroot -ppassword -e "SELECT value FROM box.settings WHERE name='dashboard_domain'" 2>/dev/null || true)
        echo -e "Dashboard domain: ${dashboard_domain}"

        # docker and df can hang; give up on each of them after 15s
        echo -e "${LINE}Docker${LINE}"
        if ! timeout --kill-after 10s 15s docker system info; then
            echo -e "Docker (system info) is not responding"
        fi
        if ! timeout --kill-after 10s 15s docker ps -a; then
            echo -e "Docker (ps) is not responding"
        fi

        echo -e "${LINE}Filesystem stats${LINE}"
        if ! timeout --kill-after 10s 15s df -h; then
            echo -e "df is not responding"
        fi

        echo -e "${LINE}Appsdata stats${LINE}"
        du -hcsL /home/yellowtent/appsdata/* || true

        echo -e "${LINE}Boxdata stats${LINE}"
        du -hcsL /home/yellowtent/boxdata/* || true

        echo -e "${LINE}Backup stats (possibly misleading)${LINE}"
        du -hcsL /var/backups/* || true

        echo -e "${LINE}System daemon status${LINE}"
        # non-zero whenever one of the units is not active
        systemctl status --lines=100 box mysql unbound cloudron-syslog nginx collectd docker || true

        echo -e "${LINE}Box logs${LINE}"
        tail -n 100 /home/yellowtent/platformdata/logs/box.log || true

        echo -e "${LINE}Interface Info${LINE}"
        ip addr || true

        echo -e "${LINE}Firewall chains${LINE}"
        iptables -L || true

        if cat /proc/net/if_inet6 >/dev/null 2>&1; then
            has_ipv6="yes"
        else
            has_ipv6="no"
        fi
        echo -e "IPv6: ${has_ipv6}"
        if [[ "${has_ipv6}" == "yes" ]]; then
            ip6tables -L || true
        fi
    } >> "${log}" 2>&1

    echo "Done"

    echo -n "Uploading information..."
    if ! paste_key=$(curl -X POST "${PASTEBIN}/documents" --silent --data-binary "@${log}" | python3 -c "import sys, json; print(json.load(sys.stdin)['key'])") || [[ -z "${paste_key}" ]]; then
        echo "Failed"
        fail "Could not upload the diagnostics to ${PASTEBIN}. Please email the file ${log} to support@cloudron.io instead"
        exit 1
    fi
    echo "Done"

    echo -e "\nPlease email the following link to support@cloudron.io : ${PASTEBIN}/${paste_key}"
}
# Verify that the unbound DNS resolver is active, stays up and resolves names.
# When it is down, the root trust anchor is refreshed and one restart is
# attempted. Exits the script with status 1 when unbound cannot be brought
# up, keeps restarting, or a lookup of cloudron.io via 127.0.0.1 fails or
# returns nothing.
function check_unbound() {
    local test_resolve

    if ! systemctl is-active -q unbound; then
        info "unbound is down. updating root anchor to see if it fixes it"
        # unbound-anchor exits with 1 when it updated the anchor; that must
        # not abort the script under 'set -e'
        unbound-anchor -a /var/lib/unbound/root.key || true
        systemctl restart unbound
        if ! systemctl is-active -q unbound; then
            fail "unbound is still down, please investigate the error using 'journalctl -u unbound'"
            exit 1
        fi
    fi

    if ! wait_systemd_service unbound; then
        fail "unbound service keeps restarting, please investigate the error using 'journalctl -u unbound'"
        exit 1
    fi

    # a failing dig and an empty answer both count as "not resolving"
    if ! test_resolve=$(dig cloudron.io @127.0.0.1 +short) || [[ -z "${test_resolve}" ]]; then
        fail "DNS is not resolving, maybe try forwarding all DNS requests using the --use-external-dns option"
        exit 1
    fi

    success "unbound is running"
}
# Check the TLS certificate referenced by the dashboard's nginx config.
# Returns quietly when 'openssl x509 -checkend 100' accepts the certificate.
# Otherwise reports the expiry date, points at the log of the most recent
# 'checkCerts' task and exits the script with status 1.
function check_dashboard_cert() {
    local -r dashboard_domain=$(mysql -NB -uroot -ppassword -e "SELECT value FROM box.settings WHERE name='dashboard_domain'" 2>/dev/null)
    local -r nginx_conf_file="/home/yellowtent/platformdata/nginx/applications/dashboard/my.${dashboard_domain}.conf"
    # path given to the ssl_certificate directive of that config
    local -r cert_file=$(sed -n -e 's/.*ssl_certificate [[:space:]]\+\(.*\);/\1/p' "${nginx_conf_file}")
    local -r cert_expiry_date=$(openssl x509 -enddate -noout -in "${cert_file}" | sed -e 's/notAfter=//')

    # nothing to report while the certificate passes the check
    if openssl x509 -checkend 100 -noout -in "${cert_file}" >/dev/null 2>&1; then
        return 0
    fi

    fail "Certificate has expired. Certificate expired at ${cert_expiry_date}"
    local -r task_id=$(mysql -NB -uroot -ppassword -e "SELECT id FROM box.tasks WHERE type='checkCerts' ORDER BY id DESC LIMIT 1" 2>/dev/null)
    printf '%b\n' \
        "\tPlease check /home/yellowtent/platformdata/logs/tasks/${task_id}.log for last cert renewal logs" \
        "\tCommon issues include expiry of domain's API key OR incoming http port 80 not being open"
    exit 1
}
# Verify that the nginx service is active and stays up, attempting a repair when it is down.
# Repair steps when nginx is not active:
#   1. delete every regular file in the dashboard config directory that is not named
#      my.<dashboard_domain>.conf
#   2. delete every *.conf below the nginx platformdata directory whose ssl_certificate or
#      ssl_certificate_key directive points to a file that does not exist
#   3. restart nginx
# Exits the script with status 1 when nginx is still down after the restart or when
# wait_systemd_service reports a failure.
function check_nginx() {
# dashboard domain as stored in box.settings; mysql's stderr is discarded
# NOTE(review): if this query returns nothing, the find filter below becomes "my..conf" and
# every file in the dashboard config directory gets removed - confirm this cannot happen
local -r dashboard_domain=$(mysql -NB -uroot -ppassword -e "SELECT value FROM box.settings WHERE name='dashboard_domain'" 2>/dev/null)
if ! systemctl is-active -q nginx; then
fail "nginx is down. Removing extraneous dashboard domain configs ..."
# we had a bug where old dashboard domain config file was kept around
# note: the cd is not undone, the working directory stays changed for the rest of the script
cd /home/yellowtent/platformdata/nginx/applications/dashboard/ && find . ! -name "my.${dashboard_domain}.conf" -type f -exec rm -f {} +
# check if certificates are there. nginx will still start if certs are expired
# IFS= makes sure it doesn't trim leading and trailing whitespace
# -r prevents interpretation of \ escapes.
# file names are passed NUL-delimited (-print0 / read -d '')
find /home/yellowtent/platformdata/nginx -type f -name '*.conf' -print0 | while IFS= read -r -d '' conf; do
# values of the ssl_certificate / ssl_certificate_key directives (empty if the directive is absent)
cert_file=$(sed -ne 's/[[:blank:]]\+ssl_certificate[[:blank:]]\+\(.*\);/\1/p' "${conf}")
key_file=$(sed -ne 's/[[:blank:]]\+ssl_certificate_key[[:blank:]]\+\(.*\);/\1/p' "${conf}")
# a config that references a missing certificate is removed
if [[ -n "${cert_file}" && ! -f "${cert_file}" ]]; then
info "${cert_file} does not exist. removing ${conf}"
rm -f "${conf}"
fi
# same for a missing private key
if [[ -n "${key_file}" && ! -f "${key_file}" ]]; then
info "${key_file} does not exist. removing ${conf}"
rm -f "${conf}"
fi
done
systemctl restart nginx
if ! systemctl is-active -q nginx; then
fail "nginx is still down, please investigate the error by inspecting 'journalctl -u nginx' and /var/log/nginx/error.log"
exit 1
fi
fi
# also fail when wait_systemd_service does not see the service stay up
if ! wait_systemd_service nginx; then
fail "nginx service keeps restarting, please investigate the error using 'journalctl -u nginx' and /var/log/nginx/error.log"
exit 1
fi
success "nginx is running"
}
# This confirms that https works properly without any proxy (cloudflare)
# involved: the dashboard host name is pinned to 127.0.0.1 for the request.
# Exits the script with status 1 when the page cannot be loaded.
function check_dashboard_site_loopback() {
    local -r dashboard_domain=$(mysql -NB -uroot -ppassword -e "SELECT value FROM box.settings WHERE name='dashboard_domain'" 2>/dev/null)
    local -r dashboard_host="my.${dashboard_domain}"

    curl --fail -s --resolve "${dashboard_host}:443:127.0.0.1" "https://${dashboard_host}" >/dev/null && return 0

    fail "Could not load dashboard website with loopback check"
    exit 1
}
# Compare the installed node version with the one pinned in
# /home/yellowtent/box/scripts/installer.sh ('readonly node_version=...').
# Prints repair instructions and exits the script with status 1 on a mismatch
# or when no node binary is found.
# NOTE(review): this file defines check_node a second time further down; bash
# keeps the later definition, so this one is shadowed at runtime. It is kept
# consistent with the later one: a missing node binary is reported instead of
# aborting the script without a message.
function check_node() {
    local expected_node_version current_node_version

    expected_node_version="$(sed -ne 's/readonly node_version=\(.*\)/\1/p' /home/yellowtent/box/scripts/installer.sh)"

    if command -v node &> /dev/null; then
        current_node_version="$(node --version | tr -d '\n' | cut -c2-)" # strip trailing newline and 'v' prefix
    else
        current_node_version="<not found>"
    fi

    if [[ "${current_node_version}" != "${expected_node_version}" ]]; then
        fail "node version is incorrect. Expecting ${expected_node_version}. Got ${current_node_version}."
        echo "You can try the following to fix the problem:"
        echo " ln -sf /usr/local/node-${expected_node_version}/bin/node /usr/bin/node"
        echo " ln -sf /usr/local/node-${expected_node_version}/bin/npm /usr/bin/npm"
        echo " systemctl restart box"
        exit 1
    fi

    success "node version is correct"
}
# Verify that the docker service is active and stays up. One restart is
# attempted when it is down. Exits the script with status 1 when docker cannot
# be brought up or wait_systemd_service reports that it keeps restarting.
function check_docker() {
    local -r journal_hint="please investigate the error using 'journalctl -u docker'"

    if ! systemctl is-active -q docker; then
        info "Docker is down. Trying to restart docker ..."
        systemctl restart docker
        systemctl is-active -q docker || { fail "Docker is still down, ${journal_hint}"; exit 1; }
    fi

    wait_systemd_service docker || { fail "Docker keeps restarting, ${journal_hint}"; exit 1; }

    success "docker is running"
}
# Compare the installed node version with the one pinned in
# /home/yellowtent/box/scripts/installer.sh ('readonly node_version=...').
# A missing node binary counts as version "<not found>". On a mismatch the
# repair instructions are printed and the script exits with status 1.
function check_node() {
    expected_node_version="$(sed -ne 's/readonly node_version=\(.*\)/\1/p' /home/yellowtent/box/scripts/installer.sh)"

    current_node_version="<not found>"
    if command -v node &> /dev/null; then
        # 'node --version' prints e.g. v1.2.3; drop the newline and the leading 'v'
        current_node_version="$(node --version | tr -d '\n' | cut -c2-)"
    fi

    if [[ "${current_node_version}" == "${expected_node_version}" ]]; then
        success "node version is correct"
        return
    fi

    fail "node version is incorrect. Expecting ${expected_node_version}. Got ${current_node_version}."
    printf '%s\n' \
        "You can try the following to fix the problem:" \
        " ln -sf /usr/local/node-${expected_node_version}/bin/node /usr/bin/node" \
        " ln -sf /usr/local/node-${expected_node_version}/bin/npm /usr/bin/npm" \
        " systemctl restart box"
    exit 1
}
# Load https://my.<dashboard_domain> through normal name resolution and make
# sure the answer is the Cloudron dashboard (the page must contain the text
# "Cloudron Dashboard"). Exits the script with status 1 when the page cannot
# be loaded or is something else; the hint that is printed depends on whether
# the domain's provider in box.domains is 'cloudflare'.
function check_dashboard_site_domain() {
    local -r dashboard_domain=$(mysql -NB -uroot -ppassword -e "SELECT value FROM box.settings WHERE name='dashboard_domain'" 2>/dev/null)
    local -r domain_provider=$(mysql -NB -uroot -ppassword -e "SELECT provider FROM box.domains WHERE domain='${dashboard_domain}'" 2>/dev/null)
    local output

    # TODO: check ipv4 and ipv6
    if ! output=$(curl --fail -s "https://my.${dashboard_domain}"); then
        fail "Could not load dashboard domain."
        if [[ "${domain_provider}" == "cloudflare" ]]; then
            echo "Maybe cloudflare proxying is not working. Delete the domain in Cloudflare dashboard and re-add it. This sometimes re-establishes the proxying"
        else
            echo "Hairpin NAT is not working. Please check if your router supports it"
        fi
        exit 1
    fi

    # here-string instead of 'echo $output | grep -q': no word splitting or
    # globbing of the page content, and no pipeline that can fail with SIGPIPE
    # under 'pipefail' once grep -q stops reading at the first match
    if ! grep -q "Cloudron Dashboard" <<< "${output}"; then
        fail "https://my.${dashboard_domain} is not the dashboard domain. Check if DNS is set properly to this server"
        # purely informational, its exit status must not change the outcome
        host "my.${dashboard_domain}" 127.0.0.1 || true # could also result in cloudflare
        exit 1
    fi

    success "Dashboard is reachable via domain name"
}
# Check the registration expiry of the dashboard domain using whois.
# The check is skipped (returns 0 after an info/warning message) when whois
# is not installed, when its output has no expiry line, or when the date in
# that line cannot be parsed by date(1). Exits the script with status 1 when
# the expiry date lies in the past.
function check_expired_domain() {
    local -r dashboard_domain=$(mysql -NB -uroot -ppassword -e "SELECT value FROM box.settings WHERE name='dashboard_domain'" 2>/dev/null)
    local expdate expdate_secs curdate_secs

    if ! command -v whois &> /dev/null; then
        info "Domain ${dashboard_domain} expiry check skipped because whois is not installed. Run 'apt install whois' to check"
        return 0
    fi

    # last field of the first line that looks like an expiry date; a failing
    # stage (e.g. grep without a match) simply leaves the value empty
    expdate=$(whois "${dashboard_domain}" | grep -E -i 'Expiration Date:|Expires on|Expiry Date:' | head -1 | awk '{print $NF}') || true
    if [[ -z "${expdate}" ]]; then
        warn "Domain ${dashboard_domain} expiry check skipped because whois does not have this information"
        return 0
    fi

    # do not claim the domain is valid when the date is not understood
    if ! expdate_secs=$(date -d "${expdate}" +%s 2>/dev/null); then
        warn "Domain ${dashboard_domain} expiry check skipped because the whois expiry date '${expdate}' could not be parsed"
        return 0
    fi

    curdate_secs=$(date +%s)
    if (( curdate_secs > expdate_secs )); then
        fail "Domain ${dashboard_domain} appears to be expired"
        exit 1
    fi

    success "Domain ${dashboard_domain} is valid and has not expired"
}
# Make unbound forward every DNS request to external resolvers by writing a
# catch-all forward-zone to forward-everything.conf and restarting unbound.
function use_external_dns() {
    local -r conf_file="/etc/unbound/unbound.conf.d/forward-everything.conf"

    info "To remove the forwarding, please delete $conf_file and 'systemctl restart unbound'"

    # zone "." covers all names
    printf '%s\n' \
        'forward-zone:' \
        'name: "."' \
        'forward-addr: 1.1.1.1' \
        'forward-addr: 8.8.8.8' \
        > "${conf_file}"

    systemctl restart unbound
    success "Forwarded all DNS requests to Google (8.8.8.8) & Cloudflare DNS (1.1.1.1)"
}
# Turn on unbound's 'val-permissive-mode' by writing disable-dnssec.conf and
# restarting unbound.
function disable_dnssec() {
    local -r conf_file="/etc/unbound/unbound.conf.d/disable-dnssec.conf"

    warn "To reenable DNSSEC, please delete $conf_file and 'systemctl restart unbound'"

    printf '%s\n' \
        'server:' \
        'val-permissive-mode: yes' \
        > "${conf_file}"

    systemctl restart unbound
    success "DNSSEC Disabled"
}
# Run all health checks one after the other, in a fixed order.
function troubleshoot() {
    # note: disk space test has already been run globally
    local -r _troubleshoot_steps=(
        check_node
        check_docker
        check_host_mysql
        check_nginx                     # requires mysql to be checked
        check_dashboard_site_loopback   # checks website via loopback
        check_box
        check_unbound
        check_dashboard_cert
        check_dashboard_site_domain     # check website via domain name
        check_expired_domain
    )
    local _troubleshoot_step

    for _troubleshoot_step in "${_troubleshoot_steps[@]}"; do
        "${_troubleshoot_step}"
    done
}
# Make sure there is enough free disk space to work with. Exits the script
# with status 1 (after printing advice) when / has less than 10240 or /tmp
# has less than 5120 blocks available, as reported in the second line of
# 'df --output=avail'.
function check_disk_space() {
    # check if at least 10mb root partition space is available
    if [[ "$(df --output="avail" / | sed -n 2p)" -lt "10240" ]]; then
        echo "No more space left on /"
        echo "This is likely the root cause of the issue. Free up some space and also check other partitions below:"
        echo ""
        df -h
        echo ""
        echo "To recover from a full disk, follow the guide at https://docs.cloudron.io/troubleshooting/#recovery-after-disk-full"
        exit 1
    fi

    # check for at least 5mb free /tmp space for the log file
    if [[ "$(df --output="avail" /tmp | sed -n 2p)" -lt "5120" ]]; then
        echo "Not enough space left on /tmp"
        echo "Free up some space first by deleting files from /tmp"
        exit 1
    fi
}
# Re-create all containers after an interactive confirmation: the "version"
# value in INFRA_VERSION is overwritten, the box service is restarted and the
# function then waits until box.log contains a new "platform is ready" line.
# Exits the script with status 1 when the user does not answer with y/Y.
function recreate_containers() {
    local -r logfile="/home/yellowtent/platformdata/logs/box.log"
    local choice line_count

    echo "This will re-create all the containers. Apps will go down for a while. No data will be lost."
    read -p "Do you want to proceed? (y/N) " -n 1 -r choice
    echo
    if [[ ! "${choice}" =~ ^[Yy]$ ]]; then
        exit 1
    fi

    echo ""
    info "Follow re-create logs in a second terminal with:"
    info "$ tail -f ${logfile}"
    echo ""
    echo -n "Re-creating addon containers (this takes a while) ."

    # remember how long the log is now, only lines added later are of interest
    line_count=$(wc -l < "${logfile}")
    # replace the recorded version so that it differs from the current one
    # NOTE(review): presumably box re-creates the containers when the infra version changed - confirm
    sed -e 's/"version": ".*",/"version":"48.0.0",/' -i /home/yellowtent/platformdata/INFRA_VERSION
    systemctl restart box

    # 'tail -n +N' starts AT line N, so +1 is needed to skip the last old line.
    # Process substitution keeps tail's exit status (SIGPIPE once grep -q
    # stops at the first match) out of the condition under 'pipefail'.
    while ! grep -q "platform is ready" < <(tail -n "+$(( line_count + 1 ))" "${logfile}"); do
        echo -n "."
        sleep 2
    done

    echo ""
    echo "Done! Addon containers successfully re-created. The apps in the dashboard will say 'Configuring (Queued)'. They will come up in a short while."
}
# Wipe docker's storage directory and rebuild everything after an interactive
# confirmation: box and docker are stopped, the contents of the docker root
# directory are deleted, docker is started again, the "version" value in
# INFRA_VERSION is overwritten, box is restarted and the function waits until
# box.log contains a new "platform is ready" line.
# Exits the script with status 1 when the user does not answer with y/Y or
# when the storage directory cannot be cleared.
function recreate_docker() {
    local -r logfile="/home/yellowtent/platformdata/logs/box.log"
    local -r default_docker_root="/var/lib/docker"
    local docker_root choice line_count

    # A failed 'docker info' must not leave docker_root empty: the value is
    # used for 'rm -rf' below.
    if ! docker_root=$(docker info -f '{{ .DockerRootDir }}') || [[ -z "${docker_root}" ]]; then
        warn "Unable to detect docker root. Assuming ${default_docker_root}"
        docker_root="${default_docker_root}"
    fi

    echo -e "Use this command when docker storage (at $docker_root) is corrupt. It will delete the docker storage, re-download docker images and re-create containers. Dashboard and apps will be unreachable for a while. No data will be lost.\n"
    read -p "Do you want to proceed? (y/N) " -n 1 -r choice
    echo -e "\n"
    if [[ ! "${choice}" =~ ^[Yy]$ ]]; then
        exit 1
    fi

    info "Stopping box"
    systemctl disable box || true
    systemctl stop -q box || true

    info "Stopping docker"
    systemctl disable -q docker || true # for the reboot situation, we don't want it start again
    systemctl stop -q docker || true

    info "Clearing docker storage at ${docker_root}"
    # ':?' aborts instead of expanding to "/"* should the variable ever be empty
    if ! rm -rf "${docker_root:?}/"*; then
        fail "Could not delete storage directory. This can happen because of stray containers that docker has lost track of. To fix this, reboot the server and run this command again"
        exit 1
    fi
    info "Cleared docker storage"

    info "Starting docker afresh"
    systemctl enable -q docker
    systemctl start -q docker

    echo ""
    info "Follow re-create logs in a second terminal with:"
    info "$ tail -f ${logfile}"
    echo ""
    echo -n "Re-downloading images and re-creating addon containers (this takes a while) ."

    # remember how long the log is now, only lines added later are of interest
    line_count=$(wc -l < "${logfile}")
    # replace the recorded version so that it differs from the current one
    # NOTE(review): presumably box re-creates the containers when the infra version changed - confirm
    sed -e 's/"version": ".*",/"version":"48.0.0",/' -i /home/yellowtent/platformdata/INFRA_VERSION
    systemctl enable box
    systemctl restart box # this will create docker network as well

    # 'tail -n +N' starts AT line N, so +1 is needed to skip the last old line.
    # Process substitution keeps tail's exit status (SIGPIPE once grep -q
    # stops at the first match) out of the condition under 'pipefail'.
    while ! grep -q "platform is ready" < <(tail -n "+$(( line_count + 1 ))" "${logfile}"); do
        echo -n "."
        sleep 2
    done

    echo ""
    echo "Done! Addon containers successfully re-created. The apps in the dashboard will say 'Configuring (Queued)'. They will come up in a short while."
}
# A full disk breaks most of the actions below, so check for it first.
check_disk_space

# Normalize the command line with getopt (long options only).
args=$(getopt -o "" -l "admin-login,disable-dnssec,enable-ssh,enable-remote-access,help,owner-login,recreate-containers,recreate-docker,send-diagnostics,use-external-dns,troubleshoot" -n "$0" -- "$@")
eval set -- "${args}"

# Run the action of the first option and exit; --enable-ssh and --admin-login
# are accepted as aliases. --help and the end-of-options marker leave the loop
# so that the help text gets printed.
while true; do
    case "$1" in
    --enable-ssh | --enable-remote-access) enable_remote_access; exit 0;;
    --admin-login | --owner-login) owner_login; exit 0;;
    --send-diagnostics) send_diagnostics; exit 0;;
    --troubleshoot) troubleshoot; exit 0;;
    --disable-dnssec) disable_dnssec; exit 0;;
    --use-external-dns) use_external_dns; exit 0;;
    --recreate-containers) recreate_containers; exit 0;;
    --recreate-docker) recreate_docker; exit 0;;
    --help | --) break;;
    *) echo "Unknown option $1"; exit 1;;
    esac
done

echo -e "${HELP_MESSAGE}"