#!/usr/local/bin/perl -w
$ID = q$Id$;
#
# bos.monitor -- Watch for AFS problems by monitoring bos output.
#
# Written by Russ Allbery
# Based on an earlier script by Neil Crellin
#
# Given a list of AFS servers on the command line, runs bos status on each
# one.  Checks to see if there is a communication failure, and also checks to
# see if anything has changed in the bos status output from the previous time
# that that server was checked.  If either of these conditions is true,
# print that information to STDOUT.  Suitable for being run inside mon.
#
# Provenance (taken from the git format-patch envelope this script arrived
# in, which adds it as the new file "check_bos", mode 100755, 112 lines):
#
#   Commit:  2b3c1f8fb36dcb3c197f8c74b6cf0da975a30bf4
#   From:    Quanah Gibson-Mount
#   Date:    Fri, 12 Dec 2003 00:07:56 +0000
#   Subject: [PATCH] check_bos command for AFS servers
#
# Note on ordering: $ID (above) and $STATUSDIR (below) are assigned before
# "use strict" appears.  That is intentional and safe: strict is lexically
# scoped and only applies from the "use strict" line onwards, and "use vars"
# declares both names as package globals for the strict code that follows.

##############################################################################
# Site configuration
##############################################################################

# This script maintains various files containing the old bos status output and
# flag files indicating whether communication failures have already been
# reported.  These are all saved in a tree rooted here.  The code also uses a
# "last" subdirectory of this tree to hold the previous run's output.
$STATUSDIR = '/usr/local/nagios/status/bos';

##############################################################################
# Modules and declarations
##############################################################################

require 5.005;

use strict;
use vars qw($ID $STATUSDIR);

##############################################################################
# Implementation
##############################################################################

# mon passes the list of servers to check on the command line.
# Results collected across all servers:
#   @fail       servers that have just become unreachable
#   @failclear  servers that were unreachable and can be contacted again
#   @diff       [ server, diff-output-lines... ] for servers whose bos status
#               output differs from the previous run
my (@fail, @failclear, @diff);

# Return 1 if NAME is safe to use as a server name, 0 otherwise.
#
# The name is interpolated into a shell command line and into file names
# below, so only plain host names and dotted addresses are accepted: letters,
# digits, '.', '_' and '-', starting with a letter or digit.  That rules out
# shell metacharacters, path separators, "." / ".." and option-like names
# beginning with '-'.
sub valid_server_name {
    my ($name) = @_;
    return 0 unless defined $name;
    return $name =~ /\A[A-Za-z0-9][A-Za-z0-9._-]*\z/ ? 1 : 0;
}

for my $server (@ARGV) {
    valid_server_name ($server)
        or die "Invalid server name '$server'\n";

    # Keep the previous run's output for comparison.  The result is
    # deliberately not checked: on the first run for a server there is no
    # previous output to move.
    rename ("$STATUSDIR/$server", "$STATUSDIR/last/$server");

    # Run bos status, saving its output (stderr included) as the new status
    # file while watching for the communication failure message.  Bareword
    # handles and two-argument open are kept because this script declares
    # compatibility with Perl 5.005; $server has been validated above.
    my $failure = 0;
    open (BOS, "/usr/local/bin/bos status $server -noauth -long 2>&1 |")
        or die "Cannot fork bos status for $server: $!\n";
    open (OUT, "> $STATUSDIR/$server")
        or die "Cannot create $STATUSDIR/$server: $!\n";
    while (defined (my $line = <BOS>)) {
        $failure = 1
            if $line =~ /bos: failed to contact host\'s bosserver /;
        print OUT $line;
    }
    # close on a pipe returns false when the command exits non-zero, which is
    # not an error for our purposes, so it is not checked.
    close BOS;
    # Buffered write errors only surface at close, so do check this one.
    close OUT
        or die "Cannot write $STATUSDIR/$server: $!\n";

    # Only report communication failures once, since we may have intended to
    # take the machine off-line.  Therefore, when reporting a communications
    # failure, touch a marker file, and only report communication failures if
    # that marker file doesn't exist.  Report clearing of communication
    # failures (contactable machine, marker file exists) as well.
    if (!$failure && -e "$STATUSDIR/$server.commfail") {
        unlink ("$STATUSDIR/$server.commfail")
            or die "Cannot delete $STATUSDIR/$server.commfail: $!\n";
        push (@failclear, $server);
    } elsif ($failure && !-e "$STATUSDIR/$server.commfail") {
        open (TOUCH, "> $STATUSDIR/$server.commfail")
            or die "Cannot create $STATUSDIR/$server.commfail: $!\n";
        close TOUCH;
        push (@fail, $server);
    } elsif (-e "$STATUSDIR/last/$server") {
        # No change in reachability: compare against the previous output and
        # record any differences.
        my $current = "$STATUSDIR/$server";
        my $old = "$STATUSDIR/last/$server";
        open (DIFF, "/usr/bin/diff $old $current |")
            or die "Cannot fork diff: $!\n";
        my @output = <DIFF>;
        # diff exits non-zero when the files differ, so close is not checked.
        close DIFF;
        if (@output > 0) {
            push (@diff, [ $server, @output ]);
        }
    }
}

# If any of @fail, @failclear, or @diff are non-empty, we have something
# interesting to report.  Actually explain what's going on in our output so
# that the mon alert is hopefully readable.  (On the other hand, it also is
# going to a pager, so be succinct.)
# Report phase.  Reads @fail, @failclear and @diff and prints the result to
# STDOUT.
#
# With nothing to report, print a single OK line and exit with status 0.
if (!@fail && !@failclear && !@diff) {
    print "Bos OK\n";
    exit 0;
}

# First line: a terse one-line summary, e.g. "a b fail, c clear, d change".
# Each non-empty category contributes one comma-separated part.
my @parts;
push (@parts, "@fail fail") if @fail;
push (@parts, "@failclear clear") if @failclear;
push (@parts, join (' ', map { $$_[0] } @diff) . ' change') if @diff;
my $summary = join (', ', @parts);
print "$summary\n";

# Then the detail, one section per category.
if (@fail) {
    print "Communication failure for:\n\n", join ("\n", @fail), "\n\n";
}
if (@failclear) {
    print "Failure cleared for:\n\n", join ("\n", @failclear), "\n\n";
}
for my $entry (@diff) {
    # Each entry is [ server, diff-output-lines... ]; the diff lines already
    # end in newlines, so they are printed as-is.
    my ($server, @lines) = @$entry;
    print "$server bos status changed:\n\n", @lines, "\n";
}

# Something was reported: exit with status 2 (the CRITICAL state in the
# Nagios plugin convention).
exit 2;

# (The original text ended here with the format-patch signature "-- 2.39.5".)