-#!/usr/local/bin/perl
-# $Id$
+#!/usr/bin/perl -w
+$ID = q$Id$;
+#
+# check_rxdebug -- Check AFS servers for blocked connections in Nagios.
+#
+# Written by Quanah Gibson-Mount based on work by Neil Crellin
+# Updated by Russ Allbery <rra@stanford.edu>
+# Copyright 2003, 2004 Board of Trustees, Leland Stanford Jr. University
+#
+# This program is free software; you may redistribute it and/or modify it
+# under the same terms as Perl itself.
+#
+# Expects a file server with the -H option and runs rxdebug against that file
+# server, looking for any connections that are waiting for a process. Exits
+# with status 1 if there are four or more connections in that state (a
+# warning) and with status 2 if there are eight or more connections in that
+# state. The thresholds can be overridden from the command line.
-use Getopt::Std;
+##############################################################################
+# Site configuration
+##############################################################################
-getopts ("H:");
-$server = $opt_H;
+# Default blocked-connection counts at or above which to warn or send a
+# critical alert. Override with the -w and -c command-line options.
+$WARNINGS = 4;
+$CRITICAL = 8;
-$rxdebug = '/usr/local/bin/rxdebug';
-@failures=();
-$hiWaterMark=8;
+# Default rxdebug timeout in seconds (enforced with alarm); override with -t.
+$TIMEOUT = 60;
-# Get the output of rxdebug $server.
-open(RXDEBUG, "$rxdebug $server |")
- || die("Can't open rxdebug\n");
-$blocked{$server} = 0;
-while (<RXDEBUG>) {
- if ( /waiting_for_process/ ) {
- $blocked{$server}++;
- }
+# Path to rxdebug: the first executable candidate, else /usr/local/bin/rxdebug.
+# Keep it on local disk so that monitoring doesn't have an AFS dependency.
+($RXDEBUG) = grep { -x $_ } qw(/usr/bin/rxdebug /usr/local/bin/rxdebug);
+$RXDEBUG ||= '/usr/local/bin/rxdebug';
+
+##############################################################################
+# Modules and declarations
+##############################################################################
+
+require 5.003;
+
+use strict;
+use vars qw($CRITICAL $ID $RXDEBUG $TIMEOUT $WARNINGS);
+
+use Getopt::Long qw(GetOptions);
+
+##############################################################################
+# Implementation
+##############################################################################
+
+# Parse command line options. Every usage problem exits with status 3, the
+# status this script documents for errors such as invalid syntax.
+my ($help, $host, $version);
+Getopt::Long::config ('bundling', 'no_ignore_case');
+GetOptions ('critical|c=i' => \$CRITICAL,
+            'hostname|H=s' => \$host,
+            'help|h'       => \$help,
+            'timeout|t=i'  => \$TIMEOUT,
+            'version|V'    => \$version,
+            'warning|w=i'  => \$WARNINGS) or exit 3;
+if ($help) {
+    print "Feeding myself to perldoc, please wait....\n";
+
+    # exec only returns on failure. Report that and exit 3 explicitly rather
+    # than using die, whose exit status is taken from $! and could therefore
+    # collide with the warning (1) or critical (2) statuses.
+    exec ('perldoc', '-t', $0) or warn "$0: cannot exec perldoc: $!\n";
+    exit 3;
+} elsif ($version) {
+    # Build the version string from the second through fourth
+    # whitespace-separated fields of $ID (presumably file name, revision, and
+    # date once the Id keyword has been expanded): drop a ",v" suffix, wrap
+    # the last field in parentheses, and turn slashes into dashes.
+    my $id = join (' ', (split (' ', $ID))[1..3]);
+    $id =~ s/,v\b//;
+    $id =~ s/(\S+)$/($1)/;
+    $id =~ tr%/%-%;
+    print $id, "\n";
+    exit 0;
+}
+
+# -H is required and no non-option arguments are accepted.
+if (@ARGV || !defined ($host) || $host eq '') {
+    warn "Usage: $0 [-hV] [-c <level>] [-w <level>] [-t <timeout>] -H <host>\n";
+    exit 3;
+}
+
+# A warning threshold above the critical threshold could never be reported.
+if ($WARNINGS > $CRITICAL) {
+    warn "$0: warning level $WARNINGS greater than critical level $CRITICAL\n";
+    exit 3;
}
-close(RXDEBUG);
-foreach $server (sort keys %blocked) {
- $blocked=$blocked{$server};
- if ($blocked >= $hiWaterMark) {
- push (@failures, "$server blck: $blocked");
- }
+# Set up the alarm. $pid is declared first so that the handler can reach the
+# rxdebug child once it has been started.
+my $pid;
+$SIG{ALRM} = sub {
+    # Terminate rxdebug so that it is not left running and so that closing
+    # the pipe at exit does not wait on a hung child.
+    kill ('TERM', $pid) if $pid;
+    print "AFS CRITICAL: network timeout after $TIMEOUT seconds\n";
+    exit 2;
+};
+alarm ($TIMEOUT);
+
+# Run rxdebug and parse the output, counting the number of waiting for process
+# connections that we have. Fork with an implicit pipe and exec rxdebug
+# directly with an argument list so that the host name is never interpreted
+# by a shell.
+$pid = open (RXDEBUG, '-|');
+unless (defined $pid) {
+    warn "$0: cannot run rxdebug\n";
+    exit 3;
+}
+if ($pid == 0) {
+    # Child: standard output is the pipe that the parent reads. A non-zero
+    # exit here is seen by the parent through $? after close.
+    exec ($RXDEBUG, $host) or warn "$0: cannot exec $RXDEBUG: $!\n";
+    exit 3;
+}
+my $blocked = 0;
+while (<RXDEBUG>) {
+    $blocked++ if /waiting_for_process/;
+}
+
+# Closing the pipe waits for the child and sets $? to its exit status; any
+# failure of rxdebug is reported as the server being unreachable.
+close RXDEBUG;
+if ($? != 0) {
+    print "AFS CRITICAL: cannot contact server\n";
+    exit 2;
}
-if (@failures == 0) {
- print "rxdebug OK\n";
+# Compare the count to the limits (inclusive); critical takes precedence.
+if ($blocked >= $CRITICAL) {
+ print "AFS CRITICAL: $blocked blocked connections\n";
+ exit 2;
+} elsif ($blocked >= $WARNINGS) {
+ print "AFS WARNING: $blocked blocked connections\n";
+ exit 1;
+} else {
+ print "AFS OK: $blocked blocked connections\n";
exit 0;
}
-print "@failures\n";
-exit 2;
+##############################################################################
+# Documentation
+##############################################################################
+
+=head1 NAME
+
+check_rxdebug - Check AFS servers for blocked connections in Nagios
+
+=head1 SYNOPSIS
+
+check_rxdebug [B<-hV>] [B<-c> I<threshold>] [B<-w> I<threshold>]
+[B<-t> I<timeout>] B<-H> I<host>
+
+=head1 DESCRIPTION
+
+B<check_rxdebug> is a Nagios plugin for checking AFS file servers to see if
+there are client connections waiting for a free thread. If there are more
+than a few of these, AFS performance tends to be very slow; this is a fairly
+reliable way to catch overloaded file servers. By default, B<check_rxdebug>
+returns a critical error if there are eight or more connections waiting
+for a free thread and a warning if there are four or more. These
+thresholds can be changed with the B<-c> and B<-w> options.
+
+B<check_rxdebug> will always print out a single line of output including the
+number of blocked connections, displaying whether this is critical, a
+warning, or okay.
+
+=head1 OPTIONS
+
+=over 4
+
+=item B<-c> I<threshold>, B<--critical>=I<threshold>
+
+Change the critical blocked connection count threshold to I<threshold>,
+which should be an integer. The default is 8.
+
+=item B<-H> I<host>, B<--hostname>=I<host>
+
+The AFS file server whose connections B<check_rxdebug> should check. This
+option is required.
+
+=item B<-h>, B<--help>
+
+Print out this documentation (which is done simply by feeding the script
+to C<perldoc -t>).
+
+=item B<-t> I<timeout>, B<--timeout>=I<timeout>
+
+Change the timeout for the C<rxdebug> command. The default timeout is 60
+seconds.
+
+=item B<-V>, B<--version>
+
+Print out the version of B<check_rxdebug> and quit.
+
+=item B<-w> I<threshold>, B<--warning>=I<threshold>
+
+Change the warning blocked connection threshold to I<threshold>, which
+should be an integer. The default is 4.
+
+=back
+
+=head1 EXIT STATUS
+
+B<check_rxdebug> follows the standard Nagios exit status requirements. This
+means that it will exit with status 0 if there are no problems, with status
+1 if there is a warning, and with status 2 if there is a critical problem.
+For other errors, such as invalid syntax, B<check_rxdebug> will exit with
+status 3.
+
+=head1 BUGS
+
+The standard B<-v> verbose Nagios plugin option is not supported, although
+it's not entirely clear what it would add.
+
+The usage message for invalid options and for the B<-h> option doesn't
+conform to Nagios standards.
+
+=head1 CAVEATS
+
+This script does not use the Nagios util library or any of the defaults that
+it provides, which makes it somewhat deficient as a Nagios plugin. This is
+intentional, though, since this script can be used with other monitoring
+systems as well. It's not clear what a good solution to this would be.
+
+=head1 SEE ALSO
+
+The current version of this and other AFS monitoring plugins for Nagios are
+available from the AFS monitoring tools page at
+L<http://www.eyrie.org/~eagle/software/afs-monitor/>.
+
+=head1 AUTHORS
+
+The original idea behind this script was from Neil Crellin. It was updated
+by Quanah Gibson-Mount to work with Nagios, and then further updated by Russ
+Allbery <rra@stanford.edu> to support more standard options and to use a
+more uniform coding style.
+
+=head1 COPYRIGHT AND LICENSE
+
+Copyright 2003, 2004 Board of Trustees, Leland Stanford Jr. University.
+
+This program is free software; you may redistribute it and/or modify it
+under the same terms as Perl itself.
+
+=cut