#!/usr/bin/perl

# mysql_watchdog.pl    Monitor Mysql Daemon for possible lockups.
# ---------------------------------------------------------------------------
# Author: Yermo Lamers yml@yml.com
#
# Copyright (c) 1997 Yermo Lamers. All rights reserved.
#
# Redistribution and use, with or without modification, are permitted
# provided that redistributions must retain the above copyright notice
# and the following disclaimer:
#
# This software is provided by Yermo Lamers ``as is'' and any express
# or implied warranties, including, but not limited to, the implied
# warranties of merchantability and fitness for a particular purpose are
# disclaimed. In no event shall Yermo Lamers be liable
# for any direct, indirect, incidental, special, exemplary, or
# consequential damages (including, but not limited to, procurement of
# substitute goods or services; loss of use, data, or profits; or
# business interruption) however caused and on any theory of liability,
# whether in contract, strict liability, or tort (including negligence
# or otherwise) arising in any way out of the use of this software, even
# if advised of the possibility of such damage.
#
# ---------------------------------------------------------------------------
# PACKAGE: Mysqld Watchdog Script
# ---------------------------------------------------------------------------
# DESCRIPTION:
#
#   The mysqld daemon is prone to lockups causing queries to hang
#   indefinitely.
#
#   This script polls mysqld at regular intervals. If the server
#   does not respond promptly, mysql_watchdog will kill mysqld;
#   safe_mysqld should then notice this event and restart it.
#
# USAGE:
#
#   ./mysql_watchdog.pl | tee --append /tmp/watchdog.log
#
#   This will allow you to see the output in a window and keep it logged
#   to a file.
#
# ASSUMPTIONS:
# COMMENTS:
#
#   1. A log of events is kept in /tmp/mysql_watchdog.log
#
#   2. If you are having a lot of problems with mysqld I find it useful
#      to leave this script running in a window.
#
#   3. This script generates A LOT of output over time.
#
# SEE ALSO:
# REVISION: 0.01.2dev
#
# REVISION HISTORY/COMMENTS:
#
#   7 Oct 1997 YmL :
#       . initial revision.
#
#   10 Oct 1997 YmL:
#       . now works correctly on systems that implement POSIX restartable
#         system calls.
#       . reduced restart wait down to 15 seconds.
#
#   20 Oct 1997 YmL:
#       . removed sleep from signal handler, now interrupts system call
#         correctly.
#
# --------------------------------------------------------------------------

use strict;
use warnings;

use Mysql;

# SETTINGS:
#
# set this to reflect your site. YOU MUST EDIT THIS LINE FOR THIS TO WORK!

# path to the mysql pid file

my $mysqld_pidfilename = "/usr/local/var/YOURPIDFILEHERE";

# some globals - don't edit

my $reconnect     = "no";   # "yes" when the server connection must be rebuilt
my $startup_delay = 30;     # seconds to wait for mysqld to come up
my $call_timeout  = 15;     # seconds a guarded call may take before mysqld is killed
my $poll_interval = 30;     # seconds between two polls of the server

my $dbh;                    # current Mysql connection handle (undef if none)
my $status;                 # result of the most recent getserverstats() call

# -------------------------------------------------------------------------

# kill_mysqld()
# -----------------------------------------------------------------------
#
# DESCRIPTION:
#
#   This subroutine is invoked by a SIGALRM (or called directly with
#   "ALRM"), gets the pid of mysqld from the pid file and sends it a
#   SIGKILL (signal 9).
#
# PARAMETERS:
#
#   $signame - name of the signal that triggered the handler (unused).
#
# RETURNS:
#
#   Never returns normally. It always dies with a message containing
#   "Restart" so that the enclosing eval is left and the caller knows
#   a reconnect is required.
#
# COMMENTS:
#
#   The pid read from the file is validated before kill() is called.
#   An empty or malformed pid would be treated as 0 by kill(), which
#   signals every process in the watchdog's own process group.

sub kill_mysqld {

    my( $signame ) = shift;
    my( $mysqld_pid, $count );

    # log file event

    print "handler : Mysqld stopped responding at " . localtime() . "\n";

    # get the pid of mysqld

    my $pidfile;

    unless ( open( $pidfile, '<', $mysqld_pidfilename ) ) {
        print "Unable to open pid file '$mysqld_pidfilename' - $!\n";
        die "Restart\n";
    }

    print "handler : Reading PID file\n";

    # read the first line of the pid file
    my $pid_line = readline( $pidfile );
    close( $pidfile );

    # accept only a plain positive integer, ignoring surrounding whitespace
    # (this also drops the trailing newline).
    if ( defined $pid_line && $pid_line =~ /^\s*(\d+)\s*$/ ) {
        $mysqld_pid = $1 + 0;
    }

    unless ( $mysqld_pid ) {
        print "handler : pid file '$mysqld_pidfilename' does not contain a valid pid\n";
        die "Restart\n";
    }

    print "handler : Killing mysqld process '$mysqld_pid'\n";

    # site specific, try killing httpd also. - you may want to comment
    # this out.

    # system( "killall httpd" );

    # kill off the server - the safe_mysqld script should restart the
    # server once it's terminated.

    $count = kill( 9, $mysqld_pid );

    print "handler : kill returned '$count'\n";

    # This subroutine gets called inside an eval() so that it
    # can interrupt the hung system call. Note that the value here
    # will be returned to the caller.

    print "handler : calling die\n";

    die "Restart\n";

} # end of kill_mysqld()

# connect_error_text()
# -----------------------------------------------------------------------
#
# DESCRIPTION:
#
#   Returns the text of the last Mysql error without needing a
#   connection handle. When Mysql->connect() fails it returns undef,
#   so calling ->errmsg on its result is not possible.
#
# COMMENTS:
#
#   NOTE(review): relies on the $Mysql::db_errstr package variable of
#   the Mysql module - confirm against the installed module version.

sub connect_error_text {

    no warnings 'once';

    return defined $Mysql::db_errstr ? $Mysql::db_errstr : "unknown error";

} # end of connect_error_text()

# run_guarded()
# -----------------------------------------------------------------------
#
# DESCRIPTION:
#
#   Runs the given code reference under an alarm of $call_timeout
#   seconds with kill_mysqld() installed as the SIGALRM handler.
#
#   From the Perl 5 book, we wrap the system call inside an eval
#   so that the hung system call can exit when the signal handler
#   calls die() - this is one case where system V signal semantics are
#   nicer IMHO.
#
# PARAMETERS:
#
#   $operation - code reference to execute.
#
# RETURNS:
#
#   1 if the operation ended with a "Restart" request (kill_mysqld() ran),
#   0 otherwise.

sub run_guarded {

    my( $operation ) = @_;

    eval {
        local $SIG{ALRM} = \&kill_mysqld;

        alarm( $call_timeout );
        $operation->();
        alarm( 0 );
    };

    my $error = $@;

    # the eval may have been left by a die() that was not caused by the
    # alarm. Cancel the alarm so it cannot fire later, when the default
    # SIGALRM action would terminate the watchdog itself.

    alarm( 0 );

    return 1 if $error =~ /Restart/;

    print "Unexpected error: $error\n" if $error ne "";

    return 0;

} # end of run_guarded()

# ------------------------------------------------------------------------
# main

# make sure we can see what's going on by flushing STDOUT

$| = 1;

print "\nmysql_watchdog.pl started on " . localtime() . "\n";

# wait 30 seconds to let the server come up (assuming we're being
# called from safe_mysqld )

sleep( $startup_delay );

# establish a connection to the server - the server may already be
# hanging at this point so make sure to wrap this call in
# an alarm call.

print "Attempting initial connection at " . localtime() . "\n";

my $initial_failed = run_guarded( sub {

    if ( $dbh = Mysql->connect( "localhost" ) ) {

        print "Successful initial connection established at " . localtime() . "\n";

        $dbh->SelectDB( "mysql" );
    }
    else {

        alarm( 0 );

        # unable to connect to mysqld server.

        print "Unable to make initial connection to mysql server at " .
            localtime() . " due to error '" . connect_error_text() . "'\n";

        # if it's hung kill it off.

        kill_mysqld( "ALRM" );
    }
} );

# check to see whether we need to reconnect or not

if ( $initial_failed ) {
    print "Need to attempt a reconnect\n";
    $reconnect = "yes";
}

# loop trying a getserverstats() every 30 seconds

while ( 1 ) {

    # without a connection handle there is nothing to poll.

    $reconnect = "yes" unless defined $dbh;

    if ( $reconnect eq "yes" ) {

        # the signal handler has run, so we need to reconnect to the
        # server - I believe it may be able to hang here.

        # note again that we have to do this in an eval because POSIX
        # system calls are restartable. - note the die() in the signal handler

        print "Attempting to reconnect at " . localtime() . "\n";

        my $reconnect_failed = run_guarded( sub {

            if ( $dbh = Mysql->connect( "localhost" ) ) {

                # the connection has been established.

                $reconnect = "no";

                print "Successful reconnection to mysqld at " . localtime() . "\n";

                # site specific - restart httpd - you may want to comment this
                # out

                # print "Restarting httpd\n";
                # system( "/usr/local/sbin/httpd" );

                $dbh->SelectDB( "mysql" );
            }
            else {

                alarm( 0 );

                # there was some problem trying to connect to mysqld

                print "Unable to reconnect to Mysql - '" . connect_error_text() . "'\n";

                kill_mysqld( "ALRM" );
            }
        } );

        if ( $reconnect_failed ) {
            print "Need to reconnect\n";
            $reconnect = "yes";
        }

    } # end of if we had to reconnect to the server

    # execute a getserverstats() every 30 seconds - if
    # we don't get a response the server needs to be restarted.

    # again, we have to do each operation in an eval so we don't
    # hang indefinitely.

    $status = undef;

    # only poll when a usable connection exists. After a failed reconnect
    # the handle is undefined and the poll is skipped for this round.

    if ( $reconnect eq "no" && defined $dbh ) {

        my $poll_failed = run_guarded( sub {

            $status = $dbh->getserverstats();

            my $error_text = $dbh->errmsg;

            if ( defined $error_text && $error_text ne "" ) {

                alarm( 0 );

                print "getserverstats() produced error '" . $error_text . "'\n";

                # the only errors here would require a restart (I believe)

                kill_mysqld( "ALRM" );
            }
        } );

        if ( $poll_failed ) {
            print "getservstats() was locked. Need to do a reconnect\n";
            $reconnect = "yes";
        }
    }

    print( ( defined $status ? $status : "" ) . "\n" );

    # now wait to run another query

    sleep( $poll_interval );
}