#!/usr/local/bin/perl
# go4check, v1.3
#
#-------------------------------------------------------------------------------
# Introduction
#   go4check checks gopher links, probing each connection and testing the 
#   output received.  It handles most types of links, reporting if the link 
#   is ok, the host serving it is down/refusing connections, or its pathname 
#   has changed.  It is not 100% successful at this, especially when it 
#   comes to gopher0 servers, but does indeed help you keep on top of links
#   in your server(s).
#
#   To run, go4check requires only perl and socket.ph.  It understands
#   gopher0 and gopher+ servers.
#
#   go4check produces a line of output on stdout for each item appearing 
#   in a gopher's menu: the name of the item plus a result.  Indentation 
#   serves to maintain items in context so problems can be located easily.
#   As an extra benefit, go4check's output can be used as a roadmap of
#   the gopher after some rather trivial editing to remove results.
#
#   go4check is written by George A. Theall, George.A.Theall@mail.tju.edu.  
#   You may freely use and redistribute this.  I can not offer any
#   support for this but am interested in your comments, suggestions,
#   and problem reports.
#
#   The latest version is available via gopher as:
#      gopher://tjgopher.tju.edu/00/networks/internet/tools/gopher/go4check
#
#   Note: Version 1.3 will probably be the last version of go4check I release.
#
#-------------------------------------------------------------------------------
# Operation
#   Before you run go4check, make sure perl and the header file socket.ph are
#   available on your system. [You can generate this file by running the perl 
#   utility h2ph on /usr/include/sys/socket.h, or something similar.]
#
#   Invoke go4check with the name of the server to check and an optional port
#   number.  Other options can be used to specify a non-standard starting
#   path or generate copious debugging info.  go4check will test the items
#   listed in the initial menu and recurse into any menus it finds as long
#   as the names of server it finds match the one specified at go4check's
#   invocation. go4check does, though, skip recursion if pathnames refer 
#   to ftp gateways or point back to the initial entry point.
#
#   Results are directed to stdout, so you probably will want to redirect
#   to a file.  You might then remove instances of "...ok.", which
#   indicate no problems and finally search on "...can't connect.",
#   "...path changed.", and "...timed out.".  Another possible result 
#   is "...n/a.", which is used when go4check doesn't know how to check 
#   a particular type of link.
#
#   You may want to tune the variables that go4check uses for testing 
#   items of type 2 and 7.  See below where initial values are defined.
#   For items of type 2, go4check sends a invalid command, which causes
#   many CSO servers to respond in a way that go4check interprets as a
#   success.  As for items of type 7, I don't know of any robust way
#   to test searches.  Currently, the best solution appears to be
#   to search for a word that's common to whatever searches are in the
#   gopher being checked.
#
#   go4check is slow; it probably belongs in a cron job to run at night.
#
#-------------------------------------------------------------------------------
# History
#   15-Mar-95, GAT, v1.3
#	- Added ability to avoid recursing into selected paths.  Paths are
#	  tested using substr() so you can have go4check check a directory
#	  but not recurse further by appending a "/", if that's what you want.
#
#   27-Feb-95, GAT, v1.2
#      - Moved alarm for connecting to within the tcpconnect subroutine
#        to handle better time-out conditions.
#      - Wrapped initial gopher connection and telnet check with alarms.
#
#   31-Jan-95, GAT, v1.1
#      - Alarms are now used to abort connections that are otherwise hung.
#      - Added patches from R.D. Cameron for supporting type 7 items with
#        non-empty paths and checking error returns of type 3.
#      - Fixed glitch that arose on some servers (gopher.uwsp.edu for one)
#        that return lines with non-standard endings.
#      - Explicitly added an assignment for $| and set it to true so output
#        will be flushed after every print.
#
#   17-Oct-94, GAT
#      - Added a semicolon after a line in make_URL.  Its lack appears to
#        cause problems with some versions of Perl.
#
#   01-Sep-94, GAT, v1.0
#      - Released publically.
#
#   10-Aug-94, GAT, v1.0b2
#      - Added $snooze_length as a way to control how long to pause after
#        establishing a connection.
#      - Fixed initialization of %URLs.
#      - Changed format of internal URLs by removing ":" from between type
#        and path info.
#      - Used a configurable word to check search items.
#      - Added check of CSO servers.
#      - Adjusted regular expression used to check success/failure of
#        a link.
#      - Documented go4check's operation.
#
#   12-Jul-94, GAT, v1.0b1
#      - Used pseudo URLs internally for storing links so they are not
#        checked more than once.
#      - Added support for most types of links, including telnet, binary
#        files, and searches.
#      - Used gopher+ protocol whenever possible to avoid retrieving 
#        entire files.
#
#   09-Jun-94, GAT, v1.0a
#      - First version of go4check. Checks only files and directories.
#
#-------------------------------------------------------------------------------


# Specify where perl can find include files.
push(@INC, "/usr/local/lib/perl");


# Define initial values for selected variables.
$| = 1;					# flush after every print?
$default_path2 = "helo";		# for searching type 2 items
$default_search_term = "cancer";	# for searching type 7 items
@excluded_paths = (			# paths to exclude
	"1/tjgopher/changes",
	"1/tju/atrium/",
	"1/tju/nutrition/atrium",
	"1/tju/jeffnews/current",
	"1/tju/jeffnews/backissues/",
	"1/tju/marketing/jeffnews",
	"1/gophers/bylocation/tju",
	"1/gophers/bylocation/philly");
$Indent = "  ";				# indentation at each level
$snooze_length = 3;			# time to snooze before connect
$timeout = 180;				# max len of connect (seconds)
%URLs = ();				# array of URL's on server


# Check for options.
$DEBUG = 0;				# default to no debug
if ($ARGV[0] eq '-d') {
	shift;
	$DEBUG = 1;
}


# Parse commandline args and provide help as needed.
$inithost = shift || "";		# name of host to check
$initport = shift || 70;		# port number
$initpath = shift || "";		# initial directory
if ($inithost eq "" || $inithost eq "-?") {
	print "$0 checks links in a gopher by probing connections\n\n";
	print "Usage:  $0 [-d] host [port] [\"path\"]\n";
	print "        unless specified, port defaults to 70 and path to \"\".\n";
	print "        -d is used for debugging.\n";
	exit(9);
}


# Set up subroutines to catch some alarms.
$SIG{'ALRM'} = handle_Timeout;


# Establish connection and check links.
require 'sys/socket.ph';
chop($thishost = `hostname`);		# needed for tcpconnect
&check_Links($inithost, $initport, $initpath);
exit(0);


########################################################################
#  check_Links - checks links for a given directory.                   #
#                                                                      #
#  Notes:                                                              #
#      - Links on the same host will be followed unless they point to  #
#        the root.  While this will prevent most recursion, there may  #
#        be some gophers with odd setups that lead to infinite loops.  #
#      - FTP links are not followed.                                   #
#  Entry:                                                              #
#        host = hostname                                               #
#        port = port number                                            #
#        path = selector string                                        #
#  Exit:                                                               #
#        New links are appended to @URLs.                              #
########################################################################
sub check_Links {
	local($host, $port, $path) = @_;
	local($margin) = $Indent . $margin;
	local($stat);
	local(@Items);


	# Establish connection and read contents.
	$DEBUG && print "DEBUG: connecting to $host at port $port.\n";
	($GOPHER) = &tcpconnect($host, $thishost);
	if ($@ && $@ =~ /Timed Out/) {
		die "$@";
	}
	($GOPHER) || die "Can't connect";
	$DEBUG && print "DEBUG: sending path \"$path\".\n";
	eval {
		alarm($timeout);
		send($GOPHER, "$path\r\n", 0);
		@Items = <$GOPHER>;
		close($GOPHER);
		alarm(0);
	};
	if ($@ && $@ =~ /Timed Out/) {
		die "$@";
	}


	# Check each item, recursing into directories as necessary.
	foreach (@Items) {
		local($atype, $aname, $apath, $ahost, $aport, $aextra);

		s/\s*$//;		# remove \r\n combo
		last if (/^\.$/);	# done if line is just a period


		# Check status of each unique URL.
		$url = &make_URL($_);
		s/^(.)// && ($atype = $1);
		($aname, $apath, $ahost, $aport, $aextra) = split(/\t/, $_);
		chop($ahost) if ($ahost =~ /\.$/);
		if (defined($URLs{$url})) {	# already checked
			print "$margin$aname...$URLs{$url}.\n";
		}
		else {
			$stat = ($URLs{$url} = &test_URL($url, $aextra));
			print "$margin$aname...$stat.\n";
		}


		# Recurse as necessary.
		if ($stat eq "ok" && 
				$atype == 1 && 
				$ahost eq $inithost && 
				$aport eq $initport && 
				$apath ne "" && 
				&is_Excluded($apath) == 0 &&
				$apath !~ /ftp.*:/) {
			&check_Links($ahost, $aport, $apath);
		}
	}
}


################################################
#  make_URL - constructs a URL from a string.  #
#                                              #
#  Notes:                                      #
#      - The URLs generated here are not 100%  #
#        kosher, only used internally.         #
#                                              #
#  Entry:                                      #
#        string as passed by gopher server.    #
#  Exit:                                       #
#        string representing URL.              #
################################################
sub make_URL {
	local($_) = @_;
	local($url);
	local($type, $name, $path, $host, $port);


	s/^(.)// && ($type = $1);
	($name, $path, $host, $port) = split(/\t/, $_);
	chop($host) if ($host =~ /\.$/);
	if ($type =~ /[01245679sgMhIi]/) {
		$url = "gopher://$host:$port/$type$path";
	}
	elsif ($type =~ /[8T]/) {
		$url = "telnet://";
		$path !~ /^$/ && $url .= "$path@";
		$url .= $host;
		$port > 0 && $url .= ":$port";
		$url .= "/";
	}
	return($url);
}


###########################################################################
#  test_URL - check that a URL is accessible.                             #
#                                                                         #
#  Notes:                                                                 #
#      - I don't have a good way to check gopher0 servers.  Currently, I  #
#        look for the string "error.host", which servers like gn seem to  #
#        generate.  However, this fails with KA9Q, for which an error     #
#        message is indistinguishable from regular text.                  #
#      - For gopher+, a error code indicating a server is too busy is     #
#        treated as an error.  This may not be the right thing to do.     #
#      - If the server understands gopher+, we'll only ask for info (!)   #
#        so as not to retrieve large files.  This approach also seems to  #
#        be the only way to check ASK blocks reliably.                    #
#      - CSO nameservers (type 2) are checked with an invalid command -   #
#        this returns a warning message from the server that is not       #
#        regarded as an error by go4check. Using the command "fields"     #
#        does *not* work since this typically results in lines starting   #
#        with -2, which look like errors.                                 #
#      - Checks of telnet links only see if host is up; no attempt        #
#        is made to login to whatever account may be specified.           #
#      - Checks of FTP links could be improved.  Currently, the info      #
#        returned is not examined beyond looking for the usual signs      #
#        of failure.                                                      #
#  Entry:                                                                 #
#        URL = URL to test                                                #
#        GPLUS = extra character indicating a gopher+ item.               #
#  Exit:                                                                  #
#        Text string indicating status of URL:                            #
#           "ok" = everything ok                                          #
#           "can't connect" = can't connect to host                       #
#           "path changed" = path changed                                 #
#           "n/a" = unknown status                                        #
###########################################################################
sub test_URL {
	local($_, $gplus) = @_;
	local($protocol, $logonid, $host, $port, $type, $path);
	local($1, $2, $3, $4, $5);


	$DEBUG && print "DEBUG: checking $_.\n";
	m#^(\w+)://(.*):(\d+)/?(.?)(.*)#;
	$protocol = $1;
	$host = $2;
	$port = $3;
	$type = $4;
	$path = $5;
	if ($host =~ /@/) {
		($logonid, $host) = split(/@/, $host);
	}
	$DEBUG && print "protocol=$protocol; logonid=$logonid; host=$host; port=$port; type=$type; path=$path.\n";


	# Check gopher links.
	if ($protocol eq "gopher") {
		local($GOPHER);
		local($Stuff);

		$DEBUG && print "DEBUG: checking gopher at $host;$port.\n";
		($GOPHER) = &tcpconnect($host, $thishost);
		if ($@ && $@ =~ /Timed Out/) {
			return "timed out";
		}
		($GOPHER) || return "can't connect";
		$path .= "\t!" if ($gplus);	# Modify selector to get only info
		if ($type eq "2") {
			$path = $default_path2 if ($path =~ /^$/);
		}
		elsif ($type eq "7") {
                # Modification Oct. 19/94 by R.D. Cameron to append
                # handle the nonempty $path case:  to test in this
                # case, we send a tab and the search term after the
                # $path.
			if ($path =~ /^$/) {
				$path = $default_search_term;
			}
			else {
				$path = "$path\t$default_search_term";
			}
			$path =~ s#^waissrc:(.*)/.*$#1$1#;
		}
		$DEBUG && print "DEBUG: sending path \"$path\".\n";
		eval {
			alarm($timeout);
			send($GOPHER, "$path\r\n", 0);
			$Stuff = <$GOPHER>;
			close($GOPHER);
			alarm(0);
		};
		if ($@ && $@ =~ /Timed Out/) {
			return "timed out";
		}
		$DEBUG && print "DEBUG: read \"$Stuff\".\n";


		# Test line for signs of errors.
		#
                # Modification Oct. 19/94 by R.D. Cameron to 
                # check for type 3 error returns when a directory
                # listing is expected.  (According to the gopher 
		# protocol, "3" as the first character of a directory
		# entry always indicates error.
		if ((($type eq "1") | ($type eq "7")) &
		    ($Stuff =~ /^3/)) {
                        return("path changed");
		}
		# Test line for other signs of errors.
		elsif ($Stuff =~ /(^\-\-\d)|(\terror.host\t\d+)/) {
			return("path changed");
		}
		else {
			return("ok");
		}
	}


	# Check telnet links.
	if ($protocol eq "telnet") {
		local($TELNET);

		$DEBUG && print "DEBUG: checking telnet at $host;$port.\n";
		($TELNET) = &tcpconnect($host, $thishost);
		if ($@ && $@ =~ /Timed Out/) {
			return "timed out";
		}
		($TELNET) || return "host down";
		return "ok";
		close($TELNET);
	}


	# If we get here, we don't know how to test the link.	
	return("n/a");
}



#######################################################
#  is_Excluded - checks if a path is to be excluded.  #
#                                                     #
#  Entry:                                             #
#        path to be tested.                           #
#  Exit:                                              #
#        0/1 indicating no/yes.                       #
#######################################################
sub is_Excluded {
	local($path) = @_;


	for (@excluded_paths) {
		if (index($path, $_) >= $[) {
			return(1);
		}
	}
	return(0);
}


################################################################
#  This comes from gopherhunt by Paul Lindner.                 #
#                                                              #
#  I've added a line to abort if it can't resolve an address.  #
#  and return 0 if failure rather than die. GAT                #
#                                                              #
#  I also added an alarm to handle time-out conditions. GAT    #
################################################################
sub tcpconnect {                    #Get TCP info in place
   local($host, $hostname) = @_;
   local($name, $aliases, $type, $len);
   local($thisaddr, $thataddr, $this, $that);
   local($sockaddr);
   $sockaddr = 'S n a4 x8';

   ($name,$aliases,$proto) = getprotobyname('tcp');
   ($name,$aliases,$port) = getservbyname($port, 'tcp')
        unless $port =~ /^\d+$/;
   ($name,$aliases,$type,$len,$thisaddr) = gethostbyname($hostname);
   ($name,$aliases,$type,$len,$thataddr) = gethostbyname($host);
   $name || return(0);

   $this = pack($sockaddr, &AF_INET, 0, $thisaddr);
   $that = pack($sockaddr, &AF_INET, $port, $thataddr);

   sleep($snooze_length);

   eval {
	alarm($timeout);
	socket(N, &PF_INET, &SOCK_STREAM, $proto) || return(0);
	bind(N, $this)                            || return(0);
	connect(N, $that)                         || return(0);
	alarm(0);
   };

   return(N);
}


#####################################################
#  handle_Timeout - Die with a specific message.    #
#                                                   #
#  Notes:                                           #
#        - Calls to alarm() should be in an eval    #
#          block.                                   #
#                                                   #
#  Entry:                                           #
#        n/a                                        #
#  Exit:                                            #
#        Message "Timed Out" is returned.           #
#####################################################
sub handle_Timeout { 
	$DEBUG && print "DEBUG: Timed Out.\n";
	die "Timed Out";
}
