#!/usr/local/bin/perl
#
# NAME
#  http_integrity.monitor
#
#
# SYNOPSIS
#  http_integrity.monitor [-s] [-u url] [-n num_threads] [-a anchor_tag_types]
#                         [-t link_timeout] [-T page_timeout] host...
#
#
# DESCRIPTION
#  Tries to connect to an HTTP server and verify the integrity of the
#  page and the objects within that page (e.g. to make sure that there
#  are no broken images).
#
#  For use with "mon".
#
#
# EXAMPLES
#  ./http_integrity.monitor -u "/index.html" host1 host2 host3
#
#
# OPTIONS
#  -u  URL path to retrieve from each host.
#
#  -s  Use SSL to connect to the host.
#
#  -n  Maximum number of requests to issue at one time. Defaults to 16.
#      Increasing this number may produce faster load times, and
#      decreasing it slower ones, depending on the performance of the
#      site and the speed of the link. Experiment to find what works
#      best for you.
#
#  -t  Timeout, in seconds, to wait for data when downloading any given
#      link. Must be an integer.
#
#  -T  Issue an error if the time to load the page, plus any associated
#      images/applets/etc., exceeds this number of seconds. Can be a
#      floating point number.
#
#  -a  Types of anchor tag items to retrieve, as a space-separated
#      quoted list. Default is "img applet". Case matters.
#
#
# AUTHOR
#  Andrew Ryan
#  $Id: http_integrity.monitor,v 0.52 2000/04/08 05:18:38 andrewr Exp $
#
#
# COPYRIGHT
#  Copyright (C) 2000, Andrew Ryan
#
#  This program is free software; you can redistribute it and/or modify
#  it under the terms of the GNU General Public License as published by
#  the Free Software Foundation; either version 2 of the License, or
#  (at your option) any later version.
#
#  This program is distributed in the hope that it will be useful,
#  but WITHOUT ANY WARRANTY; without even the implied warranty of
#  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#  GNU General Public License for more details.
#
#  You should have received a copy of the GNU General Public License
#  along with this program; if not, write to the Free Software
#  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
#
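# USAGE WITH MON (illustrative sketch)
#  A hypothetical mon.cf fragment wiring this monitor up. The hostgroup
#  name, interval, and arguments below are assumptions chosen for
#  illustration, not part of this script; a real configuration would
#  also carry period/alert definitions:
#
#    hostgroup webservers host1 host2 host3
#
#    watch webservers
#        service http_integrity
#            interval 5m
#            monitor http_integrity.monitor -u "/index.html" -T 15
#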
use strict;
use English;
use LWP::Parallel::UserAgent;
use LWP::UserAgent;
use HTTP::Request;
use HTML::LinkExtor;
use URI::URL;
use Time::HiRes qw( gettimeofday tv_interval );
use Getopt::Std;

use vars qw/$opt_s $opt_u $opt_n $opt_t $opt_a $opt_T/;
getopts("su:n:t:a:T:");

my $url_path     = $opt_u || "/";   # default URL path to test is "/"
my $max_req      = $opt_n || 16;    # maximum number of requests to issue at one time
my $page_timeout = $opt_T || 10;    # timeout, in seconds, for the whole page
my $item_timeout = $opt_t || int($page_timeout / 2);   # timeout, in seconds, for any given item
$opt_s = $opt_s ? "s" : "";         # scheme suffix: "https" with -s, "http" without
my @retrieve_anchors = $opt_a ? split(' ', $opt_a) : ("img", "applet");

my @failures = ();
my @details  = ();
my ($host, $p, $url, %saw, @addl_links, $base, $res, $req, $entries);
my ($total_time, $t0, $t1);
my ($res_bytes, $total_bytes);

my $ua = LWP::UserAgent->new;
$ua->timeout($item_timeout);        # timeout, in seconds, for the base page

my $pua = LWP::Parallel::UserAgent->new();
$pua->max_req($max_req);
$pua->timeout($item_timeout);       # timeout, in seconds, for any given request

my $exit_status = 0;                # default exit status is OK

foreach $host (@ARGV) {
    my $prior_errors = $exit_status;   # errors seen before this host, so the
                                       # summary below reflects this host only
    # The callback below collects image (and other anchor) links
    @addl_links  = ();
    $total_time  = 0;
    $total_bytes = 0;
    $url = "http$opt_s://${host}${url_path}";

    # Make the parser.  Unfortunately, we don't know the base yet
    # (it might be different from $url).
    $p = HTML::LinkExtor->new(\&callback);

    # Request the document and parse it as it arrives
    $t0 = [Time::HiRes::gettimeofday];
    $res = $ua->request(HTTP::Request->new(GET => $url),
                        sub { $p->parse($_[0]) });
    $t1 = [Time::HiRes::gettimeofday];
    $total_time += Time::HiRes::tv_interval($t0, $t1);

    if ($res->is_error) {
        push(@failures, $host);
        if ($res->code == 408) {
            push(@details, sprintf("ERROR: Timeout [%s seconds] retrieving %s",
                                   $item_timeout, $res->request->url));
        } else {
            push(@details, sprintf("ERROR: %s [%s] retrieving %s",
                                   $res->code, $res->message, $res->request->url));
        }
        $exit_status++;
        next;
    }

    # Expand all image URLs to absolute ones
    $base = $res->base;
    @addl_links = map { url($_, $base)->abs } @addl_links;

    $res_bytes = length($res->as_string);
    $total_bytes += $res_bytes;

    # uniq the array of addl_links
    undef %saw;
    @saw{@addl_links} = ();
    @addl_links = keys %saw;

    foreach (@addl_links) {
        next if /^https/i;          # we don't do https here
        $req = HTTP::Request->new('GET', "$_");
        if ($res = $pua->register($req)) {
            push(@details, sprintf("ERROR: %s", $res->error_as_HTML));
            $exit_status++;
        }
    }

    $t0 = [Time::HiRes::gettimeofday];
    $entries = $pua->wait();        # now retrieve everything
    $t1 = [Time::HiRes::gettimeofday];
    $total_time += Time::HiRes::tv_interval($t0, $t1);

    foreach (keys %$entries) {
        $res = $entries->{$_}->response;
        $res_bytes = length($res->content);
        $total_bytes += $res_bytes;
        next unless $res->is_error;
        $exit_status++;
        if ($res->code == 408) {
            push(@details, sprintf("ERROR: Timeout [%s seconds] retrieving %s",
                                   $item_timeout, $res->request->url));
        } else {
            push(@details, sprintf("ERROR: %s [%s] retrieving %s",
                                   $res->code, $res->message, $res->request->url));
        }
    }

    if ($total_time > $page_timeout) {
        push(@failures, $host);
        push(@details, sprintf("ERROR: $url took %.2f seconds for complete load (>%.1f seconds)",
                               $total_time, $page_timeout));
        next;
    }

    if ($exit_status > $prior_errors) {
        push(@failures, $host);
        push(@details, sprintf("%s total bytes received in %s objects in %.2f seconds (%.2f bytes/sec)",
                               $total_bytes, scalar(@addl_links) + 1,
                               $total_time, $total_bytes / $total_time));
    }
}

if (@failures == 0) {
    exit 0;
}

print "@failures\n";
print join("\n", @details);

exit $exit_status;

# Called by HTML::LinkExtor for every link-carrying tag in the page;
# collects the link attribute values of the tag types we care about.
sub callback {
    my ($tag, %attr) = @_;
    foreach (@retrieve_anchors) {
        push(@addl_links, values %attr) if $tag eq $_;
    }
}
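# OUTPUT (illustrative)
#  On failure, the first line names the failing hosts (the summary line
#  that mon displays) and the following lines carry the details.  A
#  hypothetical run against a host with one broken image might print:
#
#    host2
#    ERROR: 404 [Not Found] retrieving http://host2/images/logo.gif
#
#  The exit status is the number of errors encountered, so mon treats
#  any nonzero exit as a failure of this service.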