#!/usr/local/bin/perl -w

##
# graphsite
#
# Description : generates a graph of links of sites to other sites
#
# Usage       : ./graphsite [-d|--debug] [seed-url] > links.png
#
#               The debug flag may appear before or after the seed URL.
#               When no seed URL is given a built-in default is used.
#
#
# (c)opyright Simon Wistow , 2003
#
# Distributed under the same terms as Perl itself
#
# This software is under no warranty and will probably ruin
# your life, kill your friends, burn your house and bring
# about the apocalypse
#
##

# import a load of stuff
use constant MAX_SITE_DEPTH => 0;    # sites deeper than this are not crawled
use strict;
use GraphViz;
use URI;

# set up some vars
my %seen;     # host => number of times it has come off the queue
my %links;    # host => array ref of Site objects that host links to
my @sites;    # queue of Site objects still to be examined

# Separate the debug flag from the positional arguments. Taking the seed
# with a plain shift first would swallow "-d" as the seed URL when it is
# the only (or the first) argument.
my $DEBUG = 0;
my @args;
foreach my $arg (@ARGV) {
    if ($arg eq '-d' || $arg eq '--debug') {
        $DEBUG = 1;
    }
    else {
        push @args, $arg;
    }
}
my $seed = shift(@args) || "http://thegestalt.org/simon";

# Everything below keys on the host of a URL, so refuse a seed that has
# none instead of failing later inside a method call.
my $seed_url  = URI->new($seed);
my $seed_host = $seed_url->can('host') ? $seed_url->host() : undef;
die "Seed must be an absolute URL with a host : $seed\n"
    unless defined $seed_host && length $seed_host;

# instantiate the thing that's going to draw stuff for us
my $g = GraphViz->new();

print STDERR "Examining :\n" if $DEBUG;

push @sites, Site->new($seed_url);

# loop through each site
while (@sites) {
    my $site = shift @sites;
    my $host = $site->host();

    # if we've seen it or it's too deep then don't bother
    next if $seen{$host}++;
    next if $site->depth() > MAX_SITE_DEPTH;

    print STDERR "\t$host\n" if $DEBUG;

    # add this as a node on the graph
    $g->add_node($host);

    # get all the sites it's linked to
    my @linked_to = $site->linked_to();

    # add this in for later
    $links{$host} = \@linked_to;

    # and then we're going to check those later
    push @sites, @linked_to;
}

print STDERR "\n\nGenerating links :\n" if $DEBUG;

# now examine each node and see what's linked to it
# (sorted so that the debug output and edge order are repeatable)
foreach my $site (sort keys %links) {
    print STDERR "\t$site : " if $DEBUG;

    # and generate an arc for each one; add_edge is called once per stored
    # link, so several links to the same host mean several calls
    foreach my $linked (@{ $links{$site} }) {
        # use a separate lexical: assigning to the loop variable would
        # overwrite the Site object held in %links
        my $link = $linked->host;
        print STDERR " $link" if $DEBUG;
        $g->add_edge($site => $link);
    }
    print STDERR "\n" if $DEBUG;
}

# print it out as a png file
# (binmode so the image bytes are not altered on platforms that
# translate line endings on output)
binmode STDOUT;
print $g->as_png();


package Site;

use constant MAX_PAGE_DEPTH => 0;    # how many same-site hops to follow
use LWP::Simple;
use HTML::SimpleLinkExtor;
use URI;

# new($url [, $depth])
#
# Builds a Site around $url, which must be an object providing host()
# (the callers in this file pass URI objects). $depth defaults to 0.
# Dies if no URL is passed.
sub new {
    my $class = shift;
    my $url   = shift || die "Must pass a site\n";
    my $depth = shift || 0;

    my $self = {
        _url   => $url,
        _depth => $depth,
    };

    return bless $self, $class;
}

# host()
#
# Returns the host part of this site's URL.
sub host {
    my $self = shift;
    return $self->{_url}->host();
}

# depth()
#
# Returns the depth this site was created with.
sub depth {
    my $self = shift;
    return $self->{_depth};
}

# linked_to()
#
# Fetches this site's URL and, up to MAX_PAGE_DEPTH hops, pages on the
# same host that it links to. Returns a list of Site objects, one for
# every <a> link with an "http" scheme whose host differs from this
# site's host. Each returned Site has a depth one greater than this one.
# Pages that cannot be fetched, or come back empty, are skipped silently.
sub linked_to {
    my $self = shift;

    my @sites;
    my %seen;
    my $own_host = $self->host;

    # push the original URL onto the list
    my @urls = ([ $self->{_url}, 0 ]);

    while (@urls) {
        my ($url, $depth) = @{ shift @urls };

        next if $seen{"$url"}++;

        # fetch the url
        my $page = get($url);
        next unless $page;

        # extract the links, giving the extractor the page URL as its base
        my $extor = HTML::SimpleLinkExtor->new($url);
        $extor->parse($page);

        foreach my $found ($extor->a()) {
            # NOTE(review): the extractor is expected to hand back URI
            # objects when given a base; wrap plain strings just in case
            my $link = ref $found ? $found : URI->new($found);

            # skip anything without a scheme as well as non-http schemes
            my $scheme = $link->scheme;
            next unless defined $scheme && $scheme eq "http";

            # an http link with no host cannot be assigned to any site
            my $host = $link->host();
            next unless defined $host && length $host;

            # if they're a new site then push them onto the list
            if ($host ne $own_host) {
                push @sites, Site->new($link, $self->depth + 1);
                next;
            }

            # otherwise, if the depth isn't too great, push them onto urls
            push @urls, [ $link, $depth + 1 ] if ($depth < MAX_PAGE_DEPTH);
        }
    }

    return @sites;
}