#!/usr/bin/perl -w
# xurl - extract unique, sorted list of links from URL

use HTML::LinkExtor;
use LWP::Simple;

$base_url = shift or die "Usage: $0 URL\n";

# HTML::LinkExtor uses HTML::Parser; passing undef as the callback
# tells it to accumulate links for later retrieval with links().
$parser = HTML::LinkExtor->new(undef, $base_url);
$parser->parse(get($base_url));

# The links() method clears the link list, so you can call it only
# once per parsed document.  It returns a list of array references,
# each with the element (tag) name at the front followed by pairs of
# attribute names and attribute values.
# The HTML code:
#
#   <A HREF="http://www.perl.org">
#   <IMG SRC="images/big.gif" LOWSRC="images/big-lowres.gif">
#   </A>
#
# would result in a data structure like this:
#
#   ( [ a,   href,   "http://www.perl.org" ],
#     [ img, src,    "images/big.gif",
#            lowsrc, "images/big-lowres.gif" ] )
@links = $parser->links;

foreach $linkarray (@links) {
    my @element  = @$linkarray;
    my $elt_type = shift @element;                  # element (tag) name
    while (@element) {
        # pull off the next attribute name and its value
        my ($attr_name, $attr_value) = splice(@element, 0, 2);
        # print "$attr_name -> $attr_value\n";
        $seen{$attr_value}++;
    }
}

foreach (sort keys %seen) { print $_, "\n" }
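
# A minimal alternative sketch, not part of the original recipe: instead of
# passing undef, HTML::LinkExtor->new() also accepts a callback, which is
# invoked with the tag name and attribute/value pairs for each link-bearing
# element as it is parsed, so %seen can be filled without collecting the
# whole link list first.  The subroutine name "collect" is only illustrative.
# When a callback is installed, links() returns an empty list, so the loop
# above would be dropped in that variant.
#
#   sub collect {
#       my ($tag, %attrs) = @_;
#       $seen{$_}++ for values %attrs;
#   }
#   $parser = HTML::LinkExtor->new(\&collect, $base_url);
#   $parser->parse(get($base_url));
#
# Example run (output depends on the page fetched):
#   % xurl http://www.perl.org
#   ... one unique absolute URL per line, sorted ...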