
phil_jackson
New User
Sep 24, 2010, 4:43 AM
Post #1 of 1
(475 views)
|
|
not pushing into array
|
Can't Post
|
|
Hi, this is my first post here.
#!/usr/bin/perl use strict; use warnings; use threads; use threads::shared; use URI; use URI::http; use File::Basename; use DBI; use HTML::Parser; use LWP::Simple; require LWP::UserAgent; my $ua = LWP::UserAgent->new; $ua->timeout(10); $ua->env_proxy; $ua->max_redirect(0); my @urls = ('http://www.actwebdesigns.co.uk', 'http://www.1st4pets.com', 'http://www.stackoverflow.com'); my @threads; sub resolve_href { my ($base, $href) = @_; my $u = URI->new_abs($href, $base); return $u->canonical; } sub redirect_test { my $url = shift; my $redirect_limit = 10; my $y = 0; my( $response, $responseCode ); while( 1 && $y le $redirect_limit ) { $response = $ua->get($url); $responseCode = $response->code; if( $responseCode == 200 || $responseCode == 301 || $responseCode == 302 ) { if( $responseCode == 301 || $responseCode == 302 ) { $url = resolve_href( $url, $response->header('Location') ); }else{ last; } }else{ last; } $y++; } return ($url, $response, $responseCode, $redirect_limit, $y ); } sub scan { my $url = shift; my @hrefs_found; print "started scanning: $url\n"; my $info = URI::http->new($url); # if url is not an absolute url if( ! defined( $info->host ) ) { print "Invalid URL: $url \n"; }else{ my $host = $info->host; $host =~ s/^www\.//; # check to see if url is valid, checks for redirects (max of 10) my @urlI = redirect_test( $url ); my $content = ''; # checks to see if url did not redirect more than 10 times and that response returned was 200 if( $urlI[4] != $urlI[3] && $urlI[2] == 200 ) { $content = $urlI[1]->content; die "get failed: " . $urlI[0] if ( ! defined $content ); } # sticks all hrefs on a page in an array my @pageLinksArray = ( $content =~ m/href=["']([^"']*)["']/g ); # foreach links found foreach( @pageLinksArray ) { # make href an absolute url my $url_found = resolve_href( $urlI[0], $_ ); # check if url looks like a valid url if( $url_found =~ m/^http:\/\// ) { my $info = URI::http->new($url_found); # check to see if url is a valid url if( ! defined( $info->host ) ) { print "Invalid URL: $url_found \n"; }else{ my %values_index; @values_index{@hrefs_found} = (); my %values_index2; @values_index2{@urls} = (); # if url is not already been found if( ! exists $values_index{$url_found} && ! exists $values_index2{$url_found} ) { # add to arrays if ( ! push( @hrefs_found, $url_found ) ) { die "could not push \n"; } if( ! push( @urls, $url_found ) ) { die "Could not push\n"; } } } } } print "$url found " . scalar @hrefs_found . "\n"; } return $url; } print "Starting main program\n"; while ( @urls ) { my $url = shift ( @urls ); print "$url\n"; my $t = threads->new(\&scan, $url); push(@threads,$t); my $n = 0; while( 1 ) { if( scalar @urls == 1 ) { sleep 10; }else{ last; } if( $n >= 1 ) { print "IN ARRAY URLS:\n\n"; print @urls; print "\n\n"; die "Process taking too long."; last; } $n++; } } while (@threads) { my $url_thread = shift(@threads)->join; } is my code. Im testing out threads (as I've never used before and want to get to grips with it. The problem being that it would only go through the 3 startup entries in the @urls array. there are many being pushed into the array but they do apear to be in there... if that makes any sense. Any help is much appreciated.
|