CGI/Perl Guide | Learning Center | Forums | Advertise | Login
Site Search: in

  Main Index MAIN
INDEX
Search Posts SEARCH
POSTS
Who's Online WHO'S
ONLINE
Log in LOG
IN

Home: Perl Programming Help: Intermediate:
not pushing into array

 



phil_jackson
New User

Sep 24, 2010, 4:43 AM

Post #1 of 1 (538 views)
not pushing into array Can't Post

Hi, this is my first post here.


Code
#!/usr/bin/perl 

use strict;
use warnings;
use threads;
use threads::shared;
use URI;
use URI::http;
use File::Basename;
use DBI;
use HTML::Parser;
use LWP::Simple;
require LWP::UserAgent;
my $ua = LWP::UserAgent->new;
$ua->timeout(10);
$ua->env_proxy;
$ua->max_redirect(0);

my @urls = ('http://www.actwebdesigns.co.uk', 'http://www.1st4pets.com', 'http://www.stackoverflow.com');
my @threads;

sub resolve_href {
my ($base, $href) = @_;
my $u = URI->new_abs($href, $base);
return $u->canonical;
}
sub redirect_test {
my $url = shift;
my $redirect_limit = 10;
my $y = 0;
my( $response, $responseCode );
while( 1 && $y le $redirect_limit ) {
$response = $ua->get($url);
$responseCode = $response->code;
if( $responseCode == 200 || $responseCode == 301 || $responseCode == 302 ) {
if( $responseCode == 301 || $responseCode == 302 ) {
$url = resolve_href( $url, $response->header('Location') );
}else{
last;
}
}else{
last;
}
$y++;
}
return ($url, $response, $responseCode, $redirect_limit, $y );
}
sub scan {
my $url = shift;
my @hrefs_found;
print "started scanning: $url\n";
my $info = URI::http->new($url);
# if url is not an absolute url
if( ! defined( $info->host ) ) {
print "Invalid URL: $url \n";
}else{
my $host = $info->host;
$host =~ s/^www\.//;
# check to see if url is valid, checks for redirects (max of 10)
my @urlI = redirect_test( $url );
my $content = '';
# checks to see if url did not redirect more than 10 times and that response returned was 200
if( $urlI[4] != $urlI[3] && $urlI[2] == 200 ) {
$content = $urlI[1]->content;
die "get failed: " . $urlI[0] if ( ! defined $content );
}
# sticks all hrefs on a page in an array
my @pageLinksArray = ( $content =~ m/href=["']([^"']*)["']/g );
# foreach links found
foreach( @pageLinksArray ) {
# make href an absolute url
my $url_found = resolve_href( $urlI[0], $_ );
# check if url looks like a valid url
if( $url_found =~ m/^http:\/\// ) {
my $info = URI::http->new($url_found);
# check to see if url is a valid url
if( ! defined( $info->host ) ) {
print "Invalid URL: $url_found \n";
}else{
my %values_index;
@values_index{@hrefs_found} = ();
my %values_index2;
@values_index2{@urls} = ();
# if url is not already been found
if( ! exists $values_index{$url_found} && ! exists $values_index2{$url_found} ) {
# add to arrays
if ( ! push( @hrefs_found, $url_found ) ) {
die "could not push \n";
}
if( ! push( @urls, $url_found ) ) {
die "Could not push\n";
}
}
}
}
}
print "$url found " . scalar @hrefs_found . "\n";

}
return $url;
}

print "Starting main program\n";

while ( @urls ) {
my $url = shift ( @urls );
print "$url\n";
my $t = threads->new(\&scan, $url);
push(@threads,$t);
my $n = 0;
while( 1 ) {
if( scalar @urls == 1 ) {
sleep 10;
}else{
last;
}
if( $n >= 1 ) {
print "IN ARRAY URLS:\n\n";
print @urls;
print "\n\n";
die "Process taking too long.";
last;
}
$n++;
}
}
while (@threads) {
my $url_thread = shift(@threads)->join;
}

is my code. Im testing out threads (as I've never used before and want to get to grips with it. The problem being that it would only go through the 3 startup entries in the @urls array. there are many being pushed into the array but they do apear to be in there... if that makes any sense. Any help is much appreciated.

 
 


Search for (options) Powered by Gossamer Forum v.1.2.0

Web Applications & Managed Hosting Powered by Gossamer Threads
Visit our Mailing List Archives