
pjshort42
Novice
Jul 31, 2012, 1:32 AM
Post #6 of 15
(7988 views)
|
Re: [Laurent_R] speeding up clustering program
[In reply to]
|
Can't Post
|
|
I did try that and it ended up just about the same speed, but it is good to start using that style anyways. I think the reason (using the module posted earlier) is that it is the second part of the code (where it takes the output of the quadruple loop and counts matches) that is taking the vast majority of time. here is what I have changed it to as of now:
use List::Util qw(first max maxstr min minstr reduce shuffle sum); my $start = time; print 'Taking some time to read file...', "\n"; open FILE2, "my_file.txt" or die $!; @nodeA = (); @nodeB = (); @contacts = (); while (<FILE2>) { my @columns = split('\t', $_); if (exists $columns[2]) { my $col1 = $columns[0]; my $col2 = $columns[1]; push(@nodeA, $col1); push(@nodeB, $col2); } } %interactions = (); for ($h=0; $h<scalar @nodeA; $h++){ push @{$interactions{$nodeA[$h]}}, $nodeB[$h]; } ##list of unique genes## @combined_list = (@nodeA, @nodeB); %contact_count = (); foreach (@combined_list) { $contact_count{$_}++; } @gene_list = keys %contact_count; #we want to select a particular number of genes and find the number of edges between them# $full_count = 0; $quad_count = 0; $quint_count = 0; my $gene_size = scalar @gene_list -1; for my $i (0..$gene_size){ for my $j ($i+1..$gene_size){ for my $k ($j+1..$gene_size){ for my $l ($k+1..$gene_size){ @to_consider = (); $count = 0; push (@to_consider, $gene_list[$i]); push (@to_consider, $gene_list[$j]); push (@to_consider, $gene_list[$k]); push (@to_consider, $gene_list[$l]); #print $to_consider[0], "\t", $to_consider[1], "\n" for my $o (0..3){ for my $p (0..3){ @grep_add = grep(/$to_consider[$o]/,@{$interactions{$to_consider[$p]}}); $count += scalar @grep_add; @grep_add = (); } } if ($count>2){ $full_count += 1; } if ($count>3){ $quad_count +=1; } if ($count >5){ $quint_count +=1; } $count = 0 } } } } my $end = time; my $elapsed = $end - $start; $triple = $full_count - $quad_count; $real_quad = $quad_count - $quint_count; print "\n", "total time ", $elapsed, " seconds \n"; print 'The total number of cluster groups is ', $full_count, ' with ', $quint_count, ' quintuple groups, ', $real_quad, ' quadruple groups, and ', $triple, ' triple counts.', "\n"; Using the hash and much smaller grep method seems to be working much faster (but still fairly slow!).
|