CGI/Perl Guide | Learning Center | Forums | Advertise | Login
Site Search: in

  Main Index MAIN
INDEX
Search Posts SEARCH
POSTS
Who's Online WHO'S
ONLINE
Log in LOG
IN

Home: Perl Programming Help: Beginner:
Running into really weird problem

 



kotoroshinoto
New User

Dec 22, 2010, 9:40 AM

Post #1 of 1 (276 views)
Running into really weird problem Can't Post

I'm trying to parse an html output file from a program called mirdeep. I have gotten the script to the point where the data has been extracted into an array of arrays and I am trying to extract the specific information that I want from the appropriate columns. For some odd reason, I keep getting output that shouldn't be possible.

Bascially it should be pulling the counts from the mirbase miRNAs that were detected by mirdeep AND and the undetected ones, but leave the stat table and novel section alone.

However, the counts currently being pulled for the undetected section are strangely getting stored incorrectly. (Because of the way they were output, Some of the so called "star" and "mature" data were put into separate lines instead of being in the same line, so I am trying to recombine them. I'm fairly certain that an error in the recombination is the source of the problem.

I was checking my progress with print statements and noticed that some of the data that should only have either a mature count or a star count have both, and that the second number seems to be always equal to the mature count of the row below in the html file, which doesnt make sense to me. Can somebody help me track down whats gone wrong? This is my first time using packages for object orientation extensively, so there could be any kind of error in here.

code is below:

Code
#!/usr/bin/perl 
use strict;
use Switch;
use HTML::Entities;
use FileHandle;

my $usage;
$usage = "Usage:\n";
$usage .= "get_mirbase_fromhtml.pl batchfile\n";
$usage .= "batch file line format: sample_label (tab) path_to_file\n";
our $batchfile = shift or die $usage;
our %paths; #store tag=>file here
our %parsed_Files
; #tag=>result of parsing goes here; Contains 2Dim Arrays of string
our %data; #tag=>desired data for output;
our %mirnas
; #master list of mirnas, each file will have a list with counts in it, this one is just name=>1 to keep record

#read batch file
readbatch();

#execute parse on html files
do_parsing();

#get desired data from parsed tables
use_data();

#do any transformations/replacements (blanks with zeroes, etc.)

#output data

exit 0;

sub use_data {
my $file;
my $data;
my $df;
my $i;
my ( $predmirb_t, $undetmirb_t );

foreach ( sort( keys(%paths) ) ) {
my $f=$_;
$file = $parsed_Files{$_};
$df = DataFormat->new();
$predmirb_t = $file->{predmirb}->{table};
$undetmirb_t = $file->{undetmirb}->{table};
#print $_,"\n";
#predicted
foreach ( $i = 0 ; $i <= $#$predmirb_t ; ++$i ) {
if ($i) {
$data = miRNA_data->new();
$data->{ID} = $$predmirb_t[$i][0];
$data->{total} = $$predmirb_t[$i][4];
$data->{mature} = $$predmirb_t[$i][5];
$data->{loop} = $$predmirb_t[$i][6];
$data->{star} = $$predmirb_t[$i][7];
$data->{mirbase_name} = $$predmirb_t[$i][9];

#store in format structure
$df->{predmirb}->{ $data->{mirbase_name} . '|' . $data->{ID} } =
$data;
}
}

#undetected
my ($star);
my ( $last1, $last3 );
my $name;
my (%counts);
foreach ( $i = 0 ; $i <= $#$undetmirb_t ; ++$i ) {
if ($i) {
$last1 = substr( $$undetmirb_t[$i][9], -1 );
$last3 = substr( $$undetmirb_t[$i][9], -3 );

if ( $last1 eq '*' || $last3 eq '-3p' ) {
#mirdeep thinks 3p is star
#print "This is a star seq!\n";
$star=1;
}
elsif ( $last3 eq '-5p' || $last1 ne '*' )
{
# normally would have to check for -3p but that was already done
#mirdeep thinks 5p is mature
#print "This is a mature seq!\n";
$star=0;
}
else { # should not be possible to get here
print "Column 1: $$undetmirb_t[$i][0]\tColumn 10: $$undetmirb_t[$i][9]\n";
die( "Something is wrong!! Sequence found doesn't seem to be mature or star\n");
}
if($last1 eq '*'){
$name = substr($$undetmirb_t[$i][9],0,-1);
#print $$undetmirb_t[$i][9],"\t",$name,"\n";
} elsif($last3 eq '-5p'||$last3 eq '-3p'){
$name = substr($$undetmirb_t[$i][9],0,-3);
#print $$undetmirb_t[$i][9],"\t",$name,"\n";
} else {
$name =$$undetmirb_t[$i][9];
}
print $name,"\n";
#determine if precursor is already marked in hash, if it is, get reference, if not, make new one and store it
# if (!defined($df->{undetmirb}->{$$undetmirb_t[$i][9]. '|'. $$undetmirb_t[$i][0] }) )
if (!defined($df->{undetmirb}->{$name}) )
{
$data = miRNA_data->new();
$data->{ID} = $$undetmirb_t[$i][0];
$data->{mirbase_name} = $$undetmirb_t[$i][9];
$data->{short_name} = $name;
$data->{total} = $$undetmirb_t[$i][4];
$data->{loop} = '';
if($star){
$data->{star} = $$undetmirb_t[$i][5];
$data->{mature} = '';
} else {
$data->{mature} = $$undetmirb_t[$i][5];
$data->{star} = '';
}
# $df->{undetmirb} ->{ $data->{mirbase_name} . '|' . $data->{ID} } = $data;
$df->{undetmirb} ->{$name } = $data;
$counts{$name}=0;
}
++$counts{$name};
if($counts{$name}>2) {
print $f,"\t",$$undetmirb_t[$i][0],"\n";
print "Name used: $name\n";
die "Same miRNA detected more than 2 times! (star & mature)\n";
}
if($counts{name}==2){
print "Duplicate Detected! \t",$f,"\t",$$undetmirb_t[$i][0],"\n";
}
#determine if line represents star or mature & store count in appropriate place
if($star == 1){
$data->{star} = $$undetmirb_t[$i][5];
} elsif($star == 0) {
$data->{mature} = $$undetmirb_t[$i][5];
}
}
}

#undetected followup: Calc totals
foreach(sort(keys(%{$df->{undetmirb}}))){
my $d=$df->{undetmirb}->{$_};
print $d->{mirbase_name}."\t".$d->{short_name}."\t".$d->{mature}."\t".$d->{star}."\t".$d->{total}."\n";
}
#consolidate Hashes together

%data->{$_} = $df; #store constructed data set
}
}



sub readbatch {
my $f = FileHandle->new( '<' . $batchfile );
$f or die "File $f did not open!\n";
my @lines = <$f>;
chomp(@lines);
my @splt;
foreach (@lines) {
@splt = split( /\t/, $_ );
if ( $#splt + 1 != 2 ) { die "Bad Batch File Format!\n"; }
$paths{ $splt[0] } = $splt[1];
}
undef $f;
}

sub do_parsing {
my $p;
foreach ( sort( keys(%paths) ) ) {
print "Label: $_ File: $paths{$_}\n";
$p = new MirDeepParse;
$p->parse_file( $paths{$_} );
$parsed_Files{$_} = $p->getResult();
}
}

sub printTable {
my $table = shift or die "No table given to printTable!\n";
foreach (@$table) {
foreach (@$_) {
print $_, "\t";
}
print "\n";
}
}

package miRNA_data;

sub new {
my $class = shift;
my $self = {
ID => '', #tag ID / provisional ID/precursor name (for undetected seqs)
mirbase_name => '', #name of miRNA
short_name=>'', #mirbase name with */-3p/-5p removed
total => 0,
mature => 0,
loop => 0,
star => 0,
duplicates => 0 #bool will be 1 if duplicates are found

};
bless $self, $class;
return $self;
}

package DataFormat;

sub new {
my $class = shift;
my $self = {
label => shift,
predmirb => {}
, #will store miRNA_data entries from predicted mirbase set
undetmirb => {}
, #will store miRNA_data entries from undetected mirbase set
mirb =>
{} #This will be the consolidation point keys constructed from mirbase_name|ID
};
bless( $self, $class ); #turns var into an object
}

# define the subclass
package MirDeepParse;
use base "HTML::Parser";
our $retval;
our $row;
our $col;
our $cell_content;
our $append_to_cell;
our $inside_table;
our $outside_row;
our $outside_row_started;

#arrays of data
#state flags (for position in tables)
#states:
#'' -- initial state; Vars need initializing;
#0.33--waiting for marker text;
#0.67 -- found marker text for statistics table; waiting for table tags
#1 -- in stat table
#1.33 -- Table tags indicate end of table; waiting for marker text
#1.67 -- found marker for novel; waiting for table tags;
#2 -- in novel table
#2.33 -- Table tags indicate end of table; waiting for marker text
#2.67 -- found marker for predicted mirbase; waiting for table tags;
#3 -- in predicted mirbase table
#3.33 -- Table tags indicate end of table; waiting for marker text
#3.67 -- found marker for undetected mirbase; waiting for table tags;
#4 -- in undetected mirbase table
#4.33 -- Table tags indicate end of table; waiting for EOF

#column id depends on table:
# stat table columns:
#title row: (blank), novel miRNAs[3], known miRBase miRNAs[2], (blank), blank
#rest: miRDeep2 score, predicted by miRDeep2, estimated false positives, estimated true positives, in species, in data, detected by miRDeep2, estimated signal-to-noise, excision gearing
# novel & predicted mirbase tables columns
# provisional id, miRDeep2 score, estimated probability that the miRNA candidate is a true positive, rfam alert, total read count, mature read count, loop read count, star read count, significant randfold p-value, miRBase miRNA, example miRBase miRNA with the same seed, UCSC browser, NCBI blastn, consensus mature sequence, consensus star sequence, consensus precursor sequence
# undetected table columns
# miRBase precursor id, -, -, -, total read count, mature read count(s), -, star read count, remaining reads, -, -, UCSC browser, NCBI blastn, miRBase mature sequence(s), miRBase star sequence(s), miRBase precursor sequence
sub new {
my $class = shift;
my $self = $class->SUPER::new(@_);
$self->reset_vars(@_);

#our @stats;
#our @novel;
#our @predicted;
#our @undetected;
return $self;
}

sub getResult {
my $self = shift;
my $t = $retval;
undef $retval;
return $t;
}

sub reset_vars {
my $self = shift;
$self->{state} = 0.33;
$retval = mirdeep_html->new();
$row = 0;
$col = 0;
$cell_content = '';
$append_to_cell = 0;
$outside_row = 1;
$outside_row_started = 0;
$inside_table = 0;
}

sub getState {
my $self = shift;
return $self->{state};
}

sub text {
my ( $self, $text ) = @_;

# just print out the original text
switch ( $self->{state} ) {

#print "Current State: $state\n";
#print "$text\n";
case (0) { die("Parsing Attempted before Initializing vars\n"); }

case (0.33) {
if ( $text eq
"Survey of miRDeep2 performance for score cut-offs -10 to 10" )
{

#print "STAT TABLE MARKER FOUND!\n";
$self->{state} = 0.67;
return;
}
}
case (0.67) { }
case (1) { $self->checktext( $retval->{stats}, $text ); }
case (1.33) {
if ( $text eq "novel miRNAs predicted by miRDeep2" ) {

#print "NOVEL TABLE MARKER FOUND!\n";
$self->{state} = 1.67;
return;
}
}
case (1.67) { }
case (2) { $self->checktext( $retval->{novel}, $text ); }
case (2.33) {
if ( $text eq "miRBase miRNAs in dataset" ) {

#print "PREDICTED MIRBASE TABLE MARKER FOUND!\n";
$self->{state} = 2.67;
return;
}

}
case (2.67) { }
case (3) { $self->checktext( $retval->{predmirb}, $text ); }
case (3.33) {
if ( $text eq "miRBase miRNAs not detected by miRDeep2" ) {

#print "UNDETECTED MIRBASE TABLE MARKER FOUND!\n";
$self->{state} = 3.67;
return;
}

}
case (3.67) { }
case (4) { $self->checktext( $retval->{undetmirb}, $text ); }
case (4.33) { }
else { die "Unrecognized state encountered!\n"; }
}

#print $text;
}

sub checktext {
my $self = shift;
my $curtable = shift;
my $text = shift;
if ($append_to_cell) {
$cell_content .= $text;

#print $text;
}
}

sub comment {
my ( $self, $comment ) = @_;

#Nothing to do, html comments not important here
}

sub start {
my ( $self, $tag, $attr, $attrseq, $origtext ) = @_;

# print out original text
#print $origtext;
switch ( $self->{state} ) {
case (0) { die("Parsing Attempted before Initializing vars\n"); }
case (0.33) { }
case (0.67) {
if ( $tag eq 'table' ) {
$self->{state} = 1;
$inside_table = 1;
return;
}
}
case (1) { $self->checktagStart( $retval->{stats}, $tag ); }
case (1.33) { }
case (1.67) {
if ( $tag eq 'table' ) {
$self->{state} = 2;
$inside_table = 1;
return;
}
}
case (2) { $self->checktagStart( $retval->{novel}, $tag ); }
case (2.33) { }
case (2.67) {
if ( $tag eq 'table' ) {
$self->{state} = 3;
$inside_table = 1;
return;
}
}
case (3) { $self->checktagStart( $retval->{predmirb}, $tag ); }
case (3.33) { }
case (3.67) {
if ( $tag eq 'table' ) {
$self->{state} = 4;
$inside_table = 1;
return;
}
}
case (4) { $self->checktagStart( $retval->{undetmirb}, $tag ); }
case (4.33) { }
else { die "Unrecognized state encountered!\n"; }
}
}

sub end {
my ( $self, $tag, $origtext ) = @_;

# print out original text
#print $origtext;
switch ( $self->{state} ) {
case (0) { die("Parsing Attempted before Initializing vars\n"); }
case (0.33) { }
case (0.67) { }
case (1) { $self->checktagEnd( $retval->{stats}, 1.33, $tag ); }
case (1.33) { }
case (1.67) { }
case (2) { $self->checktagEnd( $retval->{novel}, 2.33, $tag ); }
case (2.33) { }
case (2.67) { }
case (3) { $self->checktagEnd( $retval->{predmirb}, 3.33, $tag ); }
case (3.33) { }
case (3.67) { }
case (4) { $self->checktagEnd( $retval->{undetmirb}, 4.33, $tag ); }
case (4.33) { }
else { die "Unrecognized state encountered!\n"; }
}
}

sub checktagStart {
my $self = shift;
my $curtable = shift;
my $tag = shift;
if ( $tag eq 'tr' ) {
$outside_row = 0;
$outside_row_started = 0;

#print "<tr>\n";
++$row;
$curtable->addrow();

#print STDERR "-TR TAG STARTED-\n";
}
elsif ( $inside_table && ( $tag eq 'td' || $tag eq 'th' ) ) {
if ( $outside_row && !$outside_row_started ) {
$outside_row_started = 1;
++$row;
$curtable->addrow();
}
if ($append_to_cell) {
$cell_content = decode($cell_content);

#print "inside a table?: $inside_table\n";
#print "outside row?: $outside_row\n";
#print "outside row started?: $outside_row_started;\n";
$curtable->addcell($cell_content);
$cell_content = getstring();
}

#print "\t<td>";
$append_to_cell = 1;
++$col;

#print STDERR "-TD TAG STARTED-\n";
}
elsif ( $tag eq 'span' ) {
$append_to_cell = 0;
}
}

sub checktagEnd {
my $self = shift;
my $curtable = shift;
my $nextstate = shift;
my $tag = shift;
if ( $tag eq "table" ) {

#print "TABLE TAG END!\n";
$self->{state} = $nextstate;
$inside_table = 0;
$outside_row_started = 0;

#print "State Changed to $nextstate \n";
return;
}
elsif ( $tag eq 'tr' ) {
$outside_row = 1;

#print "</tr>\n";
#print STDERR "-TR TAG ENDED-\n";
}
elsif ( $inside_table && ( $tag eq 'td' || $tag eq 'th' ) ) {

#print "</td>\n";
$append_to_cell = 0;
$cell_content = decode($cell_content);

#print "inside a table?: $inside_table\n";
#print "outside row?: $outside_row\n";
#print "outside row started?: $outside_row_started;\n";

$curtable->addcell($cell_content);

#print STDERR $cell_content,"\n";
$cell_content = getstring();

#print STDERR $cell_content,"\n";
#print STDERR "-TD TAG ENDED-\n";
}
elsif ( $tag eq 'span' ) {
$append_to_cell = 1;
}
}

sub getstring {
my $str = '';
return $str;
}

sub decode {
my $str = shift;
$str =~ s/&#([0123456789]+)/getstrequiv($1)/eg;
$str = HTML::Entities::decode_entities($str);
return $str;
}

sub getstrequiv {
my $num = shift;
switch ($num) {
case (177) {
return
'+/-'; #since the single char is non-ascii, better to replace it
}
case (160) {
return '&nbsp'; #it'll be handled by decode_entities
}
else {
die "unrecognized code: $num!";
}
}
}

package mirdeep_html;

sub new {
my $class = shift;
my $self = {
stats => data_table->new(),
novel => data_table->new(),
predmirb => data_table->new(),
undetmirb => data_table->new()
};
bless $self, $class;
return $self;
}

package data_table;

sub new {
my $class = shift;
my $self = {
table => getarray(),
col => -1,
row => -1
};
bless $self, $class;
return $self;
}

sub getarray {
my @arr;
return \@arr;
}

sub addcell {
if ( $#_ + 1 != 2 ) {
die "Wrong number of arguments to addcell! Need: 2, Given: " . @_
. " \n";
}
my $self = shift;
my $item = shift;
if ( !defined( ${ $self->{table} }[ $self->{row} ] ) ) {
die "Row referenced by index $self->{row} not defined!\n";
}

push @{ $self->{table}[ $self->{row} ] }, $item;
++$self->{col};
}

sub addrow {
my $self = shift;
my @row;
push( @{ $self->{table} }, \@row );
++$self->{row};
$self->{col} = -1;
}

Attachments: example_data.tgz (33.7 KB)

 
 


Search for (options) Powered by Gossamer Forum v.1.2.0

Web Applications & Managed Hosting Powered by Gossamer Threads
Visit our Mailing List Archives