#!/usr/bin/perl -w
#
# Maps the members of each clique onto genome-wide position bins.
#
# Inputs
#   * gold_standard_<type>_cutoff0p0<N>_051809_1.txt files in $gold_dir:
#     tab-separated, 29 columns.  Column 0 is the measurement type, column 2
#     the name, column 5 the chromosome and column 6 the position.  The
#     cutoff <N> is taken from the file name.
#   * complete_graphs_<type>s_070309_3.txt files in the current directory:
#     tab-separated, 7 columns.  Column 0 is the measurement type, column 1
#     the clique and column 4 the member name.
#
# Outputs
#   * cliques_mapped_bin_counts_072109_1.txt and .xls (identical content):
#     one row per distinct (type, clique, cutoff, chromosome, position), with
#     the position converted to a genome-wide coordinate and a 1-based bin.
#
# Header rows (first column "type_of_measurement") are echoed to STDOUT and
# otherwise skipped.  Any malformed line aborts the run.

use strict;
use warnings;

# Width of one genomic bin, in the same units as the chromosome lengths.
my $binSize = 20000;

# Hard-coded lengths of chromosomes 1..16.
my %chromosome_length = (
    1  => 230208,
    2  => 813178,
    3  => 316616,
    4  => 1531916,
    5  => 576869,
    6  => 270148,
    7  => 1090946,
    8  => 562642,
    9  => 439885,
    10 => 745666,
    11 => 666454,
    12 => 1078174,
    13 => 924429,
    14 => 784331,
    15 => 1091287,
    16 => 948062,
);

# Returns the entries of directory $dir whose names match $pattern, in the
# order readdir reports them.  Dies if the directory cannot be opened.
sub list_matching_files {
    my ($dir, $pattern) = @_;

    opendir(my $dh, $dir) or die "can't open directory $dir: $!\n";
    my @names = grep { $_ =~ $pattern } readdir $dh;
    closedir $dh;

    return @names;
}

# Validates one input line and splits it on tabs.
# Returns an array ref of exactly $field_count fields, or nothing for a line
# that contains no non-whitespace character.  Dies on a carriage return or on
# a wrong field count.
sub parse_row {
    my ($line, $field_count) = @_;

    chomp $line;
    die "possible new line problem\n" if $line =~ /\r/;
    return unless $line =~ /\S/;

    my @fields = split /\t/, $line;
    unless (@fields == $field_count) {
        die "problem splitting this line into $field_count elements ($#fields):\n$line\n";
    }

    return \@fields;
}

# $chromosome_add{$c} is the summed length of all chromosomes before $c, so
# that position + offset is a coordinate on the concatenated genome.
my %chromosome_add;
my $genomeLength = 0;
for my $chromosome (1 .. 16) {
    $chromosome_add{$chromosome} = $genomeLength;
    $genomeLength += $chromosome_length{$chromosome};
}
print "the length of the entire genome is $genomeLength\n";

# Bins are numbered from 1; bin k covers coordinates (k-1)*binSize+1 .. k*binSize.
my $highestBin = int(($genomeLength - 1) / $binSize) + 1;
print "the highest bin number is $highestBin\n";

# ---------------------------------------------------------------------------
# Pass 1: gold-standard files -> positions per (type, name, cutoff, chromosome)
# ---------------------------------------------------------------------------
my $gold_dir   = "/Users/eric/post_disaster_mass_spec/consolidate_results/";
my @gold_files = map { $gold_dir . $_ }
    list_matching_files($gold_dir, qr/^gold_standard_\S+051809_1\.txt$/);

my (@types, %seen_type, @cutoffs, %seen_cutoff);

# $gold_positions{type}{name}{cutoff}{chromosome} = [ position, ... ]
my %gold_positions;

for my $file (@gold_files) {
    $file =~ /gold_standard_(\S+)_cutoff0p0(\d)_051809_1\.txt$/
        or die "unexpected file name: $file\n";
    my ($file_type, $cutoff) = ($1, $2);

    push @types,   $file_type unless $seen_type{$file_type}++;
    push @cutoffs, $cutoff    unless $seen_cutoff{$cutoff}++;

    open(my $in, '<', $file) or die "can't find $file: $!\n";
    while (my $line = <$in>) {
        my $fields = parse_row($line, 29) or next;

        if ($line =~ /^type_of_measurement\t/) {
            print "$fields->[0]\t$fields->[2]\t$fields->[5]\t$fields->[6]\n";
            next;
        }

        # The row's own type column is used as the key, not the file-name type.
        my ($type, $normalName, $chromosome, $position) = @{$fields}[0, 2, 5, 6];
        push @{ $gold_positions{$type}{$normalName}{$cutoff}{$chromosome} }, $position;
    }
    close $in;
}

@types   = sort { lc($a) cmp lc($b) } @types;
@cutoffs = sort { $a <=> $b } @cutoffs;

# ---------------------------------------------------------------------------
# Pass 2: clique files -> distinct positions per (type, clique, cutoff, chromosome)
# ---------------------------------------------------------------------------
my @clique_files = list_matching_files("./", qr/^complete_graphs_(\S+)s_070309_3\.txt$/);
print "$_\n" for @clique_files;

my %type_cliques;       # $type_cliques{type} = [ clique, ... ] in first-seen order
my %type_clique_seen;   # guards against listing a clique twice

# $clique_positions{type}{clique}{cutoff}{chromosome}{position} = 1
# A hash is used as a set so that a position shared by several clique members
# is reported once.  (The earlier version also tallied how many times each
# position occurred but never wrote the tally; the output columns are kept
# as they were, so no tally is kept here.)
my %clique_positions;

for my $file (@clique_files) {
    open(my $in, '<', $file) or die "can't find $file: $!\n";
    while (my $line = <$in>) {
        my $fields = parse_row($line, 7) or next;

        if ($line =~ /^type_of_measurement\t/) {
            print "$fields->[0]\t$fields->[1]\t$fields->[4]\n";
            next;
        }

        my ($type, $clique, $normalName) = @{$fields}[0, 1, 4];
        push @{ $type_cliques{$type} }, $clique
            unless $type_clique_seen{$type}{$clique}++;

        for my $cutoff (@cutoffs) {
            # Members with no gold-standard entry at this cutoff contribute nothing.
            my $by_chromosome = $gold_positions{$type}{$normalName}{$cutoff} or next;

            for my $chromosome (keys %$by_chromosome) {
                for my $position (@{ $by_chromosome->{$chromosome} }) {
                    $clique_positions{$type}{$clique}{$cutoff}{$chromosome}{$position} = 1;
                }
            }
        }
    }
    close $in;
}

# ---------------------------------------------------------------------------
# Output: the same rows go to both the .txt and the .xls file
# ---------------------------------------------------------------------------
my @outputs;
for my $name ("cliques_mapped_bin_counts_072109_1.txt",
              "cliques_mapped_bin_counts_072109_1.xls") {
    open(my $out, '>', $name) or die "can't write $name: $!\n";
    push @outputs, [ $name, $out ];
}

my $header = join("\t",
    "type_of_measurement", "clique", "cutoff_percent", "chromosome",
    "position", "genomic_position", $binSize . "_bin", "highest_position",
    "highest_bin_number_starting_with_bin_number_1") . "\n";
print { $_->[1] } $header for @outputs;

# NOTE(review): @types comes from the gold-standard file names, while the
# lookup tables are keyed by the type found in column 0 of the rows.  This
# assumes the two agree -- confirm against the data.
for my $type (@types) {
    # A type may have gold-standard data but no cliques at all.
    my @cliques = sort { $a <=> $b } @{ $type_cliques{$type} || [] };

    for my $clique (@cliques) {
        for my $cutoff (@cutoffs) {
            my $by_chromosome = $clique_positions{$type}{$clique}{$cutoff} or next;

            for my $chromosome (sort { $a <=> $b } keys %$by_chromosome) {
                # Without an offset the position would silently land inside
                # chromosome 1's coordinate range, so refuse to continue.
                my $offset = $chromosome_add{$chromosome};
                defined $offset
                    or die "no genomic offset known for chromosome $chromosome\n";

                for my $position (sort { $a <=> $b } keys %{ $by_chromosome->{$chromosome} }) {
                    my $genomicPosition = $position + $offset;
                    my $bin = int(($genomicPosition - 1) / $binSize) + 1;

                    my $row = join("\t",
                        $type, $clique, $cutoff, $chromosome, $position,
                        $genomicPosition, $bin, $genomeLength, $highestBin) . "\n";
                    print { $_->[1] } $row for @outputs;
                }
            }
        }
    }
}

# Buffered write errors only surface at close, so check it.
for my $output (@outputs) {
    my ($name, $out) = @$output;
    close $out or die "error closing $name: $!\n";
}

print "done!\n";