#!/usr/bin/perl -w
use strict;
# ---------------------------------------------------------------------------
# File-scoped variables shared by every stage of the script.
# They are declared here, grouped by purpose, and filled in further down.
# ---------------------------------------------------------------------------

# Scratch values reused while scanning directories and reading files.
my ($line, @array, $file, @files, $path);

# Fields of the row (or file name) currently being processed.
my ($type, $cutoff, $normalName, $clique, $chromosome, $position);

# Working lists that are refilled and sorted inside the loops.
my (@chromosomes, @positions, @cliques);

# Distinct measurement types and cutoffs taken from the gold-standard
# file names, plus the flags used to keep each list free of duplicates.
my (@types, %seen_type, @cutoffs, %seen_cutoff);

# Gold-standard lookups, keyed type -> normal name -> cutoff:
# the chromosomes reported, and the positions reported per chromosome.
my (%type_normalName_cutoff_chromosomes,
    %type_normalName_cutoff_chromosome_positions);

# Cliques listed per type, plus the flag hash that keeps that list unique.
my (%type_cliques, %type_clique_seen);

# Per-clique results built from the two lookups above: chromosomes and
# positions found for a clique, their de-duplication flags, and a tally
# of how often each position was encountered.
my (%type_clique_cutoff_chromosomes,
    %type_clique_cutoff_chromosome_seen,
    %type_clique_cutoff_chromosome_positions,
    %type_clique_cutoff_chromosome_position_seen,
    %type_clique_cutoff_chromosome_position_count);

# Genome layout: length of each chromosome, the offset to add to a
# chromosome-local position to obtain a genome-wide position, the total
# genome length, and the binning parameters derived from it.
my (%chromosome_length, %chromosome_add, $genomeLength);
my ($binSize, $highestBin, $bin, $genomicPosition, $count);

# Declared but not referenced by any code visible in this file.
my ($counter, $yName, $outputFile, @normalNames, %type_clique_normalNames);
# ---------------------------------------------------------------------------
# Genome layout.
# Sets the bin width, records the length of chromosomes 1..16, derives the
# offset that turns a chromosome-local position into a genome-wide one, and
# reports the total genome length and the number of the last bin.
# ---------------------------------------------------------------------------

# Width of one bin, in the same units as the chromosome lengths.
$binSize = 20000;

# Length of each chromosome, keyed by chromosome number.
%chromosome_length = (
    1  => 230208,
    2  => 813178,
    3  => 316616,
    4  => 1531916,
    5  => 576869,
    6  => 270148,
    7  => 1090946,
    8  => 562642,
    9  => 439885,
    10 => 745666,
    11 => 666454,
    12 => 1078174,
    13 => 924429,
    14 => 784331,
    15 => 1091287,
    16 => 948062,
);

# Offset of a chromosome = combined length of all chromosomes before it.
$chromosome_add{1} = 0;
for my $number (2 .. 16) {
    my $previous = $number - 1;
    $chromosome_add{$number} = $chromosome_add{$previous} + $chromosome_length{$previous};
}

# Total genome length is the sum of all sixteen chromosome lengths.
$genomeLength = 0;
for my $number (1 .. 16) {
    $genomeLength += $chromosome_length{$number};
}
print "the length of the entire genome is $genomeLength\n";

# Bins are numbered from 1, so the last base of the genome falls in this bin.
$highestBin = int(($genomeLength - 1) / $binSize) + 1;
print "the highest bin number is $highestBin\n";
# ---------------------------------------------------------------------------
# Load the gold-standard result files.
#
# Every file in $path named gold_standard_<type>_cutoff0p0<digit>_051809_1.txt
# is read.  The <type> and <digit> parts of the file name feed @types and
# @cutoffs (each value listed once).  Each data row must split into exactly
# 29 tab-separated fields; fields 0, 2, 5 and 6 give the measurement type,
# normal name, chromosome and position.  For every row the chromosome and the
# position are appended to the lookups keyed by type, normal name and the
# cutoff taken from the file name.
#
# Dies if the directory or a file cannot be opened, if a matching file name
# does not carry a type and cutoff, if a line contains a carriage return, or
# if a non-blank line does not have 29 fields.
# ---------------------------------------------------------------------------
$path = "/Users/eric/post_disaster_mass_spec/consolidate_results/";

# An unreadable directory would otherwise look like "no input files".
opendir(my $gold_dir, $path) or die "can't open directory $path: $!\n";
@array = readdir $gold_dir;
closedir $gold_dir;

foreach my $entry (@array) {
    next unless $entry =~ /^gold_standard_\S+051809_1\.txt$/;
    push(@files, $path . $entry);
}

foreach $file (@files) {
    if ($file =~ /gold_standard_(\S+)_cutoff0p0(\d)_051809_1\.txt$/) {
        $type   = $1;
        $cutoff = $2;
        #if ($type eq "peptide") {
        #    $type = "protein";
        #}
        unless ($seen_type{$type}) {
            push(@types, $type);
            $seen_type{$type} = 1;
        }
        unless ($seen_cutoff{$cutoff}) {
            push(@cutoffs, $cutoff);
            $seen_cutoff{$cutoff} = 1;
        }
    }
    else {
        die "unexpected file name: $file\n";
    }

    open(my $input, '<', $file) or die "can't find $file\n";
    while ($line = <$input>) {
        chomp $line;
        if ($line =~ /\r/) {
            die "possible new line problem\n";
        }
        next unless $line =~ /\S/;    # ignore blank lines

        @array = split /\t/, $line;
        unless ($#array == 28) {
            die "problem splitting this line into 29 elements ($#array):\n$line\n";
        }

        # The header row is echoed so the chosen columns can be checked by eye.
        if ($line =~ /^type_of_measurement\t/) {
            print "$array[0]\t$array[2]\t$array[5]\t$array[6]\n";
            next;
        }

        # From here on $type is the value in the row, not the file name;
        # $cutoff is still the one parsed from the file name.
        $type       = $array[0];
        $normalName = $array[2];
        $chromosome = $array[5];
        $position   = $array[6];
        push(@{ $type_normalName_cutoff_chromosome_positions{$type}{$normalName}{$cutoff}{$chromosome} }, $position);
        push(@{ $type_normalName_cutoff_chromosomes{$type}{$normalName}{$cutoff} }, $chromosome);
    }
    close $input;
}
# ---------------------------------------------------------------------------
# Map cliques onto genome positions and write the binned table.
#
# Stage 1: read every complete_graphs_<x>s_070309_3.txt file in the current
# directory.  Each data row must split into exactly 7 tab-separated fields;
# fields 0, 1 and 4 give the measurement type, clique and normal name.  For
# every cutoff, the chromosomes and positions recorded for that type and
# normal name in the gold-standard lookups are attached to the clique, each
# chromosome and each position being listed once per type/clique/cutoff.
#
# Stage 2: for every type, clique, cutoff, chromosome and position write one
# row to both output files, giving the genome-wide position and its bin.
#
# Dies if the directory, an input file or an output file cannot be opened,
# if a line contains a carriage return, or if a non-blank line does not have
# 7 fields.
# ---------------------------------------------------------------------------
@types   = sort { lc($a) cmp lc($b) } @types;
@cutoffs = sort { $a <=> $b } @cutoffs;

@array = ();
@files = ();
$path  = "./";
opendir(my $clique_dir, $path) or die "can't open directory $path: $!\n";
@array = readdir $clique_dir;
closedir $clique_dir;

# Only names matching this pattern reach @files, so no second name check is
# needed when the files are opened below.
foreach my $entry (@array) {
    if ($entry =~ /^complete_graphs_(\S+)s_070309_3\.txt$/) {
        print "$entry\n";
        push(@files, $entry);
    }
}

foreach $file (@files) {
    open(my $input, '<', $file) or die "can't find $file\n";
    while ($line = <$input>) {
        chomp $line;
        if ($line =~ /\r/) {
            die "possible new line problem\n";
        }
        next unless $line =~ /\S/;    # ignore blank lines

        @array = split /\t/, $line;
        unless ($#array == 6) {
            die "problem splitting this line into 7 elements ($#array):\n$line\n";
        }

        # The header row is echoed so the chosen columns can be checked by eye.
        if ($line =~ /^type_of_measurement\t/) {
            print "$array[0]\t$array[1]\t$array[4]\n";
            next;
        }

        # The measurement type comes from the row, not from the file name.
        $type       = $array[0];
        $clique     = $array[1];
        $normalName = $array[4];

        unless ($type_clique_seen{$type}{$clique}) {
            push(@{ $type_cliques{$type} }, $clique);
            $type_clique_seen{$type}{$clique} = 1;
        }

        foreach $cutoff (@cutoffs) {
            my $known_chromosomes = $type_normalName_cutoff_chromosomes{$type}{$normalName}{$cutoff};
            next unless $known_chromosomes;

            # The gold-standard lookup gets one chromosome entry per input
            # row, so a chromosome can be listed more than once; visit each
            # only once so its positions are not walked (and counted) again.
            my %listed;
            @chromosomes = sort { $a <=> $b } grep { !$listed{$_}++ } @{$known_chromosomes};

            foreach $chromosome (@chromosomes) {
                # The chromosome list and its "seen" flag are both kept per
                # cutoff, matching how the output loop looks them up.
                unless ($type_clique_cutoff_chromosome_seen{$type}{$clique}{$cutoff}{$chromosome}) {
                    push(@{ $type_clique_cutoff_chromosomes{$type}{$clique}{$cutoff} }, $chromosome);
                    $type_clique_cutoff_chromosome_seen{$type}{$clique}{$cutoff}{$chromosome} = 1;
                }

                @positions = sort { $a <=> $b }
                    @{ $type_normalName_cutoff_chromosome_positions{$type}{$normalName}{$cutoff}{$chromosome} };
                foreach $position (@positions) {
                    unless ($type_clique_cutoff_chromosome_position_seen{$type}{$clique}{$cutoff}{$chromosome}{$position}) {
                        # One list of positions per type/clique/cutoff/chromosome.
                        push(@{ $type_clique_cutoff_chromosome_positions{$type}{$clique}{$cutoff}{$chromosome} }, $position);
                        $type_clique_cutoff_chromosome_position_seen{$type}{$clique}{$cutoff}{$chromosome}{$position} = 1;
                    }
                    $type_clique_cutoff_chromosome_position_count{$type}{$clique}{$cutoff}{$chromosome}{$position}++;
                }
            }
        }
    }
    close $input;
}

# The same table is written twice, once with a .txt and once with a .xls name.
my $txt_name = "cliques_mapped_bin_counts_072109_1.txt";
my $xls_name = "cliques_mapped_bin_counts_072109_1.xls";
open(my $out_txt, '>', $txt_name) or die "can't write $txt_name: $!\n";
open(my $out_xls, '>', $xls_name) or die "can't write $xls_name: $!\n";

my $header = "type_of_measurement\tclique\tcutoff_percent\tchromosome\tposition\tgenomic_position\t$binSize"."_bin\thighest_position\thighest_bin_number_starting_with_bin_number_1\n";
print {$out_txt} $header;
print {$out_xls} $header;

foreach $type (@types) {
    # A type taken from the gold-standard file names may have no cliques.
    @cliques = sort { $a <=> $b } @{ $type_cliques{$type} || [] };
    foreach $clique (@cliques) {
        foreach $cutoff (@cutoffs) {
            my $clique_chromosomes = $type_clique_cutoff_chromosomes{$type}{$clique}{$cutoff};
            next unless $clique_chromosomes;

            @chromosomes = sort { $a <=> $b } @{$clique_chromosomes};
            foreach $chromosome (@chromosomes) {
                @positions = sort { $a <=> $b }
                    @{ $type_clique_cutoff_chromosome_positions{$type}{$clique}{$cutoff}{$chromosome} };
                foreach $position (@positions) {
                    # Shift the chromosome-local position by the combined
                    # length of all preceding chromosomes.
                    $genomicPosition = $position + $chromosome_add{$chromosome};
                    # Looked up as before; the tally is not part of the output row.
                    $count = $type_clique_cutoff_chromosome_position_count{$type}{$clique}{$cutoff}{$chromosome}{$position};
                    # Bins are numbered from 1.
                    $bin = int(($genomicPosition - 1) / $binSize) + 1;

                    my $row = "$type\t$clique\t$cutoff\t$chromosome\t$position\t$genomicPosition\t$bin\t$genomeLength\t$highestBin\n";
                    print {$out_txt} $row;
                    print {$out_xls} $row;
                }
            }
        }
    }
}

# Buffered write errors only surface when the handle is closed.
close $out_txt or die "error closing $txt_name: $!\n";
close $out_xls or die "error closing $xls_name: $!\n";
print "done!\n";