#!/usr/bin/perl use strict; use warnings; use Bio::SeqIO; use Bio::Tools::SeqStats; use DBI; my $dbh = DBI->connect("dbi:SQLite:dbname=junk.lite","","", {PrintError => 1, AutoCommit => 0}) or die "Can't connect"; $dbh->do(q {DROP TABLE IF EXISTS skew}); $dbh->do(qq{ CREATE TABLE skew (Sequence_id text, begin int, end int, G int, C int, skew real) }); my $sql_fmt = "INSERT INTO skew VALUES(?,?,?,?,?,?)"; my $window_size = 200; my $in_file = "fasta_dat.txt"; my $in = Bio::SeqIO->new (-file=> $in_file, -format=>'fasta'); while(my $seq = $in->next_seq() ) { my $length = $seq->length; my $last = 0; for (my $begin_pos = 0; $last == 0; $begin_pos += 100) { my $end_pos; if ($begin_pos + $window_size >= $length) { $end_pos = $length; $last = 1; } else { $end_pos = $begin_pos + $window_size; } my $subseq = $seq->trunc(1 + $begin_pos, $end_pos); my $seq_stats = Bio::Tools::SeqStats->new(-seq => $subseq); my $count = $seq_stats->count_monomers(); my $nr_G = $count->{G}; my $nr_C = $count->{C} or next; # check for illegal division by 0 my $size_read = $end_pos - $begin_pos; my $skew = ($size_read / $nr_C)*($nr_G - $nr_C)/($nr_G + $nr_C); $dbh->do($sql_fmt, {}, ($seq->id, $begin_pos+1, $end_pos, $nr_G, $nr_C, $skew)); } } $dbh->commit; my @cols = qw/ Sequence_id Begin End G C Skew /; my $cols = join ",", @cols; my $hdr = join "\t", @cols; my $num_return = 20; my @stmnt = split /\n/, <prepare($sql); $sth->execute; print $hdr, "\n"; while(my @row = $sth->fetchrow_array) { print join("\t", @row), "\n"; } print "\n"; } $dbh->disconnect; __END__ *** printout 20 lowest - 20 highest - and all 0 skews C:\Old_Data\perlp\bioperl>perl bio_stat_db.pl Sequence_id Begin End G C Skew NM_198399 2301 2413 7 12 -2.4780701754386 NR_002777 701 900 18 70 -1.68831168831169 NR_037701 3201 3353 16 37 -1.63844977052524 NR_002777 801 1000 23 54 -1.49110149110149 NR_037806 501 700 27 58 -1.25760649087221 NR_002777 1101 1277 23 41 -1.21417682926829 NR_037701 2001 2200 29 60 -1.16104868913858 NR_037701 701 900 30 66 -1.13636363636364 NM_198399 2201 2400 22 29 -0.94658553076403 NR_037701 2101 2300 33 57 -0.935672514619883 NR_037806 601 800 34 57 -0.886832465779834 NR_037701 1201 1400 32 47 -0.807971990304336 NR_037701 3101 3300 38 65 -0.806572068707991 NR_002777 601 800 34 51 -0.784313725490196 NR_037701 1301 1500 33 48 -0.771604938271605 NR_029429 1 200 40 64 -0.721153846153846 NR_037701 1001 1200 36 52 -0.699300699300699 NR_002777 901 1100 36 49 -0.624249699879952 NR_037701 1101 1300 37 51 -0.623885918003565 NR_037701 1701 1900 41 58 -0.592128178335075 Sequence_id Begin End G C Skew NR_002777 301 500 45 16 5.94262295081967 NR_037701 2301 2500 64 27 3.01180301180301 NR_033769 1101 1300 35 20 2.72727272727273 NR_037701 2401 2600 63 29 2.54872563718141 NR_037701 301 500 50 27 2.21260221260221 NR_037701 2601 2800 41 24 2.17948717948718 NR_002777 201 400 46 27 1.92795535261289 NR_037701 501 700 57 32 1.75561797752809 NR_037701 201 400 50 30 1.66666666666667 NR_027917 501 657 54 29 1.63066057332779 NR_037701 401 600 56 33 1.56622403813415 NR_027917 401 600 64 36 1.55555555555556 NM_198399 201 400 58 34 1.53452685421995 NM_198399 301 500 63 36 1.51515151515152 NM_198399 401 600 52 32 1.48809523809524 NM_198399 1401 1600 43 29 1.34099616858238 NR_027917 101 300 62 38 1.26315789473684 NM_198399 1001 1200 40 28 1.26050420168067 NM_198399 1801 2000 54 35 1.2199036918138 NR_037701 2501 2700 47 32 1.18670886075949 Sequence_id Begin End G C Skew NM_001144931 301 500 46 46 0 NM_016951 401 600 47 47 0 NM_198399 1601 1800 44 44 0