#!/usr/bin/perl ### Haven't put in shift of + or - 1 nucleotide use Math::SpecFun::Gamma qw(gamma gammaln); print "\nTwo-Block BioOptimizer, Version 1.0, is copyrighted to Shane T. Jensen (2003)\n"; @residues1 = (A,C,G,T); @residues2 = (a,c,g,t); #### Fixed Parameters $pi=atan2(1,1)*4; $const = sqrt(2*$pi); $b = 1; # prior for motif counts #### Input Parameters $inseq = $ARGV[0]; $inbiop = $ARGV[1]; $nummotif = $ARGV[2]; $rc = $ARGV[3]; $w10 = $ARGV[4]; $w20 = $ARGV[5]; $mingap = $ARGV[6]; $maxgap = $ARGV[7]; $filename = "$inbiop.opt"; #open (LOG,">$filename.log"); $bestoverallscore = 0; open (SUM,">$filename.sum"); print SUM "Optimization with Poisson Prior with expectation = $w0\n"; print SUM "A good motif should have a final score > null score\n"; print SUM "motif\tinit_score\tdim\t\tA\tconsensus\t\tfinal_score\tdim\t\tA\tconsensus\n"; ####################### Reading in Upstream Regions ######################### @genenames = (); @seqs = (); @seq = (); open (SEQ,"$inseq"); while (){ @line1 = split(//,$_); @line2 = split(/\s+/,$_); if ($line1[0] eq ">"){ push (@genenames,$line2[0]); push (@strand,"f"); if ($rc == 1){ push (@genenames,"$line2[0].r"); # reverse complement push (@strand,"r"); } } if ($line1[0] ne ">"){ chomp $_; push (@seqs,$_); if ($rc == 1){ $revcomp = reverse($_); # reverse complement $revcomp =~ tr/acgt/tgca/; $revcomp =~ tr/ACGT/TGCA/; push (@seqs,$revcomp); } } } @temp = split (//,$seqs[0]); $numgene = @genenames; $numseq = @seqs; if ($numseq != $numgene){ print "Bad Sequence File! Not the same number of gene names as upstream regions \n"; die; } ####### ACGT --> 0123, counting upstream positions and nucs ############ print "$inseq has $numseq sequences\n"; #print "First sequence: $genenames[0]\n"; #print "$seqs[0]\n"; @numpos = (); $totalnumpos = 0; for ($i = 0;$i < $numseq;$i++){ $seqs[$i] =~ tr /acgt/0123/; $seqs[$i] =~ tr /ACGT/0123/; @line2 = split(//,$seqs[$i]); $numpos[$i] = @line2; $totalnumpos = $totalnumpos + $numpos[$i]; for ($j = 0;$j < $numpos[$i];$j++){ $nuc = $line2[$j]; $seq[$i][$j] = $nuc; $N[$nuc] = $N[$nuc] + 1; } } $sumN = sum(@N); print "Total Nucleotides $sumN = (@N)\n"; #for ($j = 0;$j < $numpos[0];$j++){ # print "$seq[0][$j]"; #} #print "\n"; #print "Total number of positions = $totalnumpos\n"; #print "Nucleotides $sumN = ($N[0],$N[1],$N[2],$N[3])\n"; print "Sequences and Number of Positions:\n"; for ($i = 0;$i < $numseq;$i++){ print "$genenames[$i]\t$numpos[$i]\n"; #print LOG "$genenames[$i]\t$numpos[$i]\n"; } ####################### Start of Iterations over Motif ######################### for ($motif = 1; $motif <= $nummotif; $motif++){ # should be 1 to $nummotif $mingap = $ARGV[6]; $maxgap = $ARGV[7]; $curmotifid = "#$motif:"; $temp = $motif + 1; $nextmotifid = "#$temp:"; print "\nOptimization for Motif $curmotifid\n"; ####################### Reading in bioprospector output ######################### ######## Initializing Aind and A ######### $A = 0; for ($i = 0;$i < $numseq;$i++){ for ($j = 0;$j < $numpos[$i];$j++){ $Aind[$i][$j] = 0; } } open (MOTIF,"$inbiop"); $inmotif = 0; $outmotif = 0; while (){ @line = split(/\s+/,$_); @line2 = split(//,$_); $numitems = @line; if ($line[0] eq "Motif" and $line[1] eq $curmotifid){ $inmotif = 1; } if ($inmotif == 1 and $outmotif == 0){ if ($line[0] eq "Width"){ ### getting motif width $w1 = $line[1]; $w1 =~ tr /\(//d; $w1 =~ tr /\,//d; $w2 = $line[2]; $w2 =~ tr /\)//d; $w2 =~ tr /\;//d; } if ($line2[0] eq ">"){ ### getting motif sites $name = $line[0]; for ($k = 0; $k < $numitems; $k++){ if ($line[$k] eq "seg"){ $blk1 = $line[$k+2]; $blk2 = $line[$k+5]; } } @line3 = split(//,$blk1); if ($line3[0] eq "f") { $blk1 =~ tr /f//d; $blk2 =~ tr /f//d; } if ($line3[0] eq "r") { $name = "$line[0].r"; $blk1 =~ tr /r//d; $blk2 =~ tr /r//d; } $start1 = $blk1 - 1; $start2 = $blk2 - 1; $gap = $start2 - ($start1+$w1); if ($gap <= $maxgap and $gap >= $mingap){ for ($i = 0;$i < $numseq;$i++) { if ($name eq $genenames[$i]) { $Aind[$i][$start1] = $gap; $A = $A + 1; #print "Site: Sequence $i ($genenames[$i]) Pos $start1-$gap-$start2 "; #for ($j = 0; $j < $w1; $j++) { # $nuc = $seq[$i][$start1+$j]; # print "$residues1[$nuc]"; #} #print "-"; #for ($j = 0; $j < $w2; $j++) { # $nuc = $seq[$i][$start1+$w1+$gap+$j]; # print "$residues1[$nuc]"; #} #print "\n"; } } } } } if ($line[0] eq "Motif" and $line[1] eq $nextmotifid){ $outmotif = 1; } } ############ Estimating Fixed Background Parameters # @Bind = calculateBind(); for ($k = 0; $k < 4; $k++){ $theta0[$k] = 0; } for ($i = 0;$i < $numseq;$i++){ for ($j = 0;$j < $numpos[$i];$j++){ if ($Bind[$i][$j] == 1){ $nuc = $seq[$i][$j]; $theta0[$nuc] = $theta0[$nuc] + 1; } } } $sumtheta0 = sum(@theta0); for ($k = 0; $k < 4; $k++){ $theta0[$k] = $theta0[$k]/$sumtheta0; } print "$inbiop has $A sites with $w1 and $w2 columns\n"; #print "Sites in $genenames[1]:\n"; #for ($j = 0;$j < $numpos[1];$j++){ # print "$Aind[1][$j]"; #} #print "\n"; #for ($j = 0;$j < $numpos[1];$j++){ # print "$Bind[1][$j]"; #} #print "\n"; @n1 = calculateblock1(); @n2 = calculateblock2(); print "Motif Counts:\n"; for ($j = 0; $j < $w1; $j++){ print "1\t$j\t"; for ($k=0;$k<4;$k++){ print "$n1[$j][$k]\t"; } print "\n"; } print "\n"; for ($j = 0; $j < $w2; $j++){ print "2\t$j\t"; for ($k=0;$k<4;$k++){ print "$n2[$j][$k]\t"; } print "\n"; } print "\n"; print "Background Parameters:\n"; for ($k = 0;$k < 4;$k++){ print "$theta0[$k]\t"; } print "\n"; close (MOTIF); ###################### Begin Iterations !! ################################## $score = truescore(); @n1 = calculateblock1(); @n2 = calculateblock2(); $initscore = $score; print "Initial Score: $initscore\n"; print SUM "$motif\t"; printf SUM "%0.2f\t",$initscore; print SUM "$w1-($mingap-$maxgap)-$w2\t$A\t"; for ($j = 0; $j < $w1; $j++){ $domnuc = 0; for ($k = 1; $k < 4; $k++){ if ($n1[$j][$k] > $n1[$j][$domnuc]){ $domnuc = $k; } } if ($n1[$j][$domnuc] >= 0.75*$A){ print SUM "$residues1[$domnuc]"; } if ($n1[$j][$domnuc] < 0.75*$A){ print SUM "$residues2[$domnuc]"; } } print SUM "_"; for ($j = 0; $j < $w2; $j++){ $domnuc = 0; for ($k = 1; $k < 4; $k++){ if ($n2[$j][$k] > $n2[$j][$domnuc]){ $domnuc = $k; } } if ($n2[$j][$domnuc] >= 0.75*$A){ print SUM "$residues1[$domnuc]"; } if ($n2[$j][$domnuc] < 0.75*$A){ print SUM "$residues2[$domnuc]"; } } print SUM "\t"; $changemade = 1; $iters = 0; while ($changemade == 1){ $changemade = 0; $iters = $iters + 1; @n1 = calculateblock1(); @n2 = calculateblock2(); @Bind = calculateBind(); $score = truescore(); $Nstar = $sumN - ($w1+$maxgap+$w2-1)*$numseq; for ($gene = 0; $gene < $numseq; $gene++){ for ($pos = 0; $pos < $numpos[$gene]-($w1+$maxgap+$w2); $pos++){ $oldAind = $Aind[$gene][$pos]; $change = 0; # $test = ""; # for ($j = 0; $j < $w1; $j++){ # $test = "$test$Bind[$gene][$pos+$j]"; # } # $test = "$test-"; # for ($j = 0; $j < $maxgap; $j++){ # $test = "$test$Bind[$gene][$pos+$w1+$j]"; # } # $test = "$test-"; # for ($j = 0; $j < $w2; $j++){ # $test = "$test$Bind[$gene][$pos+$w1+$maxgap+$j]"; # } ### backup current score $oldscore = $score; ### decide whether to add or remove $add = 0; if ($Aind[$gene][$pos] == 0){ $add = 1; } ### calculate difference for adding if ($add == 1){ $bestdiff = 0; $bestgap = 0; for ($gap = $mingap; $gap <= $maxgap; $gap++){ $diff = log(($A+1)/($Nstar-$A)); for ($j = 0; $j < $w1; $j++){ $nuc = $seq[$gene][$pos+$j]; $diff = $diff + log(($n1[$j][$nuc]+$b)/($A+4*$b)); if ($Bind[$gene][$pos+$j] == 1){ $diff = $diff - log($theta0[$nuc]); } } for ($j = 0; $j < $w2; $j++){ $nuc = $seq[$gene][$pos+$w1+$gap+$j]; $diff = $diff + log(($n2[$j][$nuc]+$b)/($A+4*$b)); if ($Bind[$gene][$pos+$w1+$gap+$j] == 1){ $diff = $diff - log($theta0[$nuc]); } } if ($gap == $mingap or $diff > $bestdiff){ $bestgap = $gap; $bestdiff = $diff; } } if ($bestdiff > 0){ $A = $A + 1; $Aind[$gene][$pos] = $bestgap; $change = 1; $changemade = 1; @Bind = calculateBind(); @n1 = calculateblock1(); @n2 = calculateblock2(); $score = truescore(); } } ### calculate difference for subtracting if ($add == 0){ $bestgap = $Aind[$gene][$pos]; $Aind[$gene][$pos] = 0; @Bind = calculateBind(); $Aind[$gene][$pos] = $bestgap; $bestdiff = log(($Nstar-$A+1)/$A); for ($j = 0; $j < $w1; $j++){ $nuc = $seq[$gene][$pos+$j]; $bestdiff = $bestdiff - log(($n1[$j][$nuc]+$b-1)/($A+4*$b-1)); if ($Bind[$gene][$pos+$j] == 1){ $bestdiff = $bestdiff + log($theta0[$nuc]); } } for ($j = 0; $j < $w2; $j++){ $nuc = $seq[$gene][$pos+$w1+$bestgap+$j]; $bestdiff = $bestdiff - log(($n2[$j][$nuc]+$b-1)/($A+4*$b-1)); if ($Bind[$gene][$pos+$w1+$bestgap+$j] == 1){ $bestdiff = $bestdiff + log($theta0[$nuc]); } } if ($bestdiff <= 0){ @Bind = calculateBind(); } if ($bestdiff > 0){ $A = $A - 1; $Aind[$gene][$pos] = 0; $change = 1; $changemade = 1; @Bind = calculateBind(); @n1 = calculateblock1(); @n2 = calculateblock2(); $score = truescore(); } } ### outputting iteration result # if ($change == 1){ # print "$oldAind $Aind[$gene][$pos] $test \n"; # print "$motif\t$iters\t$gene\t$pos\t$add\t$change\t"; # $temp = $score - $oldscore; # printf "%0.4f\t%0.4f\t%0.4f\t%0.4f\t",$oldscore,$score,$temp,$bestdiff; # @firstrow = @{$n1[0]}; # print "$A\t$w1\t$w2\t@firstrow\n"; # $test = ""; # for ($j = 0; $j < $w1; $j++){ # $test = "$test$Bind[$gene][$pos+$j]"; # } # $test = "$test-"; # for ($j = 0; $j < $maxgap; $j++){ # $test = "$test$Bind[$gene][$pos+$w1+$j]"; # } # $test = "$test-"; # for ($j = 0; $j < $w2; $j++){ # $test = "$test$Bind[$gene][$pos+$w1+$maxgap+$j]"; # } # print "$test\n"; # } } # end of loop over positions } # end of loop over genes $score = truescore(); print "Finished scan $iters of sequences |A| = $A \n"; $z = 0; if ($z != 0){ ### For each current site, finding ideal gap width ### for ($gene = 0; $gene < $numseq; $gene++){ for ($pos = 0; $pos < $numpos[$gene]-($w1+$maxgap+$w2); $pos++){ if ($Aind[$gene][$pos] > 0){ $oldgap = $Aind[$gene][$pos]; for ($gap = $mingap; $gap <= $maxgap; $gap++){ $Aind[$gene][$pos] = $gap; $score = truescore(); if ($gap == $mingap or $score > $bestscore){ $bestgap = $gap; $bestscore = $score; } } $Aind[$gene][$pos] = $bestgap; $score = $bestscore; print "Gap Change? gene $gene pos $pos oldgap $oldgap bestgap $bestgap\n"; } } } $score = truescore(@n); } ###### Width Change 1: add/remove left of block 1 ### first backup everything $oldscore = $score; print "Block 1 Front old: $w1-($mingap-$maxgap)-$w2 $A $score "; for ($gene = 0; $gene < $numseq; $gene++){ for ($pos = 0; $pos < $numpos[$gene]; $pos++){ $oldAind[$gene][$pos] = $Aind[$gene][$pos] } } $oldA = $A; $oldw1 = $w1; for ($increase = 0; $increase <= 1; $increase++){ ### initialize Aind and w and |A| $w1 = $oldw1; $A = 0; for ($gene = 0; $gene < $numseq; $gene++){ for ($pos = 0; $pos < $numpos[$gene]; $pos++){ $Aind[$gene][$pos] = $oldAind[$gene][$pos] } } ### if removing from front, shift sites right by one if ($increase == 0){ $w1 = $w1 - 1; for ($gene = 0; $gene < $numseq; $gene++){ for ($pos = 0; $pos < $numpos[$gene]; $pos++){ if ($oldAind[$gene][$pos] > 0){ $Aind[$gene][$pos+1] = $oldAind[$gene][$pos]; $Aind[$gene][$pos] = 0; $A = $A + 1; } } } } ### if adding to front, shift sites left by one if ($increase == 1){ $w1 = $w1 + 1; for ($gene = 0; $gene < $numseq; $gene++){ for ($pos = 0; $pos < $numpos[$gene]; $pos++){ if ($oldAind[$gene][$pos] > 0){ if ($pos > 0){ ## make sure don't expand beyond start of seq $Aind[$gene][$pos-1] = $oldAind[$gene][$pos]; $A = $A + 1; } $Aind[$gene][$pos] = 0; } } } } $score = truescore(); if ($increase == 0 or $score > $bestscore){ $bestincrease = $increase; $bestscore = $score; } } ### actually making change to front of block 1 if ($oldscore >= $bestscore){ $w1 = $oldw1; for ($gene = 0; $gene < $numseq; $gene++){ for ($pos = 0; $pos < $numpos[$gene]; $pos++){ $Aind[$gene][$pos] = $oldAind[$gene][$pos]; } } $A = $oldA; $score = truescore(); } if ($oldscore < $bestscore){ $changemade = 1; $w1 = $oldw1; $A = 0; for ($gene = 0; $gene < $numseq; $gene++){ for ($pos = 0; $pos < $numpos[$gene]; $pos++){ $Aind[$gene][$pos] = $oldAind[$gene][$pos]; } } if ($bestincrease == 0){ $w1 = $w1 - 1; for ($gene = 0; $gene < $numseq; $gene++){ for ($pos = 0; $pos < $numpos[$gene]; $pos++){ if ($oldAind[$gene][$pos] > 0){ $Aind[$gene][$pos+1] = $oldAind[$gene][$pos]; $Aind[$gene][$pos] = 0; $A = $A + 1; } } } } if ($bestincrease == 1){ $w1 = $w1 + 1; for ($gene = 0; $gene < $numseq; $gene++){ for ($pos = 0; $pos < $numpos[$gene]; $pos++){ if ($oldAind[$gene][$pos] > 0){ if ($pos > 0){ ## make sure don't expand beyond start of seq $Aind[$gene][$pos-1] = $oldAind[$gene][$pos]; $A = $A + 1; } $Aind[$gene][$pos] = 0; } } } } $score = truescore(); } print "new: $w1-($mingap-$maxgap)-$w2 $A $score\n"; ###### Width Change 2: add/remove right of block 2 ### first backup everything $oldscore = $score; print "Block 2 Front old: $w1-($mingap-$maxgap)-$w2 $A $score "; for ($gene = 0; $gene < $numseq; $gene++){ for ($pos = 0; $pos < $numpos[$gene]; $pos++){ $oldAind[$gene][$pos] = $Aind[$gene][$pos] } } $oldA = $A; $oldw2 = $w2; for ($increase = 0; $increase <= 1; $increase++){ ### initialize Aind and w and |A| $w2 = $oldw2; $A = 0; for ($gene = 0; $gene < $numseq; $gene++){ for ($pos = 0; $pos < $numpos[$gene]; $pos++){ $Aind[$gene][$pos] = $oldAind[$gene][$pos] } } ### if removing from back, do nothing to Aind if ($increase == 0){ $w2 = $w2 - 1; $A = $oldA; } ### if adding to back, make sure don't expand beyond end of sequence if ($increase == 1){ $w2 = $w2 + 1; for ($gene = 0; $gene < $numseq; $gene++){ for ($pos = 0; $pos < $numpos[$gene]; $pos++){ if ($oldAind[$gene][$pos] > 0){ $gap = $oldAind[$gene][$pos]; if ($pos + $w1 + $gap + $w2 < $numpos[$gene]){ $Aind[$gene][$pos] = $oldAind[$gene][$pos]; $A = $A + 1; } if ($pos + $w1 + $gap + $w2 >= $numpos[$gene]){ $Aind[$gene][$pos] = 0; } } } } } $score = truescore(); if ($increase == 0 or $score > $bestscore){ $bestincrease = $increase; $bestscore = $score; } } ### actually making change to back of block 2 if ($oldscore >= $bestscore){ $w2 = $oldw2; for ($gene = 0; $gene < $numseq; $gene++){ for ($pos = 0; $pos < $numpos[$gene]; $pos++){ $Aind[$gene][$pos] = $oldAind[$gene][$pos]; } } $A = $oldA; $score = truescore(); } if ($oldscore < $bestscore){ $changemade = 1; $w2 = $oldw2; $A = 0; for ($gene = 0; $gene < $numseq; $gene++){ for ($pos = 0; $pos < $numpos[$gene]; $pos++){ $Aind[$gene][$pos] = $oldAind[$gene][$pos]; } } if ($bestincrease == 0){ $w2 = $w2 - 1; $A = $oldA; } if ($bestincrease == 1){ $w2 = $w2 + 1; for ($gene = 0; $gene < $numseq; $gene++){ for ($pos = 0; $pos < $numpos[$gene]; $pos++){ if ($oldAind[$gene][$pos] > 0){ $gap = $oldAind[$gene][$pos]; if ($pos + $w1 + $gap + $w2 < $numpos[$gene]){ $Aind[$gene][$pos] = $oldAind[$gene][$pos]; $A = $A + 1; } if ($pos + $w1 + $gap + $w2 >= $numpos[$gene]){ $Aind[$gene][$pos] = 0; } } } } } $score = truescore(); } print "new: $w1-($mingap-$maxgap)-$w2 $A $score\n"; ###### Width Change 3: add/remove right of block 1 ### first backup everything, though $A will not change $oldscore = $score; print "Block 1 Back old: $w1-($mingap-$maxgap)-$w2 $A $score "; for ($gene = 0; $gene < $numseq; $gene++){ for ($pos = 0; $pos < $numpos[$gene]; $pos++){ $oldAind[$gene][$pos] = $Aind[$gene][$pos] } } $oldw1 = $w1; $oldmingap = $mingap; $oldmaxgap = $maxgap; for ($increase = 0; $increase <= 1; $increase++){ ### initialize Aind and w $w1 = $oldw1; $maxgap = $oldmaxgap; $mingap = $oldmingap; for ($gene = 0; $gene < $numseq; $gene++){ for ($pos = 0; $pos < $numpos[$gene]; $pos++){ $Aind[$gene][$pos] = $oldAind[$gene][$pos] } } ### if removing from back, increase all gaps by one if ($increase == 0){ $w1 = $w1 - 1; $maxgap = $maxgap + 1; $mingap = $mingap + 1; for ($gene = 0; $gene < $numseq; $gene++){ for ($pos = 0; $pos < $numpos[$gene]; $pos++){ if ($oldAind[$gene][$pos] > 0){ $Aind[$gene][$pos] = $oldAind[$gene][$pos] + 1; } } } } ### if adding to back, reduce all gaps by one if ($increase == 1){ $w1 = $w1 + 1; $maxgap = $maxgap - 1; $mingap = $mingap - 1; for ($gene = 0; $gene < $numseq; $gene++){ for ($pos = 0; $pos < $numpos[$gene]; $pos++){ if ($oldAind[$gene][$pos] > 0){ $Aind[$gene][$pos] = $oldAind[$gene][$pos] - 1; } } } } $score = truescore(); if ($increase == 0 or $score > $bestscore){ $bestincrease = $increase; $bestscore = $score; } } ### actually making change to back of block 1 if ($oldscore >= $bestscore){ $w1 = $oldw1; for ($gene = 0; $gene < $numseq; $gene++){ for ($pos = 0; $pos < $numpos[$gene]; $pos++){ $Aind[$gene][$pos] = $oldAind[$gene][$pos]; } } $maxgap = $oldmaxgap; $mingap = $oldmingap; $score = truescore(); } if ($oldscore < $bestscore){ $changemade = 1; $w1 = $oldw1; $maxgap = $oldmaxgap; $mingap = $oldmingap; for ($gene = 0; $gene < $numseq; $gene++){ for ($pos = 0; $pos < $numpos[$gene]; $pos++){ $Aind[$gene][$pos] = $oldAind[$gene][$pos]; } } if ($bestincrease == 0){ $w1 = $w1 - 1; $maxgap = $maxgap + 1; $mingap = $mingap + 1; for ($gene = 0; $gene < $numseq; $gene++){ for ($pos = 0; $pos < $numpos[$gene]; $pos++){ if ($oldAind[$gene][$pos] > 0){ $Aind[$gene][$pos] = $oldAind[$gene][$pos] + 1; } } } } if ($bestincrease == 1){ $w1 = $w1 + 1; $maxgap = $maxgap - 1; $mingap = $mingap - 1; for ($gene = 0; $gene < $numseq; $gene++){ for ($pos = 0; $pos < $numpos[$gene]; $pos++){ if ($oldAind[$gene][$pos] > 0){ $Aind[$gene][$pos] = $oldAind[$gene][$pos] - 1; } } } } $score = truescore(); } print "new: $w1-($mingap-$maxgap)-$w2 $A $score\n"; ###### Width Change 4: add/remove front of block 2 ### first backup everything, though $A will not change $oldscore = $score; print "Block 2 Front old: $w1-($mingap-$maxgap)-$w2 $A $score "; for ($gene = 0; $gene < $numseq; $gene++){ for ($pos = 0; $pos < $numpos[$gene]; $pos++){ $oldAind[$gene][$pos] = $Aind[$gene][$pos] } } $oldw2 = $w2; $oldmingap = $mingap; $oldmaxgap = $maxgap; for ($increase = 0; $increase <= 1; $increase++){ ### initialize Aind and w $w2 = $oldw2; $maxgap = $oldmaxgap; $mingap = $oldmingap; for ($gene = 0; $gene < $numseq; $gene++){ for ($pos = 0; $pos < $numpos[$gene]; $pos++){ $Aind[$gene][$pos] = $oldAind[$gene][$pos] } } ### if removing from front, increase all gaps by one if ($increase == 0){ $w2 = $w2 - 1; $maxgap = $maxgap + 1; $mingap = $mingap + 1; for ($gene = 0; $gene < $numseq; $gene++){ for ($pos = 0; $pos < $numpos[$gene]; $pos++){ if ($oldAind[$gene][$pos] > 0){ $Aind[$gene][$pos] = $oldAind[$gene][$pos] + 1; } } } } ### if adding to front, reduce all gaps by one if ($increase == 1){ $w2 = $w2 + 1; $maxgap = $maxgap - 1; $mingap = $mingap - 1; for ($gene = 0; $gene < $numseq; $gene++){ for ($pos = 0; $pos < $numpos[$gene]; $pos++){ if ($oldAind[$gene][$pos] > 0){ $Aind[$gene][$pos] = $oldAind[$gene][$pos] - 1; } } } } $score = truescore(); if ($increase == 0 or $score > $bestscore){ $bestincrease = $increase; $bestscore = $score; } } ### actually making change to front of block 2 if ($oldscore >= $bestscore){ $w2 = $oldw2; for ($gene = 0; $gene < $numseq; $gene++){ for ($pos = 0; $pos < $numpos[$gene]; $pos++){ $Aind[$gene][$pos] = $oldAind[$gene][$pos]; } } $maxgap = $oldmaxgap; $mingap = $oldmingap; $score = truescore(); } if ($oldscore < $bestscore){ $changemade = 1; $w2 = $oldw2; for ($gene = 0; $gene < $numseq; $gene++){ for ($pos = 0; $pos < $numpos[$gene]; $pos++){ $Aind[$gene][$pos] = $oldAind[$gene][$pos]; } } $maxgap = $oldmaxgap; $mingap = $oldmingap; if ($bestincrease == 0){ $w2 = $w2 - 1; $maxgap = $maxgap + 1; $mingap = $mingap + 1; for ($gene = 0; $gene < $numseq; $gene++){ for ($pos = 0; $pos < $numpos[$gene]; $pos++){ if ($oldAind[$gene][$pos] > 0){ $Aind[$gene][$pos] = $oldAind[$gene][$pos] + 1; } } } } if ($bestincrease == 1){ $w2 = $w2 + 1; $maxgap = $maxgap - 1; $mingap = $mingap - 1; for ($gene = 0; $gene < $numseq; $gene++){ for ($pos = 0; $pos < $numpos[$gene]; $pos++){ if ($oldAind[$gene][$pos] > 0){ $Aind[$gene][$pos] = $oldAind[$gene][$pos] - 1; } } } } $score = truescore(); } print "new: $w1-($mingap-$maxgap)-$w2 $A $score\n"; } # end of iterations $nullscore = nullscore(); @n1 = calculateblock1(); @n2 = calculateblock2(); ####################### Opening output files ############################### if ($motif == 1){ open (OUT,">$filename.all"); } if ($motif > 1){ open (OUT,">>$filename.all"); } print OUT "infile: $inbiop motif: $motif numiters:$iters \n"; print OUT "Initial Score = $initscore Final Score = $score \t A = $A \t dim = $w1-($mingap-$maxgap)-$w2\n"; print OUT "The null Score (no sites) for this motif width is $nullscore. A good motif should have a final score > null score\n\n"; print OUT "Block 1: \n"; for ($j = 0; $j < $w1; $j++){ print OUT "$j\t"; for ($k = 0; $k < 4; $k++){ print OUT "$n1[$j][$k]\t"; } $domnuc = 0; for ($k = 1; $k < 4; $k++){ if ($n1[$j][$k] > $n1[$j][$domnuc]){ $domnuc = $k; } } if ($n1[$j][$domnuc] >= 0.75*$A){ print OUT "$residues1[$domnuc]"; } if ($n1[$j][$domnuc] < 0.75*$A){ print OUT "$residues2[$domnuc]"; } print OUT "\n"; } print OUT "Block 2: \n"; for ($j = 0; $j < $w2; $j++){ print OUT "$j\t"; for ($k = 0; $k < 4; $k++){ print OUT "$n2[$j][$k]\t"; } $domnuc = 0; for ($k = 1; $k < 4; $k++){ if ($n2[$j][$k] > $n2[$j][$domnuc]){ $domnuc = $k; } } if ($n2[$j][$domnuc] >= 0.75*$A){ print OUT "$residues1[$domnuc]"; } if ($n2[$j][$domnuc] < 0.75*$A){ print OUT "$residues2[$domnuc]"; } print OUT "\n"; } print OUT "\nBackground: \n"; for ($k = 0; $k < 4; $k++){ printf OUT "%0.3f\t",$theta0[$k]; } print OUT "\n"; print OUT "\n\nSites:\n"; for ($gene = 0;$gene < $numseq;$gene++){ for ($pos = 0;$pos < $numpos[$gene];$pos++){ if ($Aind[$gene][$pos] > 0){ $gap = $Aind[$gene][$pos]; $start1 = $pos + 1; # indexing from one to numpos now $start2 = $pos + $w1 + $gap + 1; # indexing from one to numpos now if ($strand[$gene] eq "f"){ print OUT "$genenames[$gene]\tf\t$start1\t$gap\t$start2\t"; } if ($strand[$gene] eq "r"){ print OUT "$genenames[$gene-1]\tr\t$start1\t$gap\t$start2\t"; } for ($j = 0; $j < $w1; $j++){ $nuc = $seq[$gene][$pos+$j]; print OUT "$residues1[$nuc]" } print OUT "_"; for ($j = 0; $j < $w2; $j++){ $nuc = $seq[$gene][$pos+$w1+$gap+$j]; print OUT "$residues1[$nuc]" } print OUT "\n"; } } } print OUT "\n\n"; printf SUM "%0.2f (null= %0.2f)\t",$score,$nullscore; print SUM "$w1-($mingap-$maxgap)-$w2\t$A\t"; for ($j = 0; $j < $w1; $j++){ $domnuc = 0; for ($k = 1; $k < 4; $k++){ if ($n1[$j][$k] > $n1[$j][$domnuc]){ $domnuc = $k; } } if ($n1[$j][$domnuc] >= 0.75*$A){ print SUM "$residues1[$domnuc]"; } if ($n1[$j][$domnuc] < 0.75*$A){ print SUM "$residues2[$domnuc]"; } } print SUM "_"; for ($j = 0; $j < $w2; $j++){ $domnuc = 0; for ($k = 1; $k < 4; $k++){ if ($n2[$j][$k] > $n2[$j][$domnuc]){ $domnuc = $k; } } if ($n2[$j][$domnuc] >= 0.75*$A){ print SUM "$residues1[$domnuc]"; } if ($n2[$j][$domnuc] < 0.75*$A){ print SUM "$residues2[$domnuc]"; } } print SUM "\n"; ####################### Opening best optimization result file ############### if ($motif == 1 or $score > $bestoverallscore){ $bestoverallscore = $score; open (BEST,">$filename.best"); print BEST "infile: $inbiop motif: $motif numiters:$iters \n"; print BEST "Initial Score = $initscore Final Score = $score \t A = $A \t dim = $w1-($mingap-$maxgap)-$w2\n"; print BEST "The null Score (no sites) for this motif width is $nullscore. A good motif should have a final score > null score\n\n"; print BEST "Block 1: \n"; for ($j = 0; $j < $w1; $j++){ print BEST "$j\t"; for ($k = 0; $k < 4; $k++){ print BEST "$n1[$j][$k]\t"; } $domnuc = 0; for ($k = 1; $k < 4; $k++){ if ($n1[$j][$k] > $n1[$j][$domnuc]){ $domnuc = $k; } } if ($n1[$j][$domnuc] >= 0.75*$A){ print BEST "$residues1[$domnuc]"; } if ($n1[$j][$domnuc] < 0.75*$A){ print BEST "$residues2[$domnuc]"; } print BEST "\n"; } print BEST "Block 2: \n"; for ($j = 0; $j < $w2; $j++){ print BEST "$j\t"; for ($k = 0; $k < 4; $k++){ print BEST "$n2[$j][$k]\t"; } $domnuc = 0; for ($k = 1; $k < 4; $k++){ if ($n2[$j][$k] > $n2[$j][$domnuc]){ $domnuc = $k; } } if ($n2[$j][$domnuc] >= 0.75*$A){ print BEST "$residues1[$domnuc]"; } if ($n2[$j][$domnuc] < 0.75*$A){ print BEST "$residues2[$domnuc]"; } print BEST "\n"; } print BEST "\nBackground: \n"; for ($k = 0; $k < 4; $k++){ printf BEST "%0.3f\t",$theta0[$k]; } print BEST "\n"; print BEST "\n\nSites:\n"; for ($gene = 0;$gene < $numseq;$gene++){ for ($pos = 0;$pos < $numpos[$gene];$pos++){ if ($Aind[$gene][$pos] > 0){ $gap = $Aind[$gene][$pos]; $start1 = $pos + 1; # indexing from one to numpos now $start2 = $pos + $w1 + $gap + 1; # indexing from one to numpos now if ($strand[$gene] eq "f"){ print BEST "$genenames[$gene]\tf\t$start1\t$gap\t$start2\t"; } if ($strand[$gene] eq "r"){ print BEST "$genenames[$gene-1]\tr\t$start1\t$gap\t$start2\t"; } for ($j = 0; $j < $w1; $j++){ $nuc = $seq[$gene][$pos+$j]; print BEST "$residues1[$nuc]" } print BEST "_"; for ($j = 0; $j < $w2; $j++){ $nuc = $seq[$gene][$pos+$w1+$gap+$j]; print BEST "$residues1[$nuc]" } print BEST "\n"; } } } print BEST "\n\n"; } } # end of loop over motifs ####################### Subroutines !!!!!!!! ############################### sub sum{ my($i); my($total); my($N); my(@x); @x = @_; $N=@x; $total=0; for($i = 0;$i < $N; $i++){ $total = $total + $x[$i]; } return $total; } sub calculateBind{ my($i); my($j); my($m); my($gap); my(@Bind); for ($i = 0; $i < $numseq; $i++){ for ($j = 0; $j < $numpos[$i]; $j++){ $Bind[$i][$j] = 1; } } for ($i = 0; $i < $numseq; $i++){ for ($j = 0; $j < $numpos[$i]; $j++){ if ($Aind[$i][$j] > 0){ $gap = $Aind[$i][$j]; for ($m = 0; $m < $w1; $m++){ $Bind[$i][$j+$m] = 0; } for ($m = 0; $m < $w2; $m++){ $Bind[$i][$j+$w1+$gap+$m] = 0; } } } } return @Bind; } sub calculatebgscore{ my($x); my($j); my($k); my($nuc); my(@n0); my(@Bind); for ($k = 0; $k < 4; $k++){ $n0[$k] = 0; } @Bind = calculateBind(); for ($j = 0; $j < $numseq; $j++){ for ($k = 0; $k < $numpos[$j]; $k++){ if ($Bind[$j][$k] == 1){ $nuc = $seq[$j][$k]; $n0[$nuc] = $n0[$nuc] + 1; } } } $x = 0; for ($k = 0; $k < 4; $k++){ $x = $x + $n0[$k]*log($theta0[$k]); } return $x; } sub calculateblock1{ my(@n1); my($i); my($nuc); my($j); my($k); for ($j = 0; $j < $w1; $j++){ for ($k = 0; $k < 4; $k++){ $n1[$j][$k] = 0; } } for ($i = 0; $i < $numseq; $i++){ for ($j = 0; $j < $numpos[$i]; $j++){ if ($Aind[$i][$j] > 0){ for ($k = 0; $k < $w1; $k++){ $nuc = $seq[$i][$j+$k]; $n1[$k][$nuc] = $n1[$k][$nuc] + 1; } } } } return @n1; } sub calculateblock2{ my(@n2); my($gap); my($nuc); my($i); my($j); my($k); for ($j = 0; $j < $w2; $j++){ for ($k = 0; $k < 4; $k++){ $n2[$j][$k] = 0; } } for ($i = 0; $i < $numseq; $i++){ for ($j = 0; $j < $numpos[$i]; $j++){ if ($Aind[$i][$j] > 0){ $gap = $Aind[$i][$j]; for ($k = 0; $k < $w2; $k++){ $nuc = $seq[$i][$j+$w1+$gap+$k]; $n2[$k][$nuc] = $n2[$k][$nuc] + 1; } } } } return @n2; } sub truescore{ my($Nstar); my($pw); my($pA); my($pbg); my($pm1); my($pm2); my(@n1); my(@n2); my($x); my($j); my($k); $Nstar = $sumN - ($w1+$w2+$maxgap-1)*$numseq; $pw = $w1*log($w10)-gammaln($w1+1)+$w2*log($w20)-gammaln($w2+1); # p(w) terms $pA = gammaln($A+1) + gammaln($Nstar-$A+1) - gammaln($Nstar+2); # p(A) terms $pbg = calculatebgscore(); # p(bg) terms $pm1 = ($w1+$w2)*gammaln(4*$b) - ($w1+$w2)*gammaln($A + 4*$b); # p(theta) terms @n1 = calculateblock1(); @n2 = calculateblock2(); $pm2 = 0; for ($j = 0; $j < $w1; $j++){ for ($k = 0; $k < 4; $k++){ $pm2 = $pm2 + gammaln($n1[$j][$k] + $b) - gammaln($b); } } for ($j = 0; $j < $w2; $j++){ for ($k = 0; $k < 4; $k++){ $pm2 = $pm2 + gammaln($n2[$j][$k] + $b) - gammaln($b); } } $x = $pw + $pA + $pbg + $pm1 + $pm2; #print "pw= $pw pA= $pA pbg= $pbg pm1= $pm1 pm2= $pm2 score= $x\n"; return $x; } sub nullscore{ my($Nstar); my($pw); my($pA); my($pbg); my($x); my($j); my($k); $Nstar = $sumN - ($w1+$w2+$maxgap-1)*$numseq; $pw = $w1*log($w10)-gammaln($w1+1)+$w2*log($w20)-gammaln($w2+1); # p(w) terms $pA = gammaln(1) + gammaln($Nstar+1) - gammaln($Nstar+2); # p(A) terms $pbg = 0; # p(bg) terms for ($k = 0; $k < 4; $k++){ $pbg = $pbg + $N[$k]*log($theta0[$k]); } $x = $pw + $pA + $pbg; return $x; }