#!/usr/bin/perl #!/usr/bin/perl use Math::SpecFun::Gamma qw(gamma gammaln); print "\nBioOptimizer, Version 2.0, is copyrighted to Shane T. Jensen (2004)\n"; print "Reference:\n"; print "JENSEN, S.T. and LIU, J.S. (2004). BioOptimizer: a Bayesian scoring function approach to motif discovery. Accepted for publication in Bioinformatics.\n"; @residues1 = (A,C,G,T); @residues2 = (a,c,g,t); #### Fixed Parameters $pi=atan2(1,1)*4; $const = sqrt(2*$pi); $b = 1; # prior for motif counts #### Input Parameters $inseq = $ARGV[0]; $inbiop = $ARGV[1]; $nummotif = $ARGV[2]; $rc = $ARGV[3]; $w0 = $ARGV[4]; $filename = "$inbiop.opt"; $bestoverallscore = 0; open (SUM,">$filename.sum"); print SUM "Optimization with Poisson Prior with expectation = $w0\n"; print SUM "A good motif should have a final score > null score\n"; print SUM "motif\tinit_score\tw\tA\tconsensus\t\tfinal_score\t\tw\tA\tconsensus\n"; ####################### Reading in Upstream Regions ######################### @genenames = (); @seqs = (); @seq = (); open (SEQ,"$inseq"); while (){ @line1 = split(//,$_); @line2 = split(/\s+/,$_); if ($line1[0] eq ">"){ push (@genenames,$line2[0]); push (@strand,"f"); if ($rc == 1){ push (@genenames,"$line2[0].r"); # reverse complement push (@strand,"r"); } } if ($line1[0] ne ">"){ chomp $_; push (@seqs,$_); if ($rc == 1){ $revcomp = reverse($_); # reverse complement $revcomp =~ tr/acgt/tgca/; $revcomp =~ tr/ACGT/TGCA/; push (@seqs,$revcomp); } } } @temp = split (//,$seqs[0]); $numgene = @genenames; $numseq = @seqs; if ($numseq != $numgene){ print "Bad Sequence File! Not the same number of gene names as upstream regions \n"; die; } ####### ACGT --> 0123, counting upstream positions and nucs ############ @numpos = (); $A = 0; $totalnumpos = 0; for ($i = 0;$i < $numseq;$i++){ $seqs[$i] =~ tr /acgtn/01235/; $seqs[$i] =~ tr /ACGTN/01235/; @line2 = split(//,$seqs[$i]); $numpos[$i] = @line2; $totalnumpos = $totalnumpos + $numpos[$i]; for ($j = 0;$j < $numpos[$i];$j++){ $nuc = $line2[$j]; $seq[$i][$j] = $nuc; $N[$nuc] = $N[$nuc] + 1; $Aind[$i][$j] = 0; } } $sumN = $N[0]+$N[1]+$N[2]+$N[3]; print "$inseq has $numseq sequences with $sumN valid and $N[5] invalid nucleotides\n"; print "Sequences and Number of Positions:\n"; print "@genenames\n@numpos\n\n"; ####################### Start of Loop over Motif ################################ for ($motif = 1; $motif <= $nummotif; $motif++){ $curmotifid = "#$motif:"; $temp = $motif + 1; $nextmotifid = "#$temp:"; print "\nOptimization for Motif $curmotifid\n"; ###################### Initializing Aind and A ################################### $A = 0; for ($i = 0;$i < $numseq;$i++){ for ($j = 0;$j < $numpos[$i];$j++){ $Aind[$i][$j] = 0; if ($seq[$i][$j] == 5){ $Aind[$i][$j] = 2; } } } ####################### Reading in bioprospector output ######################### open (MOTIF,"$inbiop"); $inmotif = 0; $outmotif = 0; while (){ @line = split(/\s+/,$_); @line2 = split(//,$_); $numitems = @line; if ($line[0] eq "Motif" and $line[1] eq $curmotifid){ $inmotif = 1; } if ($inmotif == 1 and $outmotif == 0){ if ($line[0] eq "Width"){ ### getting motif width $w = $line[1]; $w =~ tr /\(//d; $w =~ tr /\,//d; } if ($line2[0] eq ">"){ ### getting motif sites for ($k = 0; $k < $numitems; $k++){ if ($line[$k] eq "site"){ $blk = $line[$k+3]; $len = $line[$k-1]; if ($line[$k+2] eq "f"){ $name = $line[0]; $start = $blk - 1; } if ($line[$k+2] eq "r"){ $name = "$line[0].r"; $start = $len - $blk; } } } for ($i = 0;$i < $numseq;$i++) { if ($name eq $genenames[$i]) { $mask = 0; for ($m = 0; $m < $w; $m++){ if ($seq[$i][$start+$m] == 5){ $mask = 1; } } if ($mask == 0){ $Aind[$i][$start] = 1; $A = $A + 1; print "Site $A\tSeq $i ($genenames[$i])\tPos $start\t"; for ($j = 0; $j < $w; $j++){ $nuc = $seq[$i][$start+$j]; print "$residues1[$nuc]"; } print "\n"; } } } } } if ($line[0] eq "Motif" and $line[1] eq $nextmotifid){ $outmotif = 1; } } ############ Estimating Fixed Background Parameters ############### for ($k = 0; $k < 4; $k++){ $theta0[$k] = 0; } for ($i = 0;$i < $numseq;$i++){ for ($j = 0;$j < $numpos[$i];$j++){ $inbg = 1; for ($m = $j-($w-1); $m <= $j; $m++){ if ($Aind[$i][$m] == 1){ $inbg = 0; } } if ($Aind[$i][$j] == 2){ $inbg = 0; } if ($inbg == 1){ $nuc = $seq[$i][$j]; $theta0[$nuc] = $theta0[$nuc] + 1; } } } $sumtheta0 = sum(@theta0); for ($k = 0; $k < 4; $k++){ $theta0[$k] = $theta0[$k]/$sumtheta0; } print "$inbiop has $A sites with $w columns\n"; print "Sites in $genenames[0]:\n"; for ($j = 0;$j < $numpos[0];$j++){ print "$Aind[0][$j]"; } print "\n"; print "Sites in $genenames[$numseq-1]:\n"; for ($j = 0;$j < $numpos[$numseq-1];$j++){ print "$Aind[$numseq-1][$j]"; } print "\n"; @n = calculatemotif(); print "Motif Counts:\n"; for ($j = 0; $j < $w; $j++){ print "$j\t"; for ($k=0;$k<4;$k++){ print "$n[$j][$k]\t"; } print "\n"; } print "Background Parameters:\n"; for ($k = 0;$k < 4;$k++){ print "$theta0[$k]\t"; } print "\n"; close (MOTIF); ###################### Begin Iterations !! ################################## $score = truescore(); @n = calculatemotif(); $initscore = $score; print "Initial Score: $initscore\n"; print SUM "$motif\t"; printf SUM "%0.2f\t",$initscore; print SUM "$w\t$A\t"; for ($j = 0; $j < $w; $j++){ $domnuc = 0; for ($k = 1; $k < 4; $k++){ if ($n[$j][$k] > $n[$j][$domnuc]){ $domnuc = $k; } } if ($n[$j][$domnuc] >= 0.75*$A){ print SUM "$residues1[$domnuc]"; } if ($n[$j][$domnuc] < 0.75*$A){ print SUM "$residues2[$domnuc]"; } } print SUM "\t"; $changemade = 1; $iters = 0; while ($changemade == 1){ $changemade = 0; $iters = $iters + 1; @n = calculatemotif(); for ($gene = 0; $gene < $numseq; $gene++){ for ($pos = 0; $pos < $numpos[$gene]-$w; $pos++){ $change = 0; ### backup current score $oldscore = $score; $Nstar = $sumN - ($w-1)*$numseq; ### decide whether to add or remove $add = 0; if ($Aind[$gene][$pos] == 0){ $add = 1; } $masked = 0; for ($m = 0; $m < $w; $m++){ if ($Aind[$gene][$pos + $m] == 2){ $masked = 1; } } ### calculate difference for adding if ($add == 1 and $masked == 0){ @Bind = (); for ($j = 0; $j < $w; $j++){ $Bind[$j] = 1; $curpos = $pos + $j; for ($l = $curpos-($w-1); $l <= $curpos; $l++){ if ($Aind[$gene][$l] == 1){ $Bind[$j] = 0; } } } $diff = log(($A+1)/($Nstar-$A)); for ($j = 0; $j < $w; $j++){ $nuc = $seq[$gene][$pos+$j]; $diff = $diff + log(($n[$j][$nuc]+$b)/($A+4*$b)); if ($Bind[$j] == 1){ $diff = $diff - log($theta0[$nuc]); } } if ($diff > 0){ $A = $A + 1; $Aind[$gene][$pos] = 1; $change = 1; $changemade = 1; for ($j = 0; $j < $w; $j++){ $nuc = $seq[$gene][$pos+$j]; $n[$j][$nuc] = $n[$j][$nuc] + 1; } } } ### calculate difference for subtracting if ($add == 0 and $masked == 0){ @Bind = (); for ($j = 0; $j < $w; $j++){ $Bind[$j] = 1; $curpos = $pos + $j; for ($l = $curpos-($w-1); $l <= $curpos; $l++){ if ($Aind[$gene][$l] == 1 and $l != $pos){ $Bind[$j] = 0; } } } $diff = log(($Nstar-$A+1)/$A); for ($j = 0; $j < $w; $j++){ $nuc = $seq[$gene][$pos+$j]; if ($nuc == 5){ print "Died trying to add a site\n"; print "$gene\t$pos\t$j\t$add\t$masked\n"; die; } $diff = $diff - log(($n[$j][$nuc]+$b-1)/($A+4*$b-1)); if ($Bind[$j] == 1){ $diff = $diff + log($theta0[$nuc]); } } if ($diff > 0){ $A = $A - 1; $Aind[$gene][$pos] = 0; $change = 1; $changemade = 1; for ($j = 0; $j < $w; $j++){ $nuc = $seq[$gene][$pos+$j]; $n[$j][$nuc] = $n[$j][$nuc] - 1; } } } ### outputting iteration result #print "$motif\t$iters\t$gene\t$pos\t$add\t$change\t"; #printf "%0.4f\t%0.4f\t",$score,$diff; #@n = calculatemotif(); #@firstrow = @{$n[0]}; #print "$A\t$w\t\t@firstrow\n"; } # end of loop over positions } # end of loop over genes $score = truescore(); print "Finished scan $iters of sequences |A| = $A \n"; ###### Find best change in motif width ####### ### first backup everything $oldscore = $score; for ($gene = 0; $gene < $numseq; $gene++){ for ($pos = 0; $pos < $numpos[$gene]; $pos++){ $oldAind[$gene][$pos] = $Aind[$gene][$pos] } } $oldA = $A; $oldw = $w; $bestscore = $score; @bestaction = (); for ($increase = 0; $increase < 2; $increase++){ for ($front = 0; $front < 2; $front++){ ### initialize motif and background and Aind and w $A = 0; $w = $oldw; for ($gene = 0; $gene < $numseq; $gene++){ for ($pos = 0; $pos < $numpos[$gene]; $pos++){ $Aind[$gene][$pos] = $oldAind[$gene][$pos] } } ### possibly removing column ### if ($increase == 0){ $w = $w - 1; if ($front == 1){ # if removing from front, shift sites forward by one for ($gene = 0; $gene < $numseq; $gene++){ for ($pos = 0; $pos < $numpos[$gene]; $pos++){ if ($oldAind[$gene][$pos] == 1){ $Aind[$gene][$pos] = 0; $Aind[$gene][$pos+1] = 1; $A = $A + 1; } } } } if ($front == 0){ # if removing from back, don't have to change Aind for ($gene = 0; $gene < $numseq; $gene++){ for ($pos = 0; $pos < $numpos[$gene]; $pos++){ if ($oldAind[$gene][$pos] == 1){ $Aind[$gene][$pos] = 1; $A = $A + 1; } } } } } ### possibly adding a column ### if ($increase == 1){ $w = $w + 1; if ($front == 1){ for ($gene = 0; $gene < $numseq; $gene++){ for ($pos = 0; $pos < $numpos[$gene]; $pos++){ if ($oldAind[$gene][$pos] == 1){ $Aind[$gene][$pos] = 0; if ($pos > 0 and $seq[$gene][$pos-1] != 5){ ## make sure don't expand beyond start of seq or masked site $Aind[$gene][$pos-1] = 1; ## shift start of sites back by one $A = $A + 1; } } } } } if ($front == 0){ for ($gene = 0; $gene < $numseq; $gene++){ for ($pos = 0; $pos < $numpos[$gene]; $pos++){ if ($oldAind[$gene][$pos] == 1){ $Aind[$gene][$pos] = 0; if ($pos < $numpos[$gene] - $w - 1 and $seq[$gene][$pos+$w-1] != 5){ ## don't expand past seq end or masked $Aind[$gene][$pos] = 1; $A = $A + 1; } } } } } } $newscore = truescore(); if ($newscore > $bestscore){ $bestaction[0] = $increase; $bestaction[1] = $front; $bestscore = $newscore; } #print "oldscore = $oldscore A = $A increase = $increase front = $front newscore = $newscore bestscore = $bestscore \n"; } } #### Not Making a Width change if ($oldscore >= $bestscore){ $w = $oldw; for ($gene = 0; $gene < $numseq; $gene++){ for ($pos = 0; $pos < $numpos[$gene]; $pos++){ $Aind[$gene][$pos] = $oldAind[$gene][$pos]; } } $A = $oldA; $score = $oldscore; } #### Making a Width Change if ($oldscore < $bestscore){ $changemade = 1; $w = $oldw; $A = 0; for ($gene = 0; $gene < $numseq; $gene++){ for ($pos = 0; $pos < $numpos[$gene]; $pos++){ $Aind[$gene][$pos] = $oldAind[$gene][$pos] } } $increase = $bestaction[0]; $front = $bestaction[1]; if ($increase == 0){ $w = $w - 1; if ($front == 1){ # if removing from front, shift sites forward by one for ($gene = 0; $gene < $numseq; $gene++){ for ($pos = 0; $pos < $numpos[$gene]; $pos++){ if ($oldAind[$gene][$pos] == 1){ $Aind[$gene][$pos] = 0; $Aind[$gene][$pos+1] = 1; $A = $A + 1; } } } } if ($front == 0){ # if removing from back, don't have to change Aind for ($gene = 0; $gene < $numseq; $gene++){ for ($pos = 0; $pos < $numpos[$gene]; $pos++){ if ($oldAind[$gene][$pos] == 1){ $Aind[$gene][$pos] = 1; $A = $A + 1; } } } } } if ($increase == 1){ $w = $w + 1; if ($front == 1){ for ($gene = 0; $gene < $numseq; $gene++){ for ($pos = 0; $pos < $numpos[$gene]; $pos++){ if ($oldAind[$gene][$pos] == 1){ $Aind[$gene][$pos] = 0; if ($pos > 0 and $seq[$gene][$pos-1] != 5){ ## make sure don't expand beyond start of seq $Aind[$gene][$pos-1] = 1; ## shift start of sites back by one $A = $A + 1; } } } } } if ($front == 0){ for ($gene = 0; $gene < $numseq; $gene++){ for ($pos = 0; $pos < $numpos[$gene]; $pos++){ if ($oldAind[$gene][$pos] == 1){ $Aind[$gene][$pos] = 0; if ($pos < $numpos[$gene] - $w - 1 and $seq[$gene][$pos+$w-1] != 5){ ## make sure don't expand end start of seq $Aind[$gene][$pos] = 1; $A = $A + 1; } } } } } } $score = truescore(); } #print "$motif\twidth change increase = $increase front = $front\t"; #printf "%0.4f\t%0.4f\t",$oldscore,$score; @n = calculatemotif(); #@firstrow = @{$n[0]}; #print "$A\t$w\t\t@firstrow\n"; print "Finished attempt $iters to change motif width: new w = $w \n"; } # end of iterations $nullscore = nullscore(); @n = calculatemotif(); ####################### Opening output files ############################### if ($motif == 1){ open (OUT,">$filename.all"); } if ($motif > 1){ open (OUT,">>$filename.all"); } print OUT "infile: $inbiop motif: $motif numiters:$iters \n"; print OUT "Initial Score = $initscore Final Score = $score \t A = $A \t w = $w\n"; print OUT "The null Score (no sites) for this motif width is $nullscore. A good motif should have a final score > null score\n\n"; print OUT "Motif: \n"; for ($j = 0; $j < $w; $j++){ print OUT "$j\t"; for ($k = 0; $k < 4; $k++){ print OUT "$n[$j][$k]\t"; } $domnuc = 0; for ($k = 1; $k < 4; $k++){ if ($n[$j][$k] > $n[$j][$domnuc]){ $domnuc = $k; } } if ($n[$j][$domnuc] >= 0.75*$A){ print OUT "$residues1[$domnuc]"; } if ($n[$j][$domnuc] < 0.75*$A){ print OUT "$residues2[$domnuc]"; } print OUT "\n"; } print OUT "\nBackground: \n"; for ($k = 0; $k < 4; $k++){ printf OUT "%0.3f\t",$theta0[$k]; } print OUT "\n"; print OUT "\n\nSites:\n"; for ($gene = 0;$gene < $numseq;$gene++){ for ($pos = 0;$pos < $numpos[$gene];$pos++){ if ($Aind[$gene][$pos] == 1){ $temp = $pos + 1; # indexing from one to numpos now if ($strand[$gene] eq "f"){ print OUT "$genenames[$gene]\tf\t$temp\t"; } if ($strand[$gene] eq "r"){ print OUT "$genenames[$gene-1]\tr\t$temp\t"; } for ($j = 0; $j < $w; $j++){ $nuc = $seq[$gene][$pos+$j]; print OUT "$residues1[$nuc]" } print OUT "\n"; } } } print OUT "\n\n"; printf SUM "%0.2f (null= %0.2f)\t",$score,$nullscore; print SUM "$w\t$A\t"; for ($j = 0; $j < $w; $j++){ $domnuc = 0; for ($k = 1; $k < 4; $k++){ if ($n[$j][$k] > $n[$j][$domnuc]){ $domnuc = $k; } } if ($n[$j][$domnuc] >= 0.75*$A){ print SUM "$residues1[$domnuc]"; } if ($n[$j][$domnuc] < 0.75*$A){ print SUM "$residues2[$domnuc]"; } } print SUM "\n"; ####################### Opening best optimization result file ############### if ($motif == 1 or $score > $bestoverallscore){ $bestoverallscore = $score; open (BEST,">$filename.best"); print BEST "infile: $inbiop motif: $motif numiters:$iters \n"; print BEST "Initial Score = $initscore Final Score = $score \t A = $A \t w = $w \n"; print BEST "The null Score (no sites) for this motif width is $nullscore. A good motif should have a final score > null score\n\n"; print BEST "Motif: \n"; for ($j = 0; $j < $w; $j++){ print BEST "$j\t"; for ($k = 0; $k < 4; $k++){ print BEST "$n[$j][$k]\t"; } $domnuc = 0; for ($k = 1; $k < 4; $k++){ if ($n[$j][$k] > $n[$j][$domnuc]){ $domnuc = $k; } } if ($n[$j][$domnuc] >= 0.75*$A){ print BEST "$residues1[$domnuc]"; } if ($n[$j][$domnuc] < 0.75*$A){ print BEST "$residues2[$domnuc]"; } print BEST "\n"; } print BEST "\nBackground: \n"; for ($k = 0; $k < 4; $k++){ printf BEST "%0.3f\t",$theta0[$k]; } print BEST "\n"; print BEST "\n\nSites:\n"; for ($gene = 0;$gene < $numseq;$gene++){ for ($pos = 0;$pos < $numpos[$gene];$pos++){ if ($Aind[$gene][$pos] == 1){ $temp = $pos + 1; # indexing from one to numpos now if ($strand[$gene] eq "f"){ print BEST "$genenames[$gene]\tf\t$temp\t"; } if ($strand[$gene] eq "r"){ print BEST "$genenames[$gene-1]\tr\t$temp\t"; } for ($j = 0; $j < $w; $j++){ $nuc = $seq[$gene][$pos+$j]; print BEST "$residues1[$nuc]" } print BEST "\n"; } } } print BEST "\n\n"; } } # end of loop over motifs ####################### Subroutines !!!!!!!! ############################### sub sum{ my($i); my($total); my($N); my(@x); @x = @_; $N=@x; $total=0; for($i = 0;$i < $N; $i++){ $total = $total + $x[$i]; } return $total; } sub calculatebgscore{ my($x); my($j); my($k); my(@n0); for ($k = 0; $k < 4; $k++){ $n0[$k] = 0; } for ($j = 0; $j < $numseq; $j++){ for ($k = 0; $k < $numpos[$j]; $k++){ $nuc = $seq[$j][$k]; $inbg = 1; for ($m = $k-($w-1); $m <= $k; $m++){ if ($Aind[$j][$m] == 1){ $inbg = 0; } } if ($inbg == 1){ $n0[$nuc] = $n0[$nuc] + 1; } } } $x = 0; for ($k = 0; $k < 4; $k++){ $x = $x + $n0[$k]*log($theta0[$k]); } return $x; } sub calculatemotif{ my(@n); my($i); my($j); my($k); for ($j = 0; $j < $w; $j++){ for ($k = 0; $k < 4; $k++){ $n[$j][$k] = 0; } } for ($i = 0; $i < $numseq; $i++){ for ($j = 0; $j < $numpos[$i]; $j++){ if ($Aind[$i][$j] == 1){ for ($k = 0; $k < $w; $k++){ $nuc = $seq[$i][$j+$k]; $n[$k][$nuc] = $n[$k][$nuc] + 1; } } } } return @n; } sub truescore{ my($Nstar); my($pw); my($pA); my($pbg); my($pm1); my($pm2); my(@n); my($x); my($j); my($k); $Nstar = $sumN - ($w-1)*$numseq; $pw = $w*log($w0)-gammaln($w+1); # p(w) terms $pA = gammaln($A+1) + gammaln($Nstar-$A+1) - gammaln($Nstar+2); # p(A) terms $pbg = calculatebgscore(); # p(bg) terms $pm1 = $w*gammaln(4*$b) - $w*gammaln($A + 4*$b); # p(theta) terms @n = calculatemotif(); $pm2 = 0; for ($j = 0; $j < $w; $j++){ for ($k = 0; $k < 4; $k++){ $pm2 = $pm2 + gammaln($n[$j][$k] + $b) - gammaln($b); } } $x = $pw + $pA + $pbg + $pm1 + $pm2; #print "pw= $pw pA= $pA pbg= $pbg pm1= $pm1 pm2= $pm2 score= $x\n"; return $x; } sub nullscore{ my($Nstar); my($pw); my($pA); my($pbg); my($x); my($k); $Nstar = $sumN - ($w-1)*$numseq; $pw = $w*log($w0)-gammaln($w+1); # p(w) terms $pA = gammaln(1) + gammaln($Nstar+1) - gammaln($Nstar+2); # p(A) terms $pbg = 0; # p(bg) terms for ($k = 0; $k < 4; $k++){ $pbg = $pbg + $N[$k]*log($theta0[$k]); } $x = $pw + $pA + $pbg; #print "pw= $pw pA= $pA pbg= $pbg pm1= $pm1 pm2= $pm2 score= $x\n"; return $x; }