#! /usr/bin/perl use strict; use warnings; use Getopt::Long; use Bio::SearchIO; use Bio::SeqIO; ########################################################################################################### # This program detects pili-operons from a list of protein or dna sequences in fasta format. # Detection is performed in 11 steps: # 1. If input alphabet is DNA we start by preprocessing DNA seqs and extracting ORFs from DNA contigs. # 2. The extracted ORFs or protein sequences are preprocessed by deleting nonIUPAC chars from seq endpoints # and replacing these chars by X within the sequences. Sequence annotations are saved and each seq is labeled # by its running number. # 3. Running hmmsearch with a set of hmm models on the input sequences. # Searches are performed one HMM at a time parsing the results after each search. # 4. Selecting the HMM with the highest score for each input sequence. If all HMMs # return negative scores, null model is selected. Null model is defined to have score 0 # and e-value equal to the total number of sequences. # 5. Clustering sequences with non-null models into segments according to the "gap" threshold. # Segments are defined by two numbers: running numbers of the leftmost and the rightmost sequences. # 6. Calculating scores and p-values for segments. Segment scores are calculated by simply summing scores for # sequences, p-values are calculated using bootstrap2 unitility (Fisher's exact test + MonteCarlo simulation). # 7. Thresholding segments using cut-off values on segment score, segment P/P_adj-value and segment length. # 8. Selecting top level segments. # 9. Running Phobius on seqs that passed thresholding # 10. Sorting results by the running number of segments first sequence, segment score or segment P/P_adj-value # 11. Printing output as tab-delimited text or HTML. # # Dependencies: # Perl modules # Getopt::Long; # Bio::SeqIO # Bio::SearchIO # files # $models*.hmm # codon_tables_ncbi.txt # executables # hmmsearch # bootstrap2 # phobius.pl # get_orf_from_contig.pl # ########################################################################################################### # Configurations: # path to the hmmsearch executable my $hmmsearch= "hmmsearch"; my $phobius= "phobius"; my $get_orf= "./get_orf_from_contig_v2.pm"; my $hypgeom= "./bootstrap/bootstrap2"; # path to the c++ hypergeometric P-value estimator # Paths to temporary files. Note that process executing this program should have write access to these files. my $temp_fasta= "./temp.faa"; my $buffer1= "./buffer1"; my $buffer2= "./buffer2"; # path to the directory containing hmm models my $models= "./models/"; # Help message my $helpm="use: locp.pl [options] filein1 [filein2..]\n". "filein\tinput file in fasta format\n". "options\n". "\t-A value\tsequence alphabet, dna or prot (default)\n". "\t-T value\tminimum open reading frame size (default=50)\n". "\t-S value\tscore cutoff (default 0)\n". "\t-P value\tP-value cutoff (default 1)\n". "\t-P_adj value\tP_adj cufoff (default 0.05)\n". "\t-L value\toperon length cutoff (default 0)\n". "\t-C\t\tanalyse one contigue at a time\n". "\t-V dest\tverbal: prints progress messages to dest (off by default), where dest can be STDOUT or filename\n". "\t-mgap value\tmaximum gap allowed in segments (default 5)\n". "\t-sort key\tsorting results by key: i_begin(default), score, P and P_adj\n". "\t-print mode\tprintout mode: text(default) or html\n". "\t-fileout\tprint to file(s) named by appending \".out\" or \"_out.html\" to input file name(s)\n"; # Default arguments my %opt= ('A'=>'prot', 'T'=>50, 'S'=>0, 'P'=>1, 'P_adj' =>0.05, 'L'=>0, 'C'=> !1, 'V'=> !1, 'maxgap' => 5, 'sort' => 'i_begin', 'print'=> 'text', 'fileout'=>!1); # Reading command line arguments my $web= !1; GetOptions('A=s' => \$opt{'A'}, 'T=i' => \$opt{'T'}, 'S=f' => \$opt{'S'}, 'P=f' => \$opt{'P'}, 'P_adj=f' => \$opt{'P_adj'}, 'L=i' => \$opt{'L'}, 'C' => \$opt{'C'}, 'V=s' => \$opt{'V'}, 'mgap=i' => \$opt{'maxgap'}, 'sort=s' => \$opt{'sort'}, 'print=s' => \$opt{'print'}, 'fileout' => \$opt{'fileout'}, 'web' => \$web); my $IUPAC_prot= 'ARNDCQEGHILKMFPSTWYVX'; my $IUPAC_dna= 'ACGTRYKMSWBDHVN'; # creating handles for log print my $LOGOUT; if($opt{'V'}){ if($opt{'V'} eq 'STDOUT'){ $LOGOUT= \*STDOUT; } else{ open(LOUT,">".$opt{'V'}) or die "Can\'t open file for logging: $opt{'V'}: $!\n"; $LOGOUT= \*LOUT; } } # Checking comamnd line arguments if( !($opt{'sort'} eq 'i_begin' || $opt{'sort'} eq 'score' || $opt{'sort'} eq 'P' || $opt{'sort'} eq 'P_adj') ){ $opt{'sort'}= 'i_begin'; } if( !($opt{'A'} eq 'dna' || $opt{'A'} eq 'prot') ){ $opt{'A'}= 'prot'; } if ( @ARGV == 0 ) { print $helpm; exit 1; } # when printing to STD we append html opening tags before data tables and closing tags after data tables if( ($opt{'print'} eq 'html') && !$opt{'fileout'} ){ print ""; } my $file_count=0; for my $fin(@ARGV){ $opt{'fin'}= $fin; if($opt{'V'}){print $LOGOUT (++$file_count), " analysing ",$opt{'fin'},"\n";} main(\%opt,$web); } if( ($opt{'print'} eq 'html') && !$opt{'fileout'} ){ print ""; } sub main{ my %opt= %{shift()}; my $web= shift(); my $line; my $command; my $nNkK= !1; # HANDLING MAC OS LINE CHANGES open(IN, "<$opt{'fin'}") or die "Can\'t open file $opt{'fin'}: $!\n"; open(OUT, ">$temp_fasta"); while($line=){ $line =~ s/[\r]+/\n/g; print OUT $line; } close(IN);close(OUT); # (1) deleteing/replacing non IUPAC chars from DNA seq (ADD ON FOR METAGENOME ANALYSIS) if($opt{'A'} eq 'dna'){ if($opt{'V'}){print $LOGOUT "# preprocessing dna sequence..";} my $in = Bio::SeqIO->new(-file => "<".$temp_fasta, '-format' => 'Fasta'); my $out = Bio::SeqIO->new(-file => '>'.$buffer1, '-format' => 'Fasta'); while( my $seq = $in->next_seq() ){ # deleting/replacing nonIUPAC my $seqstr= $seq->seq(); $seqstr =~ s/^[^$IUPAC_dna]+//g; $seqstr =~ s/[^$IUPAC_dna]+$//g; $seqstr =~ s/[^$IUPAC_dna]/N/g; my $newseq= Bio::Seq->new( -display_id => $seq->display_id(), -description => $seq->description(), -seq => $seqstr); $out->write_seq($newseq); } if($opt{'V'}){print $LOGOUT "done\n";} $command= "cp -f $buffer1 $temp_fasta"; system($command)==0 or die "exception while running $command: EXITING\n"; } # (1) Converting DNA to ORFs (ADD ON FOR METAGENOME ANALYSIS) if($opt{'A'} eq 'dna'){ if($opt{'V'}){print $LOGOUT "# converting DNA to open reading frames..";} $command= "$get_orf -L $opt{'T'} $temp_fasta > $buffer1"; system($command) ==0 or die "exception while running $command: EXITING\n"; if($opt{'V'}){print $LOGOUT "done\n";} $command= "cp -f $buffer1 $temp_fasta"; system($command)==0 or die "exception while running $command: EXITING\n"; } # (2) Saving annotation, adding running number and deleting/replacing non IUPAC chars my @seqids; my @seqannotations; if($opt{'V'}){print $LOGOUT "# preprocessing protein sequence..";} my $in = Bio::SeqIO->new(-file => '<'.$temp_fasta, '-format' => 'Fasta'); my $out = Bio::SeqIO->new(-file => '>'.$buffer1, '-format' => 'Fasta'); my $seqcount=0; while( my $seq = $in->next_seq() ){ # saving annotation push(@seqids,$seq->display_id()); push(@seqannotations, $seq->description()); # deleting/replacing nonIUPAC: would crash phobius my $seqstr= $seq->seq(); $seqstr =~ s/^[^$IUPAC_prot]+//g; $seqstr =~ s/[^$IUPAC_prot]+$//g; $seqstr =~ s/[^$IUPAC_prot]/X/g; my $newseq= Bio::Seq->new( -display_id => $seq->display_id(), -description => $seq->description(), -seq => $seqstr); # adding running number $newseq->display_id($seqcount++."|".$seq->display_id()); $out->write_seq($newseq); } $command= "cp -f $buffer1 $temp_fasta"; system($command)==0 or die "exception while running $command: EXITING\n"; if($opt{'V'}){print $LOGOUT "done\n";} # (3) Running and parsing HMM models if($opt{'V'}){print $LOGOUT "# running HMMs:\n";} my @models= <$models*.hmm>; my $modelcount= $#models+2; # HMM models plus null-model my $options= "-Z $seqcount -E $seqcount"; my $temphmm= "temp.hmm"; my $hit; my $evalue; my $score; my $search; my $result; my @model_names; my @model_an; # allocating matricies for parsed hit-objects, e-values and score-values my @hit_mat; if($opt{'print'} eq 'html'){ $#hit_mat= $seqcount-1; } my @evalue_mat; $#evalue_mat= $seqcount-1; my @score_mat; $#score_mat= $seqcount-1; for(my $seqi=0; $seqi<$seqcount; $seqi++){ if($opt{'print'} eq 'html'){ my @a; $#a= $modelcount-1; $hit_mat[$seqi]= \@a; } my @b; $#b= $modelcount-1; my @c; $#c= $modelcount-1; for(my $i=0; $i<=($modelcount-1); $i++){ $b[$i]= 1; $c[$i]= 0; } $evalue_mat[$seqi]= \@b; $score_mat[$seqi]= \@c; } # null model scores and e-values, these are set to 0 and the total number of sequences respectively push(@model_names,"null"); push(@model_an,"[none]"); for(my $seqi=0; $seqi<$seqcount; $seqi++){ #$hit_mat[$seqi]->[0]= 0; $score_mat[$seqi]->[0]= 0; $evalue_mat[$seqi]->[0]= $seqcount; } # running hmmsearch and parsing: parsed HMM model hit-objects, scores and e-values for(my $mi=0; $mi<=$#models; $mi++){ if($opt{'V'}){print $LOGOUT "# running $models[$mi]..";} system("$hmmsearch $options $models[$mi] $temp_fasta > $buffer1") ==0 or die "exception while running hmmsearch: EXITING\n"; $search = new Bio::SearchIO(-format => 'hmmer', -file => "$buffer1"); $result = $search->next_result(); push(@model_names,$result->query_name()); push(@model_an,$result->query_accession()); while($hit = $result->next_hit()){ my $id= $hit->name(); my $seqi= (split(/[|]/,$id))[0]; if($opt{'print'} eq 'html'){ $hit_mat[$seqi]->[$mi+1]= $hit; } $score_mat[$seqi]->[$mi+1]= $hit->score(); $evalue_mat[$seqi]->[$mi+1]= $hit->significance(); } if($opt{'V'}){print $LOGOUT "done\n";} } # (4) Sorting hmm:s by bitscore and selecting best hmm for each sequence my @mis; # indices of best models my @hmmhits; # binary vector marking seq with score above 0 (modeled with HMM) my @scores; # scores for the best models my @evalues; # e-values for the best models my @pvalues; # p-values for the best models for(my $seqi=0; $seqi<$seqcount; $seqi++){ my $mi= maxi($score_mat[$seqi]); push(@mis,$mi); push(@hmmhits,($mi==0)? !1 : 1); push(@scores, $score_mat[$seqi]->[$mi]); push(@evalues, $evalue_mat[$seqi]->[$mi]); push(@pvalues, ($evalue_mat[$seqi]->[$mi])/$seqcount); } # (5) Scanning for HMM clusters (==segments) with various maximum gap sizes if($opt{'V'}){print $LOGOUT "# scanning for HMM clusters..";} my @segms; if($opt{'C'}){ my @contigs= getContigs(\@seqids); my @hmmhits_c; # binary array of hits for one contig my @segms_cand; for my $contig(@contigs){ @hmmhits_c= subarray(\@hmmhits,$contig->[0],$contig->[1]); my %segms_found; for(my $gap=0; $gap<=$opt{'maxgap'}; $gap++){ my @segms_cand= get_segments(\@hmmhits_c,$gap); for(my $i=0; $i<=$#segms_cand; $i++){ my $i_begin= ($segms_cand[$i]->[0])+($contig->[0]); my $i_end= ($segms_cand[$i]->[1])+($contig->[0]); if( !defined($segms_found{$i_begin}) ){ my %segm= ('i_begin'=>$i_begin, 'i_end'=> $i_end); push(@segms,\%segm); my %temp= ($i_end => 1); $segms_found{$i_begin}= \%temp; } elsif( !defined($segms_found{$i_begin}->{$i_end}) ){ my %segm= ('i_begin'=>$i_begin, 'i_end'=> $i_end); push(@segms,\%segm); $segms_found{$i_begin}->{$i_end}= 1; } } } } } else{ my %segms_found; my @segms_cand; for(my $gap=0; $gap<=$opt{'maxgap'}; $gap++){ my @segms_cand= get_segments(\@hmmhits,$gap); for(my $i=0; $i<=$#segms_cand; $i++){ my $i_begin= $segms_cand[$i]->[0]; my $i_end= $segms_cand[$i]->[1]; if( !defined($segms_found{$i_begin}) ){ my %segm= ('i_begin'=>$i_begin, 'i_end'=> $i_end); push(@segms,\%segm); my %temp= ($i_end => 1); $segms_found{$i_begin}= \%temp; } elsif( !defined($segms_found{$i_begin}->{$i_end}) ){ my %segm= ('i_begin'=>$i_begin, 'i_end'=> $i_end); push(@segms,\%segm); $segms_found{$i_begin}->{$i_end}= 1; } } } } if($opt{'V'}){print $LOGOUT "done\n";} # (6) Segment scores and p-values if($opt{'V'}){print $LOGOUT "# calculating segment scores..";} for my $segm(@segms){ my @temp= subarray(\@scores,$segm->{'i_begin'},$segm->{'i_end'}); my $score= get_sum( \@temp ); $segm->{'score'}= $score; #@temp= subarray(\@pvalues,$segm->{'i_begin'},$segm->{'i_end'}); #my $pvalue= fishers_method(\@temp); #$segm->{'P'}=$pvalue; } if($opt{'V'}){print $LOGOUT "done\n";} # Calling the c++ hypergeometric P-value estimator if($opt{'V'}){print $LOGOUT "# estimating segment P-values..";} my $N= $#seqids+1; my $K= count(\@hmmhits); open(OUT,">".$buffer1) or die "Can\'t open file $buffer1: $!\n"; print OUT "N\t$N\n"; print OUT "K\t$K\n"; print OUT "maxgap\t$opt{'maxgap'}\n"; print OUT "\n"; print OUT "n\tk\n"; for my $segm(@segms){ my $n= $segm->{'i_end'} - $segm->{'i_begin'}+1; my @temp= subarray(\@hmmhits, $segm->{'i_begin'},$segm->{'i_end'}); my $k= count(\@temp); print OUT "$n\t$k\n"; } print OUT "\n"; close(OUT); my $call= "$hypgeom $buffer1 $buffer2"; system($call) ==0 or die "exeption while executing system call: $call\n"; # Reading in results open(IN,"<".$buffer2) or die "Can\'t open file $buffer2: $!\n"; while($line=){ if($line eq "\n"){ last; } } $line= ; chomp($line); my @splitted= split(/[\t]+/,$line); my %index_of; for(my $i=0; $i<=$#splitted; $i++){ if($splitted[$i] eq 'P'){ $index_of{'P'}= $i; }elsif($splitted[$i] eq 'P_adj'){ $index_of{'P_adj'}= $i; } } for my $segm(@segms){ $line=; chomp($line); @splitted= split(/[\t]+/,$line); $segm->{'P'}= $splitted[$index_of{'P'}]; $segm->{'P_adj'}= $splitted[$index_of{'P_adj'}]; } if($opt{'V'}){print $LOGOUT "done\n";} # (7) Thresholding segments @segms= thresholdPLDRs(\@segms,\%opt); # (8) Filtering top level segments @segms= filtertopwin(\@segms); # (9) Running Phobius on seqs that passed threshold my %phobius_results; my %seqi_passed; my $seqid; my $seqi; my $seq_count=0; for my $segm(@segms){ for($seqi=$segm->{'i_begin'}; $seqi<=$segm->{'i_end'}; $seqi++){ $seqi_passed{$seqi}= 1; $seq_count++; } } if($seq_count>0){ my $fin = Bio::SeqIO->new(-file => '<'.$temp_fasta , '-format' => 'Fasta'); my $fout = Bio::SeqIO->new(-file => '>'.$buffer1, '-format' => 'Fasta'); while( my $seq = $fin->next_seq() ){ $seqid= $seq->display_id(); $seqi= (split(/[|]/,$seqid))[0]; if( defined($seqi_passed{$seqi}) ){ $fout->write_seq($seq); } } if($opt{'V'}){print $LOGOUT "# running phobius..";} system("$phobius $buffer1 >& $buffer2") ==0 or die "exception while running phobius: EXITING\n"; %phobius_results= parsePhobius2PointerTable($buffer2); if($opt{'V'}){print $LOGOUT "done\n";} } # (10) Sorting if($opt{'V'}){print $LOGOUT "# sorting..";} my $k= $opt{'sort'}; if($k eq 'score'){ @segms= sort {$b->{$k} <=> $a->{$k}} @segms; # sorting numerically descending } else{ #i_begin, P and P_adj @segms= sort {$a->{$k} <=> $b->{$k}} @segms; # sorting numerically ascending } if($opt{'V'}){print $LOGOUT "done\n";} # (11) Printing my $h=\*STDOUT; if($opt{'fileout'}){ my $fout= $opt{'fin'}. (($opt{'print'} eq 'html')? '_out.html':'_out'); open(OUT,">".$fout) or die "Can\'t open file $fout: $!\n"; $h= \*OUT; } if($opt{'print'} eq 'html'){ if($opt{'fileout'}){ print $h ""; } htmlprint($h, \%opt, \@seqids, \@seqannotations, \@model_names, \@model_an, \@hmmhits, \@score_mat, \@evalue_mat, \@hit_mat, \@segms, \%phobius_results); if($opt{'fileout'}){ print $h ""; } } else{ if($web){ print $h "
\n";
		}
		textprint($h,
				  \%opt,
				  \@seqids,
				  \@seqannotations,
				  \@model_names,
				  \@model_an,
				  \@hmmhits,
				  \@score_mat,
				  \@evalue_mat,
				  \@segms,
				  \%phobius_results);
		if($web){
			print $h "
\n"; } } } sub textprint{ my $h= shift(@_); my %opt= %{shift(@_)}; my @seqids= @{shift(@_)}; my @seqannotations= @{shift(@_)}; my @model_names= @{shift(@_)}; my @model_an= @{shift(@_)}; my @hmmhits= @{shift(@_)}; my @score_mat= @{shift(@_)}; my @evalue_mat= @{shift(@_)}; my @segms= @{shift(@_)}; my %phobius_results= %{shift(@_)}; my @mis; my $mn= $#model_names + 1; for(my $mi=0; $mi< $mn; $mi++){ push(@mis,$mi); } my $N= $#seqids+1; my $K= count(\@hmmhits); print $h getProgDesc(\%opt); print $h "#N= $N\n"; print $h "#K= $K\n"; print $h "\n"; printf $h "OP:\tstart_i\tend_i\tscore\tP\tP_adj\tpilus\n"; printf $h "SEQ:\ti\tid\tHMMnames\tHMMans\tHMMscores\tHHMevalues\tsignalpep\tC-anchor\n\n"; if($#segms<0){ print $h "# NO PILI WERE DETECTED\n"; } my $seqi=0; # sequence index for my $segm(@segms){ # segment info print $h "OP\t", $segm->{'i_begin'},"\t", $segm->{'i_end'},"\t", $segm->{'score'},"\t", $segm->{'P'},"\t", $segm->{'P_adj'},"\t", (($segm->{'pili'})? 'Y':'N'),"\n"; #sequence info for($seqi=$segm->{'i_begin'}; $seqi<=$segm->{'i_end'}; $seqi++){ my @scores= @{$score_mat[$seqi]}; my @evalues= @{$evalue_mat[$seqi]}; my @mis_sorted= sort {$scores[$b] <=> $scores[$a]} @mis; my @phobius_table= @{$phobius_results{$seqi}}; print $h "SEQ\t"; # seq flag print $h $seqi,"\t", # seq ind $seqids[$seqi]; # seq id my $i2= 0; for (my $i=1; $i<=$#mis_sorted; $i++){ if($scores[$mis_sorted[$i]]<=0){ $i2= $i-1; last; } } print $h "\t",$model_names[$mis_sorted[0]]; # seq HMMnames for (my $i=1; $i<=$i2; $i++){ print $h ",",$model_names[$mis_sorted[$i]]; } print $h "\t",$model_an[$mis_sorted[0]]; # seq HMMans for (my $i=1; $i<=$i2; $i++){ print $h ",",$model_an[$mis_sorted[$i]]; } print $h "\t",$scores[$mis_sorted[0]]; # seq scores for (my $i=1; $i<=$i2; $i++){ print $h ",",$scores[$mis_sorted[$i]]; } print $h "\t",$evalues[$mis_sorted[0]]; # seq HMMevalues for (my $i=1; $i<=$i2; $i++){ print $h ",",$evalues[$mis_sorted[$i]]; } print $h "\t", # seq signalpep (($phobius_table[0]->[1] eq 'SIGNAL')?'Y':'N'); print $h "\t", # seq C-anchor (($phobius_table[$#phobius_table-1]->[1] eq 'TRANSMEM') && ($phobius_table[$#phobius_table]->[4] eq 'CYTOPLASMIC'))?'Y':'N'; print $h "\n"; } print $h "\n"; # separates operon entries for readability } } sub htmlprint{ my $h= shift(@_); my %opt= %{shift(@_)}; my @seqids= @{shift(@_)}; my @seqannotations= @{shift(@_)}; my @model_names= @{shift(@_)}; my @model_an= @{shift(@_)}; my @hmmhits= @{shift(@_)}; my @score_mat= @{shift(@_)}; my @evalue_mat= @{shift(@_)}; my @hit_mat= @{shift(@_)}; my @segms= @{shift(@_)}; my %phobius_results= %{shift(@_)}; my @mis; my $mn= $#model_names + 1; for(my $mi=0; $mi< $mn; $mi++){ push(@mis,$mi); } # JavaScipt function and css classes for opening/closing additional sequence information print $h "\n", "\n"; my $col=8; my $N= $#seqids+1; my $K= count(\@hmmhits); print $h "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n"; print $h "\n"; my $seqi=0; # sequence index my $hsp; # high scoring pair object my $first_segm= 1; if($#segms<0){ print $h "\n"; } for my $segm(@segms){ if($first_segm){ $first_segm= !1; } else{ # empty line between consecutive operons print $h "\n"; } for($seqi=$segm->{'i_begin'}; $seqi<=$segm->{'i_end'}; $seqi++){ my @scores= @{$score_mat[$seqi]}; my @evalues= @{$evalue_mat[$seqi]}; my @hits= @{$hit_mat[$seqi]}; my @mis_sorted= sort {$scores[$b] <=> $scores[$a]} @mis; my @phobius_table= @{$phobius_results{$seqi}}; # basic info, visible by default print $h "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "\n"; # additional info, by default hidden print $h "\n", " \n", "\n"; } } print $h "\n", "
\n", "LOCP parameters", "
", getProgDesc(\%opt),"\n", "
#N= $N\n", "
#K= $K\n", "
Ind\n", "SeqIDTopHMMNameSignalpPepC-anchorScorePP_adj
Ind\n", "SeqIDTopHMMNameSignalpPepC-anchorScorePP_adj

NO PILI WERE DETECTED

",$seqi,"\n", substr($seqids[$seqi],0,20),"",$model_names[$mis_sorted[0]],"",(($phobius_table[0]->[1] eq 'SIGNAL')?'Y':'N'),"",(($phobius_table[$#phobius_table-1]->[1] eq 'TRANSMEM') && ($phobius_table[$#phobius_table]->[4] eq 'CYTOPLASMIC'))?'Y':'N',"",$segm->{'score'},"",$segm->{'P'},"",($segm->{'P_adj'}>0)? $segm->{'P_adj'}: "0/1000","
\n", "

Sequence

\n", " id: $seqids[$seqi]\n", "
annotation: $seqannotations[$seqi]\n", "

HMM hits

\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n"; for my $mi(@mis_sorted){ if($scores[$mi]< 0 || $mi==0){ last; } print $h " \n", " \n", " \n", " \n", " \n"; # printing seq to HMM alignment one hsp at a time print $h " \n", " \n", " \n"; } print $h "
nameassession numberscoree-value
Alignment
",$model_names[$mi],"$model_an[$mi]$scores[$mi]$evalues[$mi]
\n"; my $domaini=1; while( $hsp = $hits[$mi]->next_hsp()){ print $h "

domain $domaini of ", $hits[$mi]->num_hsps(),"

\n", "
",
						 "Seq:",$hsp->hit_string,"\n",
						 "Hom:",$hsp->homology_string,"\n",
						 "HMM:",$hsp->query_string,
						 "
\n"; } print $h "
\n"; print $h "

Phobius

\n"; print $h "\n"; for(my $i=0; $i<=$#phobius_table; $i++){ my @row= @{$phobius_table[$i]}; print $h " \n"; for(my $j=0; $j<=$#row; $j++){ print $h ""; } print $h " \n"; } print $h "
$row[$j]
\n"; print $h "
\n"; } # Returns description of this program sub getProgDesc{ my %opt= %{shift(@_)}; my $html= $opt{'print'} eq 'html'; my $desc= "###################################################\n". (($html)? "
" : "" ) . "# locp.pl\n"; for my $k(keys %opt){ if($html){ $desc.="
"; } if($k eq 'V' || $k eq 'fileout' || $k eq 'C'){ $desc.= "#\t-$k:\t"; $desc.= (($opt{$k})? 'yes':'no'); $desc.= "\n"; } else{ $desc.= "#\t-$k:\t$opt{$k}\n"; } } $desc.= (($html)? "
" : "" ) . "###################################################\n"; return $desc; } # Returns Sum sub get_sum{ my @v= @{$_[0]}; my $s= 0; for my $e(@v){ $s+=$e; } return $s; } # Returns a subarray i1 to i2. sub subarray{ my @a= @{$_[0]}; my $i1= $_[1]; my $i2= $_[2]; my @sa; for(;$i1<=$i2; $i1++){ push(@sa, $a[$i1]); } return @sa; } # Returns number of true values in binary table sub count{ my @a= @{$_[0]}; my $n=0; for my $e(@a){ if($e){ $n++; } } return $n; } # Returns maximum index sub maxi{ my @v= @{$_[0]}; my $k=0; for(my $i=0; $i<=$#v; $i++){ if($v[$i]>$v[$k]){ $k= $i; } } return $k; } # Scans binary array for clusters of consecutive true-values and returns an array structure # specifying the segments found. # parameters: # 1. reference to binary array to be scanned # 2. maximum gap length # returns: # N x 2 matrix C specified as an array of array pointers, where each subarray @{$C[i]} is # a pair of numbers defining the starting, $C[i]->[0], and ending, $C[i]->[1], indices of # segment i. sub get_segments{ my @b= @{$_[0]}; my $maxgap= $_[1]; my @C; my $i1=0; my $i2=0; my $i3=0; loop1:while($i1<=$#b){ #print "$i1\n"; if(!$b[$i1]){ $i1++; } else{ $i2=$i1; loop2:while($i2<=$#b){ #print " $i2\n"; if($b[$i2]){ if($i2==$#b){ my @cluster= ($i1,$i2); push(@C,\@cluster); last loop1; # iterated till the end of array } else{ $i2++; } } else{ $i3=$i2; loop3:while($i3<=$#b){ #print " $i3\n"; if(($i3-$i2)>$maxgap){ my @cluster= ($i1,$i2-1); push(@C,\@cluster); $i1=$i3; last loop2; # back to loop1 } elsif($b[$i3]){ $i2=$i3; last loop3; # back to loop2 } elsif($i3==$#b){ my @cluster= ($i1,$i2-1); push(@C,\@cluster); last loop1; # iterated till the end of array } else{ $i3++; } } #close loop3 } } #close loop2 } } #close loop1 return @C; } # # Returns array specifying first and last indices of all contigs. # Contigs are identified by the first "|"-delimited value in the id field. # sub getContigs{ my @seqids= @{$_[0]}; my @contigs; my $contigid_b; my $contigid; my $i_b; my $i; $i_b= 0; $contigid_b= (split(/[|]/,$seqids[$i_b]))[1]; for($i=$i_b+1; $i<=$#seqids; $i++){ $contigid= (split(/[|]/,$seqids[$i]))[1]; if( !($contigid_b eq $contigid) ){ #print $contigid_b,"!=",$contigid,"\n"; my @contig= ($i_b,($i-1)); push(@contigs,\@contig); $i_b= $i; $contigid_b= $contigid; } } # last segment my @contig2= ($i_b,($i-1)); push(@contigs,\@contig2); return @contigs; } # # This function parses phobius output to a hash of tables, where each table # is implemented as an array of array pointers. # This function assumes that each ID field in phobius output begins with # a unique number followed by | character. This number is used as the key # for the corresponding table in the hash returned. # sub parsePhobius2PointerTable{ my $fin= shift; open(IN,$fin) or die "Can\'t open file $fin: $!\n"; my $line; ;;; # Reference lines my %table; my @record; my $inrecord= !1; my $id; my @splitted; while($line=){ chomp($line); $line =~ s/[.]$//; if($line =~ /^['ID']/){ # entering record @splitted= split(/[' ']+/,$line); @splitted= split(/[|]/,$splitted[1]); $id= $splitted[0]; @record=(); $inrecord= 1; } elsif($line eq '//'){ if($inrecord){ # exiting record my @copy= @record; $table{$id}= \@copy; $inrecord= !1; } } else{ my @row= split(/[' ']{2,}/,$line); push(@record, \@row); } } return %table; } # # Filters out top level windows, these are windows that are not included in any other window. # Takes as argument table of windows. Returns table of top level windows. # sub filtertopwin{ my @table= @{shift(@_)}; my @table_top; row:for my $win1(@table){ col:for my $win2(@table){ if($win1 == $win2){ next col; } if( ($win2->{'i_begin'}) <= ($win1->{'i_begin'}) && ($win1->{'i_end'}) <= ($win2->{'i_end'}) ){ # window 1 is included in the window 2 next row; } } # window is not included in any other window push(@table_top,$win1); } return @table_top; } # # Returns a subset of PLDRs that pass thresholds sub thresholdPLDRs{ my @pldrs= @{shift(@_)}; my %opt= %{shift(@_)}; my @pldrs_th; for my $pldr(@pldrs){ if( ($pldr->{'i_end'} - $pldr->{'i_begin'} + 1) >= $opt{'L'} && $pldr->{'score'} >= $opt{'S'} && $pldr->{'P'} <= $opt{'P'} && $pldr->{'P_adj'} <= $opt{'P_adj'} ){ push(@pldrs_th,$pldr); } } return @pldrs_th; }