#!/usr/bin/perl use Bio::Seq; use Bio::SeqIO; use Getopt::Long; use FindBin qw($Bin); use strict; use warnings; if(@ARGV < 1) { die "fasta-file codon_table codon_file\n"; } &main; #################################################################################################### #main program sub main { my($seqio); my($line) = ""; my($seq) = ""; my($revcomp) = ""; my($head) = "null"; my($len) = ""; my($frame_seq) = ""; my($codonfile) = ""; my($codon) = ""; my($codontab) = 11; my(%frame) = (); my(%result) = (); my(@temptable) = (); my(%codontable) = (); my(%results) = (); my $L= 50; GetOptions('L=i' => \$L); #################################################################################################### #read the given codon table from file if(defined($ARGV[1])) { $codontab = $ARGV[1]; } if(defined($ARGV[2])) { $codonfile = $ARGV[2]; } else { $codonfile = "$Bin/codon_tables_ncbi.txt"; } if(-e $codonfile) { open(TABLE, $codonfile) or die "$codonfile not opened\n"; while($line = ) { if($line =~ m/\#$codontab\./) { for(my($i) = 0; $i < 7; ++$i) { $line =
; $line =~ s/^.*=\s*//g; $line =~ s/\s+$//g; if(length($line) > 0) { #print "$line\n"; push(@temptable, $line); } } } } close TABLE; if(scalar(@temptable) > 0) { for(my($i) = 0; $i < length($temptable[0]); ++$i) { $codon = substr($temptable[2], $i, 1)."".substr($temptable[3], $i, 1)."".substr($temptable[4], $i, 1); $codontable{$codon}{'aa'} = substr($temptable[0], $i, 1); $codontable{$codon}{'start'} = substr($temptable[1], $i, 1); #print "$codon\t".substr($temptable[1], $i, 1)."\t".substr($temptable[0], $i, 1)."\n"; } } } else { die "$codonfile not found\n"; } #################################################################################################### #read sequences if(-e $ARGV[0]) { $seqio = Bio::SeqIO->new( '-format' => 'fasta' , -file => $ARGV[0]); } else { die "$ARGV[0] not found\n"; } while ( my $seqobj = $seqio->next_seq ) { $len = $seqobj->length(); $head = $seqobj->id(); $seq = uc($seqobj->seq()); $revcomp = reverse($seq); $revcomp =~ tr/ACGT/TGCA/; %results = (); for(my($i) = 0; $i < 3; ++$i) { $frame_seq = substr($seq, $i); &translate($frame_seq, \%codontable, \%result, $head, ($i+1), \%results, $L); $frame_seq = substr($revcomp, $i); &translate($frame_seq, \%codontable, \%result, $head, (($i+1)*-1), \%results, $L); } for my $key (sort { $a <=> $b; } (keys(%results))) { print "$results{$key}{'header'}"; print "$results{$key}{'seq'}"; } } #################################################################################################### }; #################################################################################################### #finds always the longest AA-sequence sub translate { my($seq, $conv_table, $result, $name, $frame, $res, $L) = @_; my($piece) = ""; my($aa_seq) = ""; my($last) = ""; my($orf) = ""; my($pos) = 0; my($end_char) = ""; my($start_end_string) = ""; my($len) = length($seq); ################################################################################################## #convert to aa (for Emmanuel, start codons must be stored into another hash as otherwise #all start are assumed to encode M, that is not true $aa_seq = "X"; $start_end_string = "M"; for(my($i) = 0; $i < length($seq); $i = $i + 3) { $piece = substr($seq, $i, 3); if(length($piece) == 3 ) { if(exists(${$conv_table}{$piece})) { $aa_seq .= ${$conv_table}{$piece}{'aa'}; if((${$conv_table}{$piece}{'start'} eq 'M')) { $start_end_string .= "M"; } elsif(${$conv_table}{$piece}{'aa'} eq '*') { $start_end_string .= "*"; } else { $start_end_string .= "-"; } } else { $aa_seq .= "X"; $start_end_string .= "-"; } } } $aa_seq .= "*"; $start_end_string .= "*"; ################################################################################################### #get all orf longer than $L aa (gives the longest possible orf-always!!!) #print "$aa_seq\n\n"; #print "$start_end_string\n\n"; my($temp_aa) = ""; my($start_map) = ""; my($start_coord) = ""; my($alt_start) = ""; my($sub_seq) = 0; my($namecounter) = 0; while($start_end_string =~ m/M.*?\*/g) { if(length($&) >= $L) { $start_map = $&; if($frame <= 0) { $start_coord = ($len+abs($frame)) - ((pos($start_end_string)*3)+abs($frame)) + 4 #extra codon; } else { $start_coord = (pos($start_end_string)*3)-(length($&)*3)+abs($frame) - 3; } $temp_aa = substr($aa_seq, pos($start_end_string)-length($&), length($&)); $temp_aa =~ s/^./M/; $temp_aa =~ s/.{60}/$&\n/g; $temp_aa =~ s/^\s+|\s+$//g; if(!(exists(${$res}{"$start_coord.0"}))) { ${$res}{"$start_coord.0"}{'header'} = ">$name\_orf_frame:$frame\_pos:$start_coord\_len:".length($start_map)." X ".length($aa_seq)."\n"; ${$res}{"$start_coord.0"}{'seq'} = "$temp_aa\n"; #print "NON: $name\_orf_frame:$frame\_pos:$start_coord\_len:".length($start_map)."\n"; } else { $namecounter = 0; while((exists(${$res}{$start_coord.".".$namecounter}))) { ++$namecounter; } ${$res}{$start_coord.".".$namecounter}{'header'} = ">$name\_orf_frame:$frame\_pos:$start_coord\_len:".length($start_map)."\n"; ${$res}{$start_coord.".".$namecounter}{'seq'} = "$temp_aa\n"; #print STDERR "DUBLICATED: $namecounter $name\_orf_frame:$frame\_pos:$start_coord\_len:".length($start_map)."\n"; } } } }; ##################################################################################################### #