#!/usr/local/bin/perl
#
#
# Ver 2.0 by
# Eyal Seroussi <seroussi@agri.huji.ac.il>
# Darek Kedra <darked@burnham.org>
#
#
#--------------------------------------------------------------------
#these three are platform dependent:
#
my $PHRED_EXE = '/usr/local/bin/phred';
my $PHRED_DAT = '/usr/local/etc/phredpar.dat';
my $temp_path = '/tmp/frsh_files/';
# default values possible to be modified
my $MIN_SHIFT_SIZE = 1;
my $MAX_SHIFT_SIZE = 25;
#--------------------------------------------------------------------
unless (@ARGV == 5)
{
print <<EOH;
Usage: frsh tracefile user_name expect first-base last-base
Takes a single trace file created by ABI and looks for a frameshift.
Out-put includes frameshift size and the putative sequence of the molecule.
EOH
exit 0;
}
unless ($ARGV[3] < $ARGV[4])
{
print <<EOH;
Wrong cut-off data.
EOH
exit 0;
}
#--------------------------------------------------------------------
#main
#adjusting working environment
my $ABI_file = $ARGV[0];
my $file_suffix = $ARGV[1];
my $expect = $ARGV[2];
my $fbase = $ARGV[3];
my $lbase = $ARGV[4];
my $temp_ABI_file = "$temp_path".'frshtrace'."$file_suffix";
my $temp_poly_file = "$temp_ABI_file".'.poly';
my $fof_name = "$temp_path"."$file_suffix".'.fof';
system("rm -f $fof_name");
system("touch $fof_name");
system("cp $ABI_file $temp_ABI_file");
run_phred();
read_poly();
#Screen for shifts in range of 1 to 25 bases using Check_Shift subroutine
for ($shift_size = $MIN_SHIFT_SIZE; $shift_size <= $MAX_SHIFT_SIZE;
$shift_size++ )
{
Check_Shift();
print "\n";
}
#main ends
#--------------------------------------------------------------------
sub run_phred
#--------------------------------------------------------------------
{
$ENV{PHRED_PARAMETER_FILE} = $PHRED_DAT;
system("$PHRED_EXE -dd $temp_path $temp_ABI_file");
}#sub run_phred ends
#--------------------------------------------------------------------
sub read_poly
#--------------------------------------------------------------------
{
open(POLY_FILE,"$temp_poly_file") or die $!;
my @POLY_FILE_ARRAY = <POLY_FILE>;
close POLY_FILE;
shift @POLY_FILE_ARRAY;
my $base_number = 1;
for (@POLY_FILE_ARRAY)
{
#preparing bases for comparison
@line_record = split / /;
if ($base_number <= $lbase)
{
if ($base_number >= $fbase)
{
my $pbase_number=
$base_number-$fbase+1;
$base1[$pbase_number]
= $line_record[0];
$base1_ra[$pbase_number]
= $line_record[3];
$base2[$pbase_number]
= $line_record[4];
$base2_ra[$pbase_number]
= $line_record[7];
if ($base2[$pbase_number]
eq "N")
{
$base2[$pbase_number] = $base1[$pbase_number];
$base2_ra[$pbase_number] = $base1_ra[$pbase_number];
}
}
}
$base_number++;
} #end for (@POLY_FILE_ARRAY)
}#sub read_poly ends
#--------------------------------------------------------------------
sub Check_Shift
#--------------------------------------------------------------------
{
for (my $base_number = 1; $base_number < $#base1; $base_number++)
{
$prediction1[$base_number]
= $prediction2[$base_number] = "N";
$prediction1_ra[$base_number] = $prediction2_ra[$base_number]
= $taken[$base_number] = 0;
}
#The @taken array acts as an array of flags. If a base was taken to
#prediction 1 the program should know it in the future to improve its
predictions.
for ($base_number = 1; $base_number < $#base1; $base_number++)
{
#Making the match across $shift_size, if there
is one, it is stored in the
#prediction sequences:
#there are 4 possibilities for each position:
A, B, C, D.
#start matching only if the molecule is long
enough. (4 possibilities:)
if ($base_number > $shift_size + 1 )
{
#(long enough:)
my $offset_base_no = $base_number
- $shift_size;
my $current_base1 = $base1[$base_number];
my $current_base1_ra = $base1_ra[$base_number];
my $current_base2 = $base2[$base_number];
my $current_base2_ra = $base2_ra[$base_number];
###########################################################################
#A- the first bases of the
2 calls are identical B- the first base and the second one are equal:
if (($current_base1 eq $base1[$offset_base_no])
or ($current_base1 eq $base2[$offset_base_no]))
{
#(possibility
A and B:)
$prediction1[$base_number]
= $current_base1;
$prediction1_ra[$base_number]
= $current_base1_ra;
$prediction2[$base_number]
= $current_base2;
$prediction2_ra[$base_number]
= $current_base2_ra;
$taken[$base_number]
= 1;
}#(possibility
A and B)
#C- the second base and
the first one are equal:
#should be checked only
if there is a real double peak at the first call
if ($current_base2 ne $current_base1)
{
#(double
peak:)
if ($current_base2
eq $base1[$offset_base_no])
{
#(possibility C:)
if ($prediction1[$base_number] ne "N")
{
#if there is no "N" there is already prediction for that base.
#it means that there are 2 logical predictions for this
#base and the program annotates that using the word "or"
#the program then try to decide which is better and put it in first place
#it takes into consideration if the base from the second call has
#already been assigned (two solutions by C:)
if ($taken[$offset_base_no] == 0)
{
#if the "taken" flag is not raised it means that both bases are possible
and
#the high quality one will be first (not taken:)
$prediction1[$base_number] = $prediction1[$base_number].' or '.$current_base2;
$prediction2[$base_number] = $prediction1[$base_number];
}#not taken
else
{
#if the "taken" flag is raised it is less likely that base 1 is the
#right one as it is already taken to another position
#and the places should be switched unless the quality of that second
#base is really dubious (taken)
if (($base1_ra[$offset_base_no] / $base2_ra[$offset_base_no])>30)
{
#quality dubious don't switch
$prediction1[$base_number] = $prediction1[$base_number].' or '.$current_base2;
$prediction2[$base_number] = $prediction1[$base_number];
}
else
{
#quality OK to switch
$prediction1[$base_number] = $current_base2.' or '.$prediction1[$base_number];
$prediction2[$base_number] = $prediction1[$base_number];
}
}#taken
}#two solutions by C
else
{
#(one solution by C:)
$prediction1[$base_number] = $current_base2;
$prediction1_ra[$base_number] = $current_base2_ra;
$prediction2[$base_number] = $current_base1;
$prediction2_ra[$base_number] = $current_base1_ra;
$taken[$base_number] = 1;
}#(one solution by C)
}#(possibility C ends)
#D- the second bases of the 2 calls are identical:
if ($base1[$offset_base_no] ne $base2[$offset_base_no])
{
#(double peak in second call:)
if ($current_base2 eq $base2[$offset_base_no])
{
#(possibility D:)
if ($prediction1[$base_number] ne "N")
{
#(two solutions by D:)
if ($taken[$offset_base_no] == 0)
{
$prediction1[$base_number] = $prediction1[$base_number].' or '.$current_base2;
$prediction2[$base_number] = $prediction1[$base_number];
}
else
{
if (($base1_ra[$offset_base_no]/$base2_ra[$offset_base_no])>30)
{
$prediction1[$base_number] = $prediction1[$base_number].' or '.$current_base2;
$prediction2[$base_number] = $prediction1[$base_number];
}
else {
$prediction1[$base_number] = $current_base2.' or '.$prediction1[$base_number];
$prediction2[$base_number] = $prediction1[$base_number];
}
}
}#(two solutions by D)
else
{
#(one solution by D:)
$prediction1[$base_number] = $current_base2;
$prediction1_ra[$base_number] = $current_base2_ra;
$prediction2[$base_number] = $current_base1;
$prediction2_ra[$base_number] = $current_base1_ra;
$taken[$base_number] = 1;
}#(one solution by D)
}#(possibility D ends)
}#(double peak in second call)
}#(double
peak)
}#(long enough)
#changing the format to
single letter format
$prediction1[$base_number]
= single_letter_code($prediction1[$base_number]);
$prediction2[$base_number]
= single_letter_code($prediction2[$base_number]);
}#(4 possibilities:)
calc_probability();
}# sub Check_Shift ends
###########################################################################
#--------------------------------------------------------------------
sub calc_probability
#--------------------------------------------------------------------
{
#finding the site of shift
#preparing working environment, the variable min_shift_pro indicates
the existence
#of a shift when small than 0.001. It is calculated for each position
by comparing
#prediction 1 and 2 across the shift. Since the chance to a similar
pair is 0.25
#(4 pairs of 16 possibilities) this variable is divided by 4 when a
match
#is encountered. This variable is multiplied by 4, which is an empirical
penalty
#value for a mismatch. A fail to predict (letter N in prediction) is
#regarded as a mismatch.
my $shift_found = 'none';
my $min_probab_found = 'none';
my $min_shift_pro = 1;
for ( my $base_number = 1; ($base_number < $#prediction1 -
(10 + $shift_size)); $base_number++)
{
#(estimating min_shift_pro:)
$probability[$base_number] = 1;
for (my $base_offset = 0; $base_offset < 10;
$base_offset++ )
{
#estimating shift probability
p[$base_number] by 10 bases
if ($prediction1[$base_number
+ $shift_size + $base_offset] eq 'N')
{
$probability[$base_number]
= $probability[$base_number]*4;
}
else
{
if ($prediction1[$base_number
+ $shift_size + $base_offset] eq $prediction2[$base_number + $base_offset])
{
$probability[$base_number] = $probability[$base_number]*0.25;
}
else
{
$probability[$base_number] = $probability[$base_number]*4;
}
}
}
# print "$base_number p=$probability[$base_number]\n";
if ($probability[$base_number]
< $min_shift_pro)
{
$min_shift_pro
= $probability[$base_number];
if ($probability[$base_number]
< 0.001)
{
$shift_found = 'yes';
}
}
if ($min_probab_found eq
'none')
{
if ($shift_found
eq 'yes')
{
if ($probability[$base_number] >= $probability[$base_number-1])
{
$min_probab_found = 'yes';
#recording the site of shift to be reported
$shift_start_pro = $probability[$base_number];
$shift_start = $base_number + $shift_size;
}
}
}
}#(estimating min_shift_pro ends)
###################################################################################
#print summary
#result are reported only if $min_shift_pro is less then expect value
somewhere
#along the predicted molecule. However
#in such case the start of shift will be reported at first place
#that probability dropped below 0.001
$f_min_shift_pro = sprintf "%0.2e",$min_shift_pro;
if ($min_shift_pro > $expect) {$shift_found = 'none';}
if ($shift_found eq 'yes')
{
$f_min_shift_pro = sprintf "%0.2e",$min_shift_pro;
$f_shift_start_pro = sprintf "%0.2e",$shift_start_pro;
print "Shift of $shift_size nucleotides was detected (Expect
$f_min_shift_pro)\n";
print "Shift starts at analyzed base $shift_start : probability
score $f_shift_start_pro\n";
print "The predicted sequence of the molecule following
the shift is:\n";
$fasta_file="$temp_path".$file_suffix.'_'."$shift_size".'.fasta';
open (FASTA_FILE,"> $fasta_file") or die $!;
print {FASTA_FILE} "> $file_suffix.$shift_size\n";
for ($i = $shift_start + 1; $i < $#prediction1; $i++)
{
print "$prediction1[$i]";
print {FASTA_FILE} "$prediction1[$i]";
if ((($i - $shift_start) % 60)
== 0){print {FASTA_FILE} "\n"; print "\n";}
}
print{FASTA_FILE} "\n";
print "\n";
close FASTA_FILE;
$ID = "$file_suffix".'_'."$shift_size";
fasta2exp($fasta_file, $ID);
}
else
{
print "Checks for $shift_size nucleotide shift : No shift
found (Expect $f_min_shift_pro)\n";
}
} #sub calc_probability ends
#--------------------------------------------------------------------
sub single_letter_code
#--------------------------------------------------------------------
{
my $input_base = $_[0];
my %bases_hash =
(
'A or G' => "R",
'G or A' => "r",
'A or C' => "M",
'C or A' => "m",
'A or T' => "W",
'T or A' => "w",
'C or G' => "S",
'G or C' => "s",
'C or T' => "Y",
'T or C' => "y",
'G or T' => "K",
'T or G' => "k",
'A' => 'A',
'C' => 'C',
'G' => 'G',
'T' => 'T',
'N' => 'N'
);
if (exists ($bases_hash{$input_base}))
{
$output_base = $bases_hash{$input_base};
}
else
{
print "single_letter_code_sub -> bad base: $input_base\n"
}
return $output_base;
}#sub single_letter_code ends
#--------------------------------------------------------------------
sub fasta2exp
#--------------------------------------------------------------------
{
my ($fasta_file, $ID) = @_;
my $exp_file = "$temp_path"."$ID".'.exp';
%IUPAC_hash =
(
#iupac base, regular base, tag
'R' => ['A' , 'A or G'],
'r' => ['G' , 'A or G'],
'M' => ['A' , 'A or C'],
'm' => ['C' , 'A or C'],
'H' => ['N' , 'C,A or T '],
'h' => ['N' , 'C,A or T '],
'W' => ['A' , 'A or T'],
'w' => ['T' , 'A or T'],
'D' => ['N' , 'A,G or T '],
'd' => ['N' , 'A,G or T '],
'S' => ['C' , 'C or G'],
's' => ['G' , 'C or G'],
'B' => ['N' , 'C,G or T '],
'b' => ['N' , 'C,G or T '],
'Y' => ['C' , 'C or T'],
'y' => ['T' , 'C or T'],
'N' => ['N' , 'A,G,T or C'],
'n' => ['N' , 'A,G,T or C'],
'K' => ['G' , 'G or T'],
'k' => ['T' , 'G or T'],
'V' => ['N' , 'A,C or G '],
'v' => ['N' , 'A,C or G '],
'A' => ['A' , 'none' ],
'C' => ['C' , 'none' ],
'G' => ['G' , 'none' ],
'T' => ['T' , 'none' ]
);
open (FASTA_FILE, $fasta_file) or die $!;
my @fasta_lines = <FASTA_FILE>;
close FASTA_FILE;
shift @fasta_lines;
my (@exp_tags, @exp_sequence);
for (@fasta_lines)
{
chomp;
@input_bases = split //;
for my $input_base (@input_bases)
{
if (exists ($IUPAC_hash{$input_base}))
{
my @values
= @{$IUPAC_hash{$input_base}};
my $output_base
= $values[0];
my $output_tag
= $values[1];
push @exp_sequence,
$output_base;
push @exp_tags,
$output_tag;
}
else {print " fasta2exp:
bad base: $input_base\n"}
}
}
$exp[0]= 'ID '."$ID";
$exp[1]= 'EN '."$ID";
$exp[2]= 'LN '."$ID";
$exp[3]= 'LT PLN';
$exp[4]= 'QR '."$#exp_sequence";
$exp[5]= 'AQ 0.000000';
$exp[6]= 'SQ';
$exp[7]= ' ';
$line_exp_no = 7;
for ($i = 1; $i <= $#exp_sequence; $i++)
{
$exp[$line_exp_no] = "$exp[$line_exp_no]"."$exp_sequence[$i]";
if (($i % 60) == 0)
{
$line_exp_no++;
$exp[$line_exp_no]='
';
}
elsif (($i % 10) == 0){$exp[$line_exp_no] = "$exp[$line_exp_no]".'
'; }
}
$line_exp_no++;
$exp[$line_exp_no] = '//';
for ($i = 1; $i <= $#exp_sequence; $i++)
{
if ($exp_tags[$i] ne 'none')
{
$line_exp_no++;
$exp[$line_exp_no] = 'TG
COMM + '."$i".'..'."$i";
$line_exp_no++;
$exp[$line_exp_no] = 'TG
'."$exp_tags[$i]";
}
}
open (EXP_FILE,">$exp_file") or die $!;
for (@exp)
{
print {EXP_FILE} "$_";
print {EXP_FILE} "\n";
}
close EXP_FILE;
open (FOF_FILE,">>$fof_name") or die $!;
print {FOF_FILE} "$ID\n";
close FOF_FILE;
} #sub fasta2exp ends