main();
#--------------------------------------------------------------------
sub main
#--------------------------------------------------------------------
{
if (@ARGV != 2) {
print
"Usage: $0 <tracefile>
<plusexpfile>\n",
"Takes a single trace
file created by ABI and looks at the position of the\n",
"indicated SNP. Output
includes two haplotpyes indicating the SNP allele\n",
"associated with the
indel plus and minus alleles.\n";
exit 0;
}
#adjusting working environment
my ($tracefile, $plusexpfile) = @ARGV;
my $expResult = Read_Exp($plusexpfile);
my $traceResult = Read_Trace($tracefile);
my $bestmatch = Align($traceResult->{base1Ref}, $expResult->{EXPRef},
$expResult->{startdel});
if ($bestmatch == 0) {
print "unable to locate the deletion
site in the trace file\n";
die;
}
my $align = $bestmatch - $expResult->{startdel};
print "hence, to align the expected and the trace sequences
a $align\-base difference is accounted for\.\n";
#analyze SNPs
for (my $SNP=1; $SNP <= $expResult->{SNPtotal}; $SNP++)
{
if ($expResult->{SNPBaseRef}->[$SNP]
> $expResult->{startdel}) {
print "--------------------------------------------------------\n";
print "Analysis for
SNP$SNP, alleles $expResult->{AllelesARef}->[$SNP] and $expResult->{AllelesBRef}->[$SNP]\n";
print "--------------------------------------------------------\n";
GenotypeSNP("Del",
$expResult->{EXPRef}, $traceResult->{base1Ref}, $traceResult->{base2Ref},
$expResult->{SNPBaseRef}, $align, $SNP, $expResult->{AllelesARef}->[$SNP],
$expResult->{AllelesBRef}->[$SNP], $expResult->{shift_size});
GenotypeSNP("Ins",
$expResult->{EXPRef}, $traceResult->{base1Ref}, $traceResult->{base2Ref},
$expResult->{SNPBaseRef}, $align, $SNP, $expResult->{AllelesARef}->[$SNP],
$expResult->{AllelesBRef}->[$SNP], $expResult->{shift_size});
}
}
} #main end
#--------------------------------------------------------------------
sub Read_Exp
#--------------------------------------------------------------------
{
my ($plusexpfile) =@_;
my $expbase=0;
my $SNP=0;
my $SNPflag=0;
my $startdel;
my $shift_size=0;
my @EXP;
my @SNPBase;
my @AllelesA;
my @AllelesB;
$EXP[0]=0;
open (EXP, $plusexpfile) or die $!;
while (<EXP>) {
chomp;
foreach my $char (split / */) {
$expbase++;
$EXP[$expbase] = uc($char);
#print "base $expbase
is $EXP[$expbase]";
if (ord($EXP[$expbase])>125)
{
$expbase--;
}
if (ord($EXP[$expbase])==
47) {
$expbase++;
}
if (ord($EXP[$expbase])<65)
{
$expbase--;
}
if ((ord($EXP[2])==
66) or (ord($EXP[1])== 46)) {
print "Expected sequence is badly formatted, ABI or SCF trace file?\n";
die;
}
if ($SNPflag == 4)
{
$expbase--;
$SNPflag=0;
}
if ($SNPflag == 3)
{
$AllelesB[$SNP] = $EXP[$expbase];
$SNPflag=4;
$expbase--;
print "SNP$SNP is at base $SNPBase[$SNP], alleles $AllelesA[$SNP] and $AllelesB[$SNP]\.\n";
}
if ($SNPflag == 2)
{
$expbase--;
$SNPflag=3;
}
if ($SNPflag == 1)
{
$AllelesA[$SNP] = $EXP[$expbase];
$SNPBase[$SNP] = $expbase;
$SNPflag=2;
}
if (ord($char) ==
123) {
$expbase--;
$startdel=$expbase;
}
if (ord($char) ==
125) {
$expbase--;
$shift_size=$expbase-$startdel;
print "In the expected sequence the deletion is after base $startdel, indel
size $shift_size bases\.\n";
}
if (ord($char) ==
91) {
$SNP++;
$SNPflag=1;
$expbase--;
}
}
}
close EXP;
if ($SNP*$shift_size < 1) {
print "Expected sequence is badly formatted\n";
die;
}
my %result = (
SNPtotal => $SNP,
shift_size => $shift_size,
startdel => $startdel,
EXPRef => \@EXP,
SNPBaseRef => \@SNPBase,
AllelesARef => \@AllelesA,
AllelesBRef => \@AllelesB,
);
return \%result;
} #Read_Exp ends
#--------------------------------------------------------------------
sub Read_Trace
#--------------------------------------------------------------------
{
my ($tracefile) =@_;
my $temp_poly_file = "$tracefile".'.poly';
my @base1;
my @base2;
my @q_base1;
my @q_base2;
$ENV{PHRED_PARAMETER_FILE} = $PHRED_DAT;
system("$PHRED_EXE -nosplit -dd $temp_path $tracefile");
open (PH, $temp_poly_file) or die $!;
my @PH=<PH>;
shift @PH;
my $i=0;
for (@PH) {
$i++;
my @chars= split / /;
$base1[$i] = $chars[0];
$q_base1[$i] = $chars[3];
$base2[$i] = $chars[4];
$q_base2[$i] = $chars[7];
if ($base2[$i] eq "N") {
$base2[$i] = $base1[$i];
$q_base2[$i] = $q_base1[$i];
}
}
close PH;
my %result = (
base1Ref => \@base1,
base2Ref => \@base2,
);
return \%result;
} # Read_Trace ends
#--------------------------------------------------------------------
sub Align
#--------------------------------------------------------------------
{
my ($base1Ref,$EXPRef,$startdel) = @_;
my $base_sign1=$EXPRef->[$startdel-4].$EXPRef->[$startdel-3].$EXPRef->[$startdel-2].$EXPRef->[$startdel-1].$EXPRef->[$startdel];
my $base_sign2=$EXPRef->[$startdel-9].$EXPRef->[$startdel-8].$EXPRef->[$startdel-7].$EXPRef->[$startdel-6].$EXPRef->[$startdel-5];
my $base_sign3=$EXPRef->[$startdel-14].$EXPRef->[$startdel-13].$EXPRef->[$startdel-12].$EXPRef->[$startdel-11].$EXPRef->[$startdel-10];
my $base_sign4=$EXPRef->[$startdel-19].$EXPRef->[$startdel-18].$EXPRef->[$startdel-17].$EXPRef->[$startdel-16].$EXPRef->[$startdel-15];
my @matches1;
my @matches2;
my @matches3;
my @matches4;
$matches1[1]=0;
$matches2[1]=0;
$matches3[1]=0;
$matches4[1]=0;
my $match1=0;
my $match2=0;
my $match3=0;
my $match4=0;
my $matched_base=20;
while (<@$base1Ref>) {
$matched_base++;
my $trace_sign = $base1Ref->[$matched_base-4].$base1Ref->[$matched_base-3].$base1Ref->[$matched_base-2].$base1Ref->[$matched_base-1].$base1Ref->[$matched_base];
if ($base_sign1 eq $trace_sign) {
$match1++;
$matches1[$match1]=$matched_base;
}
if ($base_sign2 eq $trace_sign) {
$match2++;
$matches2[$match2]=$matched_base+5;
}
if ($base_sign3 eq $trace_sign) {
$match3++;
$matches3[$match3]=$matched_base+10;
}
if ($base_sign4 eq $trace_sign) {
$match4++;
$matches4[$match4]=$matched_base+15;
}
}
#calculate score
if ($matches1[1]+$matches2[1]+$matches3[1]+$matches4[1]
== 0) {
print "unable to locate the deletion
site in the trace file\n";
die;
}
my $score=0;
my $bestmatch=0;
my $bestscore=0;
while (<@matches1>) {
my $currentmatch=$_;
$score=40;
while (<@matches2>) {
$score=$score+30/(abs($_-$currentmatch)+0.1);
}
while (<@matches3>) {
$score=$score+20/(abs($_-$currentmatch)+0.1);
}
while (<@matches4>) {
$score=$score+10/(abs($_-$currentmatch)+0.1);
}
if ($score > $bestscore) {
$bestscore=$score;
$bestmatch = $currentmatch;
}
}
while (<@matches2>) {
my $currentmatch=$_;
$score=30;
while (<@matches3>) {
$score=$score+20/(abs($_-$currentmatch)+0.1);
}
while (<@matches4>) {
$score=$score+10/(abs($_-$currentmatch)+0.1);
}
if ($score > $bestscore) {
$bestmatch = $currentmatch;
}
}
while (<@matches3>) {
my $currentmatch=$_;
$score=20;
while (<@matches4>) {
$score=$score+10/(abs($_-$currentmatch)+0.1);
}
if ($score > $bestscore) {
$bestmatch = $currentmatch;
}
}
while (<@matches4>) {
my $currentmatch=$_;
$score=10;
if ($score > $bestscore) {
$bestmatch = $currentmatch;
}
}
print "In the trace file the deletion site detected at
position $bestmatch, score ";
printf("%.1f",$bestscore);
print " (lower than 600 may indicate low trace file quality)\n";
return $bestmatch;
}
#--------------------------------------------------------------------
sub GenotypeSNP
#--------------------------------------------------------------------
{
my ($type, $EXPRef, $base1Ref, $base2Ref, $SNPBaseRef,
$align, $SNP, $AllelesA, $AllelesB, $shift_size) = @_;
#$type == "Ins" or "Del"
print "Finding the SNP$SNP allele that is located on the
$type chromosome\n";
my $inverseType = $type eq "Ins" ? "Del" : "Ins";
my $genotypeSNP = 'N';
my $traceSNPbase = $SNPBaseRef->[$SNP] + $align;
if ($type eq "Del") {
$shift_size = -$shift_size;
$traceSNPbase += $shift_size;
}
my $delbase = $EXPRef->[$SNPBaseRef->[$SNP]+$shift_size];
print "The expected pair of bases before the SNP site is
$EXPRef->[$SNPBaseRef->[$SNP]+$shift_size-1] $EXPRef->[$SNPBaseRef->[$SNP]-1]\n";
print "In the trace, the pairs of bases are:\n";
for (my $i = -2; $i <= 2; $i++) {
my $base = $traceSNPbase + $i;
print "$base base1 $base1Ref->[$base]
base2 $base2Ref->[$base]";
if ($i == 0) {
print " are in the
expected SNP site";
}
print "\n";
}
#check validity
my $base_beforeEXP = ord($EXPRef->[$SNPBaseRef->[$SNP]+$shift_size-1])
+ ord($EXPRef->[$SNPBaseRef->[$SNP]-1]);
my $base_beforeTrace = ord($base1Ref->[$traceSNPbase-1])+ord($base2Ref->[$traceSNPbase-1]);
my $validflag = 0;
if ($base_beforeEXP != $base_beforeTrace) {
$validflag = 10;
my $bibase_beforeTrace = ord($base1Ref->[$traceSNPbase-2])
+ ord($base2Ref->[$traceSNPbase-2]);
my $base_afterTrace = ord($base1Ref->[$traceSNPbase])
+ ord($base2Ref->[$traceSNPbase]);
if ($bibase_beforeTrace != $base_afterTrace)
{
if ($base_beforeEXP
== $bibase_beforeTrace) {
$validflag = -1;
}
if ($base_beforeEXP
== $base_afterTrace) {
$validflag = 1;
}
}
}
if ($validflag == 10) {
print "Haplotype 1 is: can not align
sequences\n";
} else {
$traceSNPbase += $validflag;
print "The pair of bases that should
precede the site of SNP on the $type chromosome was located at $validflag
bases from the expected position \n";
print "hence, the site of SNP for the
$type chromosome is at position $traceSNPbase base1 $base1Ref->[$traceSNPbase]
base2 $base2Ref->[$traceSNPbase] of these the expected base for the $inverseType
chromosome is $delbase\n";
if ($base1Ref->[$traceSNPbase] eq $delbase)
{
$genotypeSNP = $base2Ref->[$traceSNPbase];
} else {
$genotypeSNP = $base1Ref->[$traceSNPbase];
}
if ($genotypeSNP eq $AllelesA or $genotypeSNP
eq $AllelesB) {
if ($type eq "Ins")
{
print "Haplotype 2 is: Ins, $genotypeSNP\n";
} else {
print "Haplotype 1 is: Del, $genotypeSNP\n";
$EXPRef->[$SNPBaseRef->[$SNP]]=$genotypeSNP;
}
} else {
if ($type eq "Ins")
{
print "Haplotype 2 is: can not align sequences\n";
} else {
print "Haplotype 1 is: can not align sequences\n";
}
}
}
print "______________________________________\n";
}