#!/usr/local/bin/perl
#
#
# Ver 1.1 by
# Yanir Seroussi and
# Eyal Seroussi <seroussi@agri.huji.ac.il>
#
#--------------------------------------------------------------------
#these three  are platform dependent:
my $PHRED_EXE  =  '/cowry_data/PHRED.DIR/phred';
my $PHRED_DAT  =  '/cowry_data/PHRED.DIR/phredpar.dat';
my $temp_path  =  '/cowry_data/WEB.DIR/';

main();

#--------------------------------------------------------------------
sub main
#--------------------------------------------------------------------
{
   if (@ARGV != 2) {
      print
         "Usage: $0 <tracefile> <plusexpfile>\n",
         "Takes a single trace file created by ABI and looks at the position of the\n",
         "indicated SNP. Output includes two haplotpyes indicating the SNP allele\n",
         "associated with the indel plus and minus alleles.\n";
      exit 0;
   }

   #adjusting working environment
   my ($tracefile, $plusexpfile) = @ARGV;

   my $expResult = Read_Exp($plusexpfile);
   my $traceResult = Read_Trace($tracefile);
   my $bestmatch = Align($traceResult->{base1Ref}, $expResult->{EXPRef}, $expResult->{startdel});

   if ($bestmatch == 0) {
      print "unable to locate the deletion site in the trace file\n";
      die;
   }

   my $align = $bestmatch - $expResult->{startdel};
   print "hence, to align the expected and the trace sequences a $align\-base difference is accounted for\.\n";

   #analyze SNPs
   for (my $SNP=1; $SNP <= $expResult->{SNPtotal}; $SNP++) {
      if ($expResult->{SNPBaseRef}->[$SNP] > $expResult->{startdel}) {
         print "--------------------------------------------------------\n";
         print "Analysis for SNP$SNP, alleles $expResult->{AllelesARef}->[$SNP] and $expResult->{AllelesBRef}->[$SNP]\n";
         print "--------------------------------------------------------\n";
         GenotypeSNP("Del", $expResult->{EXPRef}, $traceResult->{base1Ref}, $traceResult->{base2Ref}, $expResult->{SNPBaseRef}, $align, $SNP, $expResult->{AllelesARef}->[$SNP], $expResult->{AllelesBRef}->[$SNP], $expResult->{shift_size});
         GenotypeSNP("Ins", $expResult->{EXPRef}, $traceResult->{base1Ref}, $traceResult->{base2Ref}, $expResult->{SNPBaseRef}, $align, $SNP, $expResult->{AllelesARef}->[$SNP], $expResult->{AllelesBRef}->[$SNP], $expResult->{shift_size});
      }
   }
} #main end

#--------------------------------------------------------------------
sub Read_Exp
#--------------------------------------------------------------------
{
   my ($plusexpfile) =@_;
   my $expbase=0;
   my $SNP=0;
   my $SNPflag=0;
   my $startdel;
   my $shift_size=0;
   my @EXP;
   my @SNPBase;
   my @AllelesA;
   my @AllelesB;
   $EXP[0]=0;
   open (EXP, $plusexpfile) or die $!;
   while (<EXP>) {
      chomp;
      foreach my $char (split / */) {
         $expbase++;
         $EXP[$expbase] = uc($char);
         #print "base $expbase is $EXP[$expbase]";
         if (ord($EXP[$expbase])>125) {
            $expbase--;
         }
         if (ord($EXP[$expbase])== 47) {
            $expbase++;
         }
         if (ord($EXP[$expbase])<65) {
            $expbase--;
         }
         if ((ord($EXP[2])== 66) or (ord($EXP[1])== 46)) {
            print "Expected sequence is badly formatted, ABI or SCF trace file?\n";
            die;
         }
         if ($SNPflag == 4) {
            $expbase--;
            $SNPflag=0;
         }
         if ($SNPflag == 3) {
            $AllelesB[$SNP] = $EXP[$expbase];
            $SNPflag=4;
            $expbase--;
            print "SNP$SNP is at base $SNPBase[$SNP], alleles $AllelesA[$SNP] and $AllelesB[$SNP]\.\n";
         }
         if ($SNPflag == 2) {
            $expbase--;
            $SNPflag=3;
         }
         if ($SNPflag == 1) {
            $AllelesA[$SNP] = $EXP[$expbase];
            $SNPBase[$SNP] = $expbase;
            $SNPflag=2;
         }
 
         if (ord($char) == 123) {
            $expbase--;
            $startdel=$expbase;
         }
         if (ord($char) == 125) {
            $expbase--;
            $shift_size=$expbase-$startdel;
            print "In the expected sequence the deletion is after base $startdel, indel size $shift_size bases\.\n";
         }
         if (ord($char) == 91) {
            $SNP++;
            $SNPflag=1;
            $expbase--;
         }
      }
   }
   close EXP;
   if ($SNP*$shift_size < 1) {
      print "Expected sequence is badly formatted\n";
      die;
   }
   my %result = (
      SNPtotal => $SNP,
      shift_size => $shift_size,
      startdel => $startdel,
      EXPRef => \@EXP,
      SNPBaseRef => \@SNPBase,
      AllelesARef => \@AllelesA,
      AllelesBRef => \@AllelesB,
   );

   return \%result;
} #Read_Exp ends

#--------------------------------------------------------------------
sub Read_Trace
#--------------------------------------------------------------------
{
   my ($tracefile) =@_;
   my $temp_poly_file  = "$tracefile".'.poly';
   my @base1;
   my @base2;
   my @q_base1;
   my @q_base2;
   $ENV{PHRED_PARAMETER_FILE} = $PHRED_DAT;
   system("$PHRED_EXE -nosplit -dd $temp_path $tracefile");
   open (PH, $temp_poly_file) or die $!;
   my @PH=<PH>;
   shift @PH;
   my $i=0;
   for (@PH) {
      $i++;
      my @chars= split /  /;
      $base1[$i] = $chars[0];
      $q_base1[$i] = $chars[3];
      $base2[$i] = $chars[4];
      $q_base2[$i] = $chars[7];
      if ($base2[$i] eq "N") {
         $base2[$i] = $base1[$i];
         $q_base2[$i] = $q_base1[$i];
      }
   }
   close PH;
   my %result = (
      base1Ref => \@base1,
      base2Ref => \@base2,
   );
   return \%result;
} #   Read_Trace ends

#--------------------------------------------------------------------
sub Align
#--------------------------------------------------------------------
{
   my ($base1Ref,$EXPRef,$startdel) = @_;
   my $base_sign1=$EXPRef->[$startdel-4].$EXPRef->[$startdel-3].$EXPRef->[$startdel-2].$EXPRef->[$startdel-1].$EXPRef->[$startdel];
   my $base_sign2=$EXPRef->[$startdel-9].$EXPRef->[$startdel-8].$EXPRef->[$startdel-7].$EXPRef->[$startdel-6].$EXPRef->[$startdel-5];
   my $base_sign3=$EXPRef->[$startdel-14].$EXPRef->[$startdel-13].$EXPRef->[$startdel-12].$EXPRef->[$startdel-11].$EXPRef->[$startdel-10];
   my $base_sign4=$EXPRef->[$startdel-19].$EXPRef->[$startdel-18].$EXPRef->[$startdel-17].$EXPRef->[$startdel-16].$EXPRef->[$startdel-15];
   my @matches1;
   my @matches2;
   my @matches3;
   my @matches4;
   $matches1[1]=0;
   $matches2[1]=0;
   $matches3[1]=0;
   $matches4[1]=0;
   my $match1=0;
   my $match2=0;
   my $match3=0;
   my $match4=0;
   my $matched_base=20;
   while (<@$base1Ref>) {
      $matched_base++;
      my $trace_sign = $base1Ref->[$matched_base-4].$base1Ref->[$matched_base-3].$base1Ref->[$matched_base-2].$base1Ref->[$matched_base-1].$base1Ref->[$matched_base];
      if ($base_sign1 eq $trace_sign) {
         $match1++;
         $matches1[$match1]=$matched_base;
      }
      if ($base_sign2 eq $trace_sign) {
         $match2++;
         $matches2[$match2]=$matched_base+5;
      }
      if ($base_sign3 eq $trace_sign) {
         $match3++;
         $matches3[$match3]=$matched_base+10;
      }
      if ($base_sign4 eq $trace_sign) {
         $match4++;
         $matches4[$match4]=$matched_base+15;
      }
   }

   #calculate score
   if ($matches1[1]+$matches2[1]+$matches3[1]+$matches4[1] == 0) {
      print "unable to locate the deletion site in the trace file\n";
      die;
   }
   my $score=0;
   my $bestmatch=0;
   my $bestscore=0;
   while (<@matches1>) {
      my $currentmatch=$_;
      $score=40;
      while (<@matches2>) {
         $score=$score+30/(abs($_-$currentmatch)+0.1);
      }
      while (<@matches3>) {
         $score=$score+20/(abs($_-$currentmatch)+0.1);
      }
      while (<@matches4>) {
         $score=$score+10/(abs($_-$currentmatch)+0.1);
      }
      if ($score > $bestscore) {
         $bestscore=$score;
         $bestmatch = $currentmatch;
      }
   }

   while (<@matches2>) {
      my $currentmatch=$_;
      $score=30;
      while (<@matches3>) {
         $score=$score+20/(abs($_-$currentmatch)+0.1);
      }
      while (<@matches4>) {
         $score=$score+10/(abs($_-$currentmatch)+0.1);
      }
      if ($score > $bestscore) {
         $bestmatch = $currentmatch;
      }
   }

   while (<@matches3>) {
      my $currentmatch=$_;
      $score=20;
      while (<@matches4>) {
         $score=$score+10/(abs($_-$currentmatch)+0.1);
      }
      if ($score > $bestscore) {
         $bestmatch = $currentmatch;
      }
   }

   while (<@matches4>) {
      my $currentmatch=$_;
      $score=10;
      if ($score > $bestscore) {
         $bestmatch = $currentmatch;
      }
   }
   print "In the trace file the deletion site detected at position $bestmatch, score ";
   printf("%.1f",$bestscore);
   print " (lower than 600 may indicate low trace file quality)\n";
 
   return $bestmatch;
}

#--------------------------------------------------------------------
sub GenotypeSNP
#--------------------------------------------------------------------
{
   my ($type, $EXPRef, $base1Ref, $base2Ref, $SNPBaseRef, $align, $SNP, $AllelesA, $AllelesB, $shift_size) = @_;
   #$type == "Ins" or "Del"
   print "Finding the SNP$SNP allele that is located on the $type chromosome\n";
   my $inverseType = $type eq "Ins" ? "Del" : "Ins";
   my $genotypeSNP = 'N';
   my $traceSNPbase = $SNPBaseRef->[$SNP] + $align;
   if ($type eq "Del") {
      $shift_size = -$shift_size;
      $traceSNPbase += $shift_size;
   }
   my $delbase = $EXPRef->[$SNPBaseRef->[$SNP]+$shift_size];

   print "The expected pair of bases before the SNP site is $EXPRef->[$SNPBaseRef->[$SNP]+$shift_size-1] $EXPRef->[$SNPBaseRef->[$SNP]-1]\n";
   print "In the trace, the pairs of bases are:\n";
   for (my $i = -2; $i <= 2; $i++) {
      my $base = $traceSNPbase + $i;
      print "$base base1 $base1Ref->[$base] base2 $base2Ref->[$base]";
      if ($i == 0) {
         print " are in the expected SNP site";
      }
      print "\n";
   }

   #check validity
   my $base_beforeEXP = ord($EXPRef->[$SNPBaseRef->[$SNP]+$shift_size-1]) + ord($EXPRef->[$SNPBaseRef->[$SNP]-1]);
   my $base_beforeTrace = ord($base1Ref->[$traceSNPbase-1])+ord($base2Ref->[$traceSNPbase-1]);
   my $validflag = 0;
   if ($base_beforeEXP != $base_beforeTrace) {
      $validflag = 10;
      my $bibase_beforeTrace = ord($base1Ref->[$traceSNPbase-2]) + ord($base2Ref->[$traceSNPbase-2]);
      my $base_afterTrace = ord($base1Ref->[$traceSNPbase]) + ord($base2Ref->[$traceSNPbase]);
      if ($bibase_beforeTrace != $base_afterTrace) {
         if ($base_beforeEXP == $bibase_beforeTrace) {
            $validflag = -1;
         }
         if ($base_beforeEXP == $base_afterTrace) {
            $validflag = 1;
         }
      }
   }
   if ($validflag == 10) {
      print "Haplotype 1 is: can not align sequences\n";
   } else {
      $traceSNPbase += $validflag;
      print "The pair of bases that should precede the site of SNP on the $type chromosome was located at $validflag bases from the expected position \n";
      print "hence, the site of SNP for the $type chromosome is at position $traceSNPbase base1 $base1Ref->[$traceSNPbase] base2 $base2Ref->[$traceSNPbase] of these the expected base for the $inverseType chromosome is $delbase\n";
      if ($base1Ref->[$traceSNPbase] eq $delbase) {
         $genotypeSNP = $base2Ref->[$traceSNPbase];
      } else {
         $genotypeSNP = $base1Ref->[$traceSNPbase];
      }
      if ($genotypeSNP eq $AllelesA or $genotypeSNP eq $AllelesB) {
         if ($type eq "Ins") {
            print "Haplotype 2 is: Ins, $genotypeSNP\n";
         } else {
            print "Haplotype 1 is: Del, $genotypeSNP\n";
            $EXPRef->[$SNPBaseRef->[$SNP]]=$genotypeSNP;
         }
      } else {
         if ($type eq "Ins") {
            print "Haplotype 2 is: can not align sequences\n";
         } else {
            print "Haplotype 1 is: can not align sequences\n";
         }
      }
   }
   print "______________________________________\n";
}