#!/usr/bin/perl # # Ver 1.1 by # Yanir Seroussi and # Eyal Seroussi <seroussi@agri.huji.ac.il> # use strict; use warnings; use Bio::Trace::ABIF; use GD::Graph::lines; use List::Util qw(min max); use File::Copy; use constant { ARGV_EXPECTED_LENGTH => 4, FIRST_TRACE_INDEX => 0, B_TRACE_INDEX => 0, G_TRACE_INDEX => 1, Y_TRACE_INDEX => 2, R_TRACE_INDEX => 3, O_TRACE_INDEX => 4, MODIFIED_FILE_SUFFIX => '_M.fsa', INVALID_SCORE => -1 }; main(); sub main { if (ARGV_EXPECTED_LENGTH > scalar @ARGV) { printUsageMessage(); return 0; } my ($refFile, $testFile, $system, $optimalSignal, $createHtml) = @ARGV; if (!defined($createHtml)) { $createHtml = 0; } print "processing $refFile, $testFile files, system $system, optimal signal $optimalSignal\n"; # Extract and normalize the data from the chromatogram files. Get it as a hash reference. my $refData = readTraceFromFile($refFile); my $testData = readTraceFromFile($testFile); my $usedFifthDye = updateFifthDyeField($refData, $testData); my $refStart = normalizeTrace($refData, $system, $optimalSignal); normalizeTrace($testData, $system, $optimalSignal, $refStart); # Now align test trace to reference trace. Reference trace does not change. if ('25PLEX' eq $system) { align($refData, $testData, 0, $testData->{seqLastIndex}); } else { getBestAlignment($refData, $testData); } # Calculate differences (and $difscale) between traces for first time to estimate scale my ($alignmentLength, $diffs, $difscale) = differences($refData, $testData, $optimalSignal, 1); # Calculate differences between traces. ($alignmentLength, $diffs, $difscale) = differences($refData, $testData, $optimalSignal, $difscale); # Generate output images. my $refImage = $refFile . "ref_image.png"; my $testImage = $refFile . "test_image.png"; my $diffImage = $refFile . "diff_image.png"; generateImage($refData, 0, 2000, 200, $refImage, $alignmentLength, $system); generateImage($testData, 0, 2000, 200, $testImage, $alignmentLength, $system); generateImage($diffs, -500, 500, 300, $diffImage, $alignmentLength, $system); # Create HTML and modified file. if ($createHtml) { generateHTMLFile($refImage, $testImage, $diffImage); } if ($usedFifthDye) { generateModifiedFile($refData, $testData, $testFile, $alignmentLength); } return 0; } sub printUsageMessage { print "SNPmplexViewer\n", "=============\n", "Usage: $0 <Reference_trace> <Test_trace> <System> <Normalize> <GenerateHTML>\n\n", "Aligns two trace files created by ABI.\n", "Transfers the fifth-dye channel of the reference to a newly formed normalized and aligned Test_trace_M.fsa file.\n", "Output includes align.html with 3 figures in PNG format\n", "of chromatograms of the input files and a visualization of the differences between them.\n\n", "<System> may be 25PLEX or OTHER.\n", "<Normalize>*100 is the signal strength that the program will normalize to (e.g. 1=100, 2=200)\n"; "<GenerateHTML> should be 0 for no HTML file generation, 1 otherwise\n"; } sub generateHTMLFile { my ($refImage, $testImage, $diffImage) = @_; open(ALIGNHTML, ">align.html" ) or die $!; print ALIGNHTML ( "<html>\n", " <body>\n", " <img src=$refImage>\n", " <img src=$diffImage>\n", " <img src=$testImage>\n", " </body>\n", "</html>\n"); close ALIGNHTML; } sub updateFifthDyeField { my ($refData, $testData) = @_; my $fifthDye = $refData->{fifthDye} && $testData->{fifthDye}; for my $traceData ($refData, $testData) { $traceData->{fifthDye} = $fifthDye; $traceData->{lastChannelIndex} = ($fifthDye ? O_TRACE_INDEX : R_TRACE_INDEX); } return $fifthDye; } sub readTraceFromFile { my ($file) = @_; # Create a new ABIF object pointing to that file. my $abif = Bio::Trace::ABIF->new( -file => $file ) or die('abi_format'); $abif->open_abif($file); # Get the sample name and sequencer model from the file. my $model = $abif->model_number() or die('trace_error model_number'); print "This file originated from $model sequencer.\n"; # Now get the traces. my @traceMatrix; for my $i (FIRST_TRACE_INDEX .. R_TRACE_INDEX) { $traceMatrix[$i] = [ $abif->get_data_item('DATA', $i + 1, 'n*') ]; if (!scalar @{$traceMatrix[$i]}) { die('trace_error ' . ($i + 1)); } } # Get the fifth trace. $traceMatrix[O_TRACE_INDEX] = [ $abif->get_data_item('DATA', 105, 'n*') ]; $abif->close_abif($file); return { traceMatrix => \@traceMatrix, fifthDye => scalar @{$traceMatrix[O_TRACE_INDEX]}, model => $model, seqLastIndex => $#{$traceMatrix[FIRST_TRACE_INDEX]} }; } sub removeControlNumbers { my ($traceData) = @_; my %modelRemovalLevels = ( 3100 => 5000, 3730 => 20000 ); my $removalLevel = $modelRemovalLevels{$traceData->{model}}; if (!defined $removalLevel) { return; } for my $i (FIRST_TRACE_INDEX .. $traceData->{lastChannelIndex}) { for my $j (0 .. $traceData->{seqLastIndex}) { if ($traceData->{traceMatrix}->[$i][$j] > $removalLevel) { $traceData->{traceMatrix}->[$i][$j] = 0; } } } } sub getModelMatrix { # If the model is 310 use a special matrix, otherwise the sequencer has already # taken into account the overlap in the dye spectrum. my ($model) = @_; return (310 == $model or 377 == $model ) ? [[ 1.05, -0.124, 0, 0, -0.002 ], [ -0.65, 1.090, -0.07, 0, 0 ], [ -0.3, -0.619, 1.074, -0.049, 0 ], [ -0.1, -0.3, -0.7, 1.05, -0.053 ], [ -0.01, -0.05, -0.25, -0.275, 1.011 ]] : [[ 1, 0, 0, 0, 0 ], [ 0, 1, 0, 0, 0 ], [ 0, 0, 1, 0, 0 ], [ 0, 0, 0, 1, 0 ], [ 0, 0, 0, 0, 1 ]]; } sub calcNoise { my ($traceData) = @_; my $modelMatrix = getModelMatrix($traceData->{model}); # Estimate noise to be the avarage signal of all points (Avarage Signal -AS). # Correct the noise estimate to be the mean signal of all points that are below # the avarage signal of all points (Avarage Noise -AN). # The avarage data signal (ADS) is calculated according to the difference # between AS and AN. my @noise = (0, 0, 0, 0, 0); my @corNoise = (0, 0, 0, 0, 0); my @noisePoints = (0, 0, 0, 0, 0); my @signal = (1, 1, 1, 1, 1); for my $i (FIRST_TRACE_INDEX .. $traceData->{lastChannelIndex}) { for my $j (0 .. $traceData->{seqLastIndex}) { my $value = 0; for my $k (FIRST_TRACE_INDEX .. O_TRACE_INDEX) { $value += $traceData->{traceMatrix}->[$k][$j] * $modelMatrix->[$i][$k]; } $traceData->{traceMatrix}->[$i][$j] = $value; $noise[$i] += $value; } $noise[$i] = $noise[$i] / $traceData->{seqLastIndex} * 1.05; for my $j (0 .. $traceData->{seqLastIndex}) { if ($traceData->{traceMatrix}->[$i][$j] < $noise[$i]) { $corNoise[$i] += $traceData->{traceMatrix}->[$i][$j]; $noisePoints[$i]++; } } if ($noisePoints[$i] > 0) { $corNoise[$i] /= $noisePoints[$i]; } if ($traceData->{seqLastIndex} == $noisePoints[$i]) { $signal[$i] = 0; } if ($signal[$i] == 1) { $signal[$i] = ($noise[$i] * $traceData->{seqLastIndex} - $corNoise[$i] * $noisePoints[$i]) / ($traceData->{seqLastIndex} - $noisePoints[$i]); } } return \@corNoise, \@signal; } sub cutTrace { my ($traceData, $system, $peaks) = @_; my $start; if ($system eq '25PLEX') { # Evaluating start of data according to 25plex SNP25 $start = max(0, ($peaks->[Y_TRACE_INDEX] < $peaks->[B_TRACE_INDEX] && $peaks->[Y_TRACE_INDEX] < $peaks->[G_TRACE_INDEX] && $peaks->[Y_TRACE_INDEX] < $peaks->[R_TRACE_INDEX]) ? $peaks->[Y_TRACE_INDEX] - 190 : $peaks->[R_TRACE_INDEX] - 230); print "b peak at $peaks->[B_TRACE_INDEX], g peak at $peaks->[G_TRACE_INDEX], ", "y peak at $peaks->[Y_TRACE_INDEX], r peak at $peaks->[R_TRACE_INDEX], start $start\n"; # Remove blank sequence from the beginning of the trace for my $i (0 .. $traceData->{lastChannelIndex}) { splice(@{$traceData->{traceMatrix}->[$i]}, 0, $start + 1); } } else { # OTHER - start of data at the first b peak $start = $peaks->[B_TRACE_INDEX]; #OTHER- remove blank sequence from the begining of trace for my $i (0 .. $traceData->{lastChannelIndex}) { splice(@{$traceData->{traceMatrix}->[$i]}, 0, $start + 1); } print "b peak at $peaks->[B_TRACE_INDEX], g peak at $peaks->[G_TRACE_INDEX], ", "y peak at $peaks->[Y_TRACE_INDEX], r peak at $peaks->[R_TRACE_INDEX], start $start\n"; # remove blank sequence from end of trace. while ($traceData->{traceMatrix}->[B_TRACE_INDEX][-1] < 50 && $traceData->{traceMatrix}->[G_TRACE_INDEX][-1] < 50 && $traceData->{traceMatrix}->[Y_TRACE_INDEX][-1] < 50 && $traceData->{traceMatrix}->[R_TRACE_INDEX][-1] < 50) { for my $i (FIRST_TRACE_INDEX .. $traceData->{lastChannelIndex}) { pop(@{$traceData->{traceMatrix}->[$i]}); } } } $traceData->{seqLastIndex} = $#{$traceData->{traceMatrix}->[FIRST_TRACE_INDEX]}; return $start; } sub normalizePeaks { my ($traceData, $corNoise, $signal, $optimalSignal) = @_; for my $i (FIRST_TRACE_INDEX .. $traceData->{lastChannelIndex}) { for my $j (0 .. $traceData->{seqLastIndex}) { # If there is a peak normalize and record, otherwise the value is set to 0. if (($traceData->{traceMatrix}->[$i][$j] > $corNoise->[$i]) && ($signal->[$i] > 0)) { $traceData->{traceMatrix}->[$i][$j] = max(100 * $optimalSignal * ( $traceData->{traceMatrix}->[$i][$j] - $corNoise->[$i] ) / $signal->[$i], 0); } else { $traceData->{traceMatrix}->[$i][$j] = 0; } } } } sub detectPeaks { my ($traceData, $optimalSignal, $start) = @_; # Normalize to 100 and record first data peaks. my @peaks = (0, 0, 0, 0, 0); my @base = (0, 0, 0, 0, 0); # Detecting the first peak for each dye. my $dtctpk_start = 0; # if it is run of the target, direct the search for the start site to the region of start site in the reference $start-100 if (defined($start)) { $dtctpk_start = $start - 200; } my @optimalMultipliers = (40, 40, 20, 20); for my $i (FIRST_TRACE_INDEX .. R_TRACE_INDEX) { for my $j ($dtctpk_start .. $traceData->{seqLastIndex}) { if ($peaks[$i] == 0 && $j > 10 && $traceData->{traceMatrix}->[$i][$j] > $optimalMultipliers[$i] * $optimalSignal) { $base[$i] = 3 / $optimalSignal * $traceData->{traceMatrix}->[$i][$j - 9]; my $isLargerThanOthers = 1; for my $k (FIRST_TRACE_INDEX .. $i - 1, $i + 1 .. R_TRACE_INDEX) { if ($traceData->{traceMatrix}->[$i][$j] <= $traceData->{traceMatrix}->[$k][$j]) { $isLargerThanOthers = 0; } } if ($isLargerThanOthers && $traceData->{traceMatrix}->[$i][$j] > $base[$i]) { $peaks[$i] = $j; } } } } return \@peaks; } sub zeroizeFifthDye { my ($traceData) = @_; if (!$traceData->{fifthDye}) { for my $j (0 .. $traceData->{seqLastIndex}) { $traceData->{traceMatrix}->[O_TRACE_INDEX][$j]= 0; } } } sub normalizeTrace { my ($traceData, $system, $optimalSignal, $start) = @_; zeroizeFifthDye($traceData); removeControlNumbers($traceData); my ($corNoise, $signal) = calcNoise($traceData); normalizePeaks($traceData, $corNoise, $signal, $optimalSignal); my $peaks = detectPeaks($traceData, $optimalSignal, $start); return cutTrace($traceData, $system, $peaks); } sub getBestAlignment { # This does an alignment of the first 1000 datapoints using a range of offsets # from -200 to 200. The best alignment is picked on the basis of having the # lowest score from datapoint 200 to 1000, and is used to allow for any # variation in start position of the two sequences. my ($ref, $test) = @_; my ($lowestScore, $lowestScoreOffset) = (INVALID_SCORE, 0); for (my $offset = -200; $offset <= 200; $offset += 20) { # Create a temporary copy of the test hash, since align may modify it. # A deep-copy is necessary, since the original hash contains references # that we don't want to modify. my (%tempTest); for my $i (FIRST_TRACE_INDEX..O_TRACE_INDEX) { my @tempTestTrace = @{$test->{traceMatrix}[$i]}; $tempTest{traceMatrix}[$i] = \@tempTestTrace; } # Do a partial alignment. align($ref, \%tempTest, $offset, 1000); # Work out the score for that alignment. my $score = getScore(200, 1000, 0, $ref, \%tempTest); if ((INVALID_SCORE != $score) && ((INVALID_SCORE == $lowestScore) || ($score < $lowestScore))) { ($lowestScore, $lowestScoreOffset) = ($score, $offset); } } # Do the best alignment. align($ref, $test, $lowestScoreOffset, $test->{seqLastIndex} + $lowestScoreOffset); } sub align { # This takes the normalized traces and returns a best alignment of the two. # Rows are added to or deleted from the test trace to keep the alignment. # Best alignment is calculated by minimising the difference between the test # and reference sequence over the next 15 datapoints. It is adjusted every five # bases. Inserted lines are just a duplicate of the previous line. my ($ref, $test, $minIndex, $traceLength) = @_; # Add/delete the appropriate number of lines to the test sequence to correct # for the offset value if ($minIndex < 0) { # Need to add lines to test sequence, since it's behind. for my $index ($minIndex .. 0) { for my $i (FIRST_TRACE_INDEX .. O_TRACE_INDEX) { unshift(@{$test->{traceMatrix}[$i]}, $test->{traceMatrix}[$i][0]); } } } elsif ($minIndex > 0) { # Need to delete lines from test sequence, since it's ahead. for my $i (FIRST_TRACE_INDEX .. O_TRACE_INDEX) { splice(@{$test->{traceMatrix}[$i]}, 0, $minIndex); } } # Now check alignments (for each third entry). for (my $index = 0; $index < $traceLength; $index += 3) { my $startPos = $index; my $endPos = $index + 15; # Compare the scores in the current alignment with those one data point in # either direction. my $score = getScore($startPos, $endPos, 0, $ref, $test); my $preScore = getScore($startPos, $endPos, -1, $ref, $test); my $postScore = getScore($startPos, $endPos, 1, $ref, $test); if (INVALID_SCORE == $score || INVALID_SCORE == $postScore || INVALID_SCORE == $preScore) { last; } # Now insert or delete lines as required. if ($postScore < $score && $postScore < $preScore) { # The reference sample is behind, need to delete a row from test. for my $i (FIRST_TRACE_INDEX .. O_TRACE_INDEX) { splice(@{$test->{traceMatrix}[$i]}, $index, 1); } } elsif ($preScore < $score && $preScore < $postScore) { # The reference sample is ahead, need to add a row to test. for my $i (FIRST_TRACE_INDEX .. O_TRACE_INDEX) { splice(@{$test->{traceMatrix}[$i]}, $index, 0, $test->{traceMatrix}[$i][$index]); } } } # Reset length of test sequence to correct value. $test->{seqLastIndex} = $#{$test->{traceMatrix}[FIRST_TRACE_INDEX]}; } sub getScore { # Subroutine used in alignment testing - it gets the total difference between # the two submitted sections of array and returns it. my ($start, $end, $offset, $ref, $test) = @_; if (!defined($ref->{traceMatrix}[FIRST_TRACE_INDEX][$end]) || !defined($test->{traceMatrix}[FIRST_TRACE_INDEX][$end + $offset])) { return INVALID_SCORE; } my $score = 0; for my $j ($start .. $end) { for my $i (FIRST_TRACE_INDEX .. O_TRACE_INDEX) { $score += abs($ref->{traceMatrix}[$i][$j] - $test->{traceMatrix}[$i][$j + $offset]); } } return $score; } sub cutDiff { # Stop saturation by cutting off to a max value. my ($diff, $useSign) = @_; my $sign = $useSign ? sign($diff) : 1; my $maxDiff = 500; if (abs($diff) > $maxDiff) { $diff = $sign * $maxDiff; } return $diff; } sub sign { my ($num) = @_; return $num < 0 ? -1 : 1; } sub absoluteDifferences { my ($ref, $test) = @_; my $minIndex = min($ref->{seqLastIndex}, $test->{seqLastIndex}); my %diffs; for my $i (FIRST_TRACE_INDEX..O_TRACE_INDEX) { for my $j (0 .. $minIndex) { $diffs{traceMatrix}[$i][$j] = cutDiff($ref->{traceMatrix}[$i][$j] - $test->{traceMatrix}[$i][$j], 0); } } return $minIndex, \%diffs; } sub realDifferences { my ($minIndex, $diffs, $ref, $test, $optimalSignal, $diffScale) = @_; for my $j (0 .. $minIndex) { for my $i (FIRST_TRACE_INDEX..O_TRACE_INDEX) { my $diff = $diffs->{traceMatrix}[$i][$j]; my $sign = sign($diff); # Sum all values in the other channels which have the opposite sign. my $otherChannels = 1; for my $k (FIRST_TRACE_INDEX..$i-1, $i+1..O_TRACE_INDEX) { if (sign($diffs->{traceMatrix}[$k][$j]) != $sign) { $otherChannels += $diffs->{traceMatrix}[$k][$j];} } $diff = ($sign * ($diff ** 2) * sqrt(abs($otherChannels)))/$diffScale/($optimalSignal ** 2); $diffScale = max($diffScale, $diff); $diffs->{traceMatrix}[$i][$j] = cutDiff($diff, 1); } } return $minIndex, $diffs, $diffScale; } sub differences { # Takes the two traces and calculates the difference between the two. Then # squares this difference to highlight the larger (relevent) changes. # Also returns the length of the shortest sequence, so both are aligned on # image generation. my ($ref, $test, $optimalSignal, $diffScale) = @_; my ($minIndex, $diffs) = absoluteDifferences($ref, $test); return realDifferences($minIndex, $diffs, $ref, $test, $optimalSignal, $diffScale); } sub generateImage { # Routine to convert the supplied hash reference into a png image of a graph. my ($trace, $min, $max, $size, $outputFile, $length, $system) = @_; my @labels; for my $i (0 .. $length) { push(@labels, ''); } my @plotData = (\@labels, @{$trace->{traceMatrix}}); #color schemes my $col1='blue'; my$col2='green'; my $col3='black'; my $col4='red'; my $col5='orange'; if ('OTHER' eq $system) {$col1='black'; $col3='red'; $col4='blue'; $col5='orange';} # Define graph format. my $mygraph = GD::Graph::lines->new($length, $size); $mygraph->set( x_label => '', x_ticks => 0, y_label => '', y_max_value => $max / 4, y_min_value => $min / 4, # Draw datasets in 'solid', 'dasSV.plhed' and 'dotted-dashed' lines. line_types => [ 1, 1, 1, 1, 1 ], # Set the thickness of line. line_width => 1, # Set colors for datasets. dclrs => [ $col1, $col2, $col3, $col4, $col5], ) or warn $mygraph->error; my $myimage = $mygraph->plot( \@plotData ) or die $mygraph->error; open(OUT, ">$outputFile") or die $!; binmode OUT; print OUT $myimage->png; close OUT; } sub writeTagOrDie { my ($abif, $tagName, $tagNum, $data) = @_; $abif->write_tag($tagName, $tagNum, $data) or die("write_tag($tagName, $tagNum) failed!"); } sub generateModifiedFile { my ($hashRef, $hashTest, $testFile, $minLength) = @_; my $newTrace = $testFile . MODIFIED_FILE_SUFFIX; copy($testFile, $newTrace) or die("File $testFile cannot be copied."); my $abif = Bio::Trace::ABIF->new(-file => $newTrace) or die('abi_format'); $abif->open_abif($newTrace, 1); # Modify traces. for my $index (0 .. $minLength) { my $corOref = $hashRef->{traceMatrix}[O_TRACE_INDEX][$index]; if ($corOref < 5) { $corOref = 0; } $corOref += $hashTest->{traceMatrix}[O_TRACE_INDEX][$index]; $hashRef->{traceMatrix}[O_TRACE_INDEX][$index] = $corOref * 4; } splice(@{$hashRef->{traceMatrix}[O_TRACE_INDEX]}, $minLength - 1); writeTagOrDie($abif, 'DATA', 105, $hashRef->{traceMatrix}[O_TRACE_INDEX]); if ($hashTest->{model} eq 310) { my $pure = [ 1000, 0, 0, 0, 0, 0, 1000, 0, 0, 0, 0, 0, 1000, 0, 0, 0, 0, 0, 1000, 0, 0, 0, 0, 0, 1000 ]; writeTagOrDie($abif, 'MTRX', 4, $pure); } for my $i (FIRST_TRACE_INDEX..R_TRACE_INDEX) { for my $j (0 .. $minLength) { $hashTest->{traceMatrix}[$i][$j] *= 3; } splice(@{$hashTest->{traceMatrix}[$i]}, $minLength - 1); writeTagOrDie($abif, 'DATA', $i + 1, $hashTest->{traceMatrix}[$i]); } $abif->close_abif($newTrace); }