Mercurial > repos > aaronpetkau > assemblystats
comparison fasta_summary.pl @ 0:8d1c7f2a3f5c draft default tip
Uploaded
| author | aaronpetkau |
|---|---|
| date | Sat, 04 Jul 2015 09:43:13 -0400 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| -1:000000000000 | 0:8d1c7f2a3f5c |
|---|---|
| 1 #!/usr/bin/perl | |
| 2 | |
| 3 #============================================================================================== | |
| 4 | |
| 5 # Script to output statistsics and histograms for reads and contigs/isotigs | |
| 6 | |
| 7 | |
| 8 # Outputs include: | |
| 9 # Mean, N50, StdDev or reads or contig lengths, | |
| 10 # Mean and Modal read or contig lengths. | |
| 11 # Number of reads or contigs > 1 kb in length | |
| 12 # Summed contig length (by number of contigs, in sorted order) | |
| 13 # Histogram of read or contig lengths, | |
| 14 # Graph of sums of read-lengths | |
| 15 # File of reads or contigs sorted by read or contig length | |
| 16 # Test for mono/di-nucelotide repeats | |
| 17 # Randomly selected reads or contigs | |
| 18 | |
| 19 | |
| 20 # Needs gnuplot installed to create the histograms: | |
| 21 # On Fedora/Redhat linux: sudo yum install gnuplot | |
| 22 # On Ubuntu/Debian: sudo apt-get install gnuplot | |
| 23 | |
| 24 # Uses a linux pipe to call gnu-plot directly, rather than as a separate shell script. | |
| 25 | |
| 26 # Original written by Sujai Kumar, 2008-09-05 University of Edinburgh | |
| 27 # Modified by Stephen: 29-Apr-2009: | |
| 28 # Last changed by Stephen: 9-Aug-2010 | |
| 29 | |
| 30 | |
| 31 # Usage: fasta_summary.pl -i infile.fasta -o process_reads -t read OR contig OR isotig (to use 'read' or 'contig' or 'isotig' in the output table & graphs. Isotig is for 'runAssembly -cdna ...' output file '454Isotigs.fna') [-r 1 to indicate count simple nucleotide repeats] [-n number of random reads to output] [-c cutoff_length] [-l 1 to indicate output the longest read] [-f (s or t or w) for spacer, tab or wiki format table output.] | |
| 32 | |
| 33 # Note: The parameters above in the [] are optional. | |
| 34 | |
| 35 # eg: fasta_summary.pl -i myfile.fasta -o process_reads -t read | |
| 36 # Where: | |
| 37 # -i reads or contigs as input, in fasta format. | |
| 38 # -o output_dir (created if it doesn't exist) | |
| 39 # -t read, contig or isotig | |
| 40 | |
| 41 # Gives back | |
| 42 # - N50 | |
| 43 # - num of contigs > 1 kb | |
| 44 # - num of contigs | |
| 45 # - Read or Contig Histogram and graphs. | |
| 46 # - Summed contig length (by number of contigs, in sorted order) | |
| 47 | |
| 48 #============================================================================================== | |
| 49 | |
| 50 | |
| 51 use strict; | |
| 52 use warnings; | |
| 53 use Getopt::Long; | |
| 54 | |
| 55 my $infile; | |
| 56 my $output_dir; | |
| 57 my $type='read'; # Defaults to 'read' at present | |
| 58 my $repeats=1; | |
| 59 my $num_random_reads_to_output=0; | |
| 60 my $cutoff_length=-1; # -1 means won't check this cutoff | |
| 61 my $longest_read=-1; # -1 mean's don't output the sequence for the longest read. | |
| 62 my $doCommify=1; # Outputs statistics numbers in format: 9,999,999 | |
| 63 my $format="t"; # "s"=spaces between columns, "t"=tabs between columns, "w"=wiki '||' and '|'. | |
| 64 my $bucket1=0; # For optional exact length histogram distribution as asked for by JH. | |
| 65 | |
| 66 if ($#ARGV==-1) {die " | |
| 67 Usage: | |
| 68 | |
| 69 fasta_summary.pl -i infile.fasta -o output_dir -t ( read | contig | isotig ) [ -r 0 ] [ -n num_reads ] [ -c cutoff_length ] [ -l 1 ] [ -d 0 ] [ -f (w | t ) ] [ -bucket1 ] | |
| 70 | |
| 71 where: | |
| 72 | |
| 73 -i or -infile infile.fasta : input fatsa file of raeds, contigs or isotigs, | |
| 74 | |
| 75 -o or -output_dir output_directory : directory to put output stats and graphs into. | |
| 76 | |
| 77 -t or -type (read or contig or isotig) : for displaying the graph title, where type is 'read' or 'contig' or 'isotig'. | |
| 78 | |
| 79 -r or -repeats 0 or 1 : 1=count number of reads that contain over 70% simple mono-nucleotide and di-nucleotide repeat bases; 0=don't count. | |
| 80 | |
| 81 -n or -number num_reads : For outputting specified number of randomly selected reads or contigs. | |
| 82 | |
| 83 -c or -cutoff cutoff_length : Give a number of reads to do extra analysis (calculating again the number of reads and number of bases in reads above this length) | |
| 84 | |
| 85 -l or -longest 0 or 1 : 1=Output the longest read; 0= don't output the longest read | |
| 86 | |
| 87 -d or -doCommify 0 or 1 : Output numbers formatted with commas to make easier to read: 0=no commas, default=1 | |
| 88 | |
| 89 -f or -format w or t : w=wiki_format (ie. table with || and | for column dividers), t=tabs between column symbols for the wiki pages, default is spaces between columns. | |
| 90 | |
| 91 -b or -bucket1 : To also output histogram file of exact read lengths (ie. bucket size of 1) | |
| 92 | |
| 93 | |
| 94 eg: For 454 reads: fasta_summary.pl -i RawReads.fna -o read_stats -t read | |
| 95 For 454 isotigs: fasta_summary.pl -i 454Isotigs.fna -o isotig_stats -t isotig | |
| 96 | |
| 97 ";} | |
| 98 | |
| 99 GetOptions ( | |
| 100 "infile=s" => \$infile, | |
| 101 "output_dir=s" => \$output_dir, | |
| 102 "type=s" => \$type, ## type is 'read' or 'contig' or 'isotig' - for displaying the graph title | |
| 103 "repeats=i" => \$repeats, # To count simple repeats | |
| 104 "number=i" => \$num_random_reads_to_output, # For outputting specified number of random reads | |
| 105 "cutoff=i" => \$cutoff_length, # Give a number of reads to do extra analysis (calculating again the number of reads and number of bases in reads above this length) | |
| 106 "longest=i" => \$longest_read, # Output the longest read. | |
| 107 "doCommify=i" => \$doCommify, # Output numbers formatted with commas to make easier to read: 0=no commas, default=1 | |
| 108 "format=s" => \$format, # "w"=wiki_format (ie. table with || and | for column dividers), "t"=tabs between column symbols for the wiki pages, default is spaces between columns. | |
| 109 "bucket1" => \$bucket1, # To also output histogram file of exact read lengths (ie. bucket size of 1) | |
| 110 ); | |
| 111 if ($#ARGV>-1) {die "Unused options specified: @ARGV\n";} | |
| 112 if ( (! defined $infile) || ($infile eq '') ) {die "\nPlease give input fasta file, preceeded by the '-i' option\n\n";} | |
| 113 if ( (! defined $output_dir) || ($output_dir eq '') ) {die "Please give output_directory, preceeded by the '-o' option\n\n";} | |
| 114 if ( (! defined $type) || (($type ne 'contig') && ($type ne 'read') && ($type ne 'isotig')) ) {die "ERROR: On commandline: -t type must be 'contig' or 'read' or 'isotig'\n\n";} | |
| 115 | |
| 116 | |
| 117 my ($L,$M,$R, $Lh,$Mh,$Rh, $Lhnewline,$Mhnotab); | |
| 118 if ($format eq 's') {($L,$M,$R, $Lh,$Mh,$Rh, $Lhnewline,$Mhnotab)=('',' ','', '',' ','', "\n",'');} | |
| 119 elsif ($format eq 't') {($L,$M,$R, $Lh,$Mh,$Rh, $Lhnewline,$Mhnotab)=("\t","\t",'', "","\t",'', "\n",'');} # There is correctly a tab for the $L, but not the $Lh. | |
| 120 elsif ($format eq 'w') {($L,$M,$R, $Lh,$Mh,$Rh, $Lhnewline,$Mhnotab)=('| ',' | ',' |', '|| ',' || ',' ||', '|| ',' || ');} | |
| 121 else {die "\nInvalid output format code: '$format'. Should be 's', 't' or 'w'.\n\n";} | |
| 122 | |
| 123 ### create output_dir if it doesn't exist | |
| 124 if (-d $output_dir) { | |
| 125 print STDERR " Directory '$output_dir' exists, so the existing fasta_summary.pl output files will be overwritten\n"; | |
| 126 } else { | |
| 127 mkdir $output_dir; | |
| 128 print STDERR " Directory '$output_dir' created\n"; | |
| 129 } | |
| 130 | |
| 131 my $gc_count = 0; | |
| 132 | |
| 133 #--------------- Read in contigs from fasta file ------------------- | |
| 134 | |
| 135 open INFILE, "<$infile" or die "Failed to open file: '$infile' : $!"; | |
| 136 open STATS, ">$output_dir/stats.txt" or die "Failed to open $output_dir/stats.txt: '' : $!"; | |
| 137 | |
| 138 my $header = <INFILE>; | |
| 139 if (! defined($header) ) {print "\n** ERROR: First line of input fasta file is undefined - so file must be empty **\n\n"; print STATS "No sequences found\n"; exit 1;} | |
| 140 if ($header!~/^>/) {print "\nERROR: First line of input fasta file should start with '>' BUT first line is: '$header'\n"; print STATS "No sequences found\n"; exit 1;} | |
| 141 | |
| 142 my $seq = ""; | |
| 143 my @sequences; | |
| 144 | |
| 145 while (<INFILE>) { | |
| 146 if (/^>/) { | |
| 147 push @sequences, [length($seq), $header, $seq]; | |
| 148 $header = $_; | |
| 149 $seq = ""; | |
| 150 } else { | |
| 151 chomp; | |
| 152 $gc_count += tr/gcGC/gcGC/; | |
| 153 $seq .= $_; | |
| 154 } | |
| 155 } | |
| 156 push @sequences, [length($seq), $header, $seq]; | |
| 157 close INFILE; | |
| 158 if ($#sequences==-1) {print "\nERROR: There are zero sequences in the input file: $infile\n\n"; print STATS "No sequences found\n"; exit 1;} | |
| 159 | |
| 160 | |
| 161 my $all_contigs_length=0; | |
| 162 foreach (@sequences) {$all_contigs_length += $_->[0];} | |
| 163 if ($all_contigs_length==0) {print "\nERROR: Length of all contigs is zero\n\n"; exit 2;} | |
| 164 | |
| 165 # Find number and number of bases in reads greater than the optional cut-off length given at command-line. | |
| 166 my $num_reads_above_cutoff=0; | |
| 167 my $num_of_bases_in_reads_above_cutoff=0; | |
| 168 if ($cutoff_length>0) | |
| 169 { | |
| 170 foreach (@sequences) | |
| 171 { | |
| 172 if ($_->[0]>=$cutoff_length) {$num_of_bases_in_reads_above_cutoff+= $_->[0]; $num_reads_above_cutoff++;} | |
| 173 } | |
| 174 } | |
| 175 | |
| 176 | |
| 177 #--------------- Gather Plots Data, Find N50, Print sorted contig file ------------------- | |
| 178 | |
| 179 my $summed_contig_length = 0; | |
| 180 my @summed_contig_data; # <-- For graph of summed length (in number of bases) versus number of contigs. | |
| 181 my @summed_contig_data_contigLens; # <-- Added by SJBridgett to get graph of summed contig length versus min. contig length included (ie. X-axis is sort of inverse of above) | |
| 182 | |
| 183 my $contig1k_count = 0; | |
| 184 my $contig1k_length = 0; | |
| 185 | |
| 186 open SORTED, ">$output_dir/sorted_contigs.fa" or die $!; | |
| 187 | |
| 188 # top row in stats file | |
| 189 #print STATS "N50\nMax contig size\nNumber of bases in contigs\nNumber of contigs\nNumber of contigs >=1kb\nNumber of contigs in N50\nNumber of bases in contigs >=1kb\nGC Content of contigs\n"; | |
| 190 | |
| 191 my $N50size=-1; | |
| 192 my $N50_contigs = 0; | |
| 193 | |
| 194 my @sorted_by_contig_length = sort {$b->[0] <=> $a->[0]} @sequences; | |
| 195 | |
| 196 ### variables and initialization for histogram (stored in @bins) | |
| 197 my $max = $sorted_by_contig_length[0][0]; | |
| 198 my $mean= $all_contigs_length/($#sequences+1); # <-- Added by Stephen Bridgett. Note: as $# gives the highest index number, so add 1 as arrays are zero-based. | |
| 199 | |
| 200 # Calculate standard deviation | |
| 201 my $sumsquares = 0; | |
| 202 foreach (@sequences) {$sumsquares += ($_->[0] - $mean) ** 2;} # <-- Taken from John's "mean_fasta_length.pl" script. | |
| 203 my $stddev = ( $sumsquares/($#sequences+1) ) ** 0.5; | |
| 204 | |
| 205 my $min = 0; | |
| 206 # Aim for approximately 100 bins, so | |
| 207 | |
| 208 my $bin_size=1; | |
| 209 my $min_max_range=$max - $min; | |
| 210 # $bin_size = ($min_max_range)/(99); # (99 is 100-1) so 1000/100 | |
| 211 if ($min_max_range>=100000000) {$bin_size=1000000;} | |
| 212 elsif ($min_max_range>=10000000) {$bin_size=100000;} | |
| 213 elsif ($min_max_range>=1000000) {$bin_size=10000;} | |
| 214 elsif ($min_max_range>=100000) {$bin_size=1000;} | |
| 215 elsif ($min_max_range>=10000) {$bin_size=100;} | |
| 216 else {$bin_size=10;} # elsif ($min_max_range>=1000) {$bin_size=10;} | |
| 217 #elsif ($min_max_range>=100) {$bin_size=1;} | |
| 218 #elsif ($min_max_range>=10) {} | |
| 219 #elsif ($min_max_range>=1) {} | |
| 220 # WAS: my $bin_size = ($type eq 'contig') ? 1000 : 10; | |
| 221 | |
| 222 my @bins; | |
| 223 $#bins = int(($min_max_range)/$bin_size) + 1; # <-- Set the bins array size. | |
| 224 foreach (@bins) {$_ = 0}; | |
| 225 | |
| 226 foreach (@sorted_by_contig_length) { | |
| 227 | |
| 228 my $curr_contig_length = $_->[0]; | |
| 229 push @summed_contig_data_contigLens, $curr_contig_length; # <-- added by Stephen. | |
| 230 | |
| 231 $bins[int(($curr_contig_length + 1 - $min)/$bin_size)]++; | |
| 232 | |
| 233 $summed_contig_length += $curr_contig_length; | |
| 234 push @summed_contig_data, $summed_contig_length; | |
| 235 | |
| 236 ### sorted contigs file | |
| 237 print SORTED $_->[1] . $_->[2] . "\n"; | |
| 238 | |
| 239 if ($curr_contig_length >= 1000) { | |
| 240 $contig1k_count++; | |
| 241 $contig1k_length += $curr_contig_length; | |
| 242 } | |
| 243 | |
| 244 $N50_contigs++ unless ($N50size>-1); # Was unless $N50_found | |
| 245 | |
| 246 if ($summed_contig_length > ($all_contigs_length / 2) and $N50size == -1) { | |
| 247 $N50size = $curr_contig_length; | |
| 248 } | |
| 249 } | |
| 250 | |
| 251 | |
| 252 if ($bucket1!=0) | |
| 253 { | |
| 254 =pod | |
| 255 # This firsdt method works and agress with the second, but the lengths are in reverse order, at the @sorted_by_contig_length array was sorted with longest contig first. | |
| 256 open BUCKET1, ">$output_dir/lengths_hist1.txt" or die "Failed to open file '$output_dir/lengths_hist1.txt' : $!\n"; | |
| 257 print BUCKET1 "Length\tFrequency\n"; | |
| 258 my $len=-1; | |
| 259 my $count=0; | |
| 260 foreach (@sorted_by_contig_length) | |
| 261 { | |
| 262 if ( $len != $_->[0] ) {if ($len>-1) {print BUCKET1 "$len\t$count\n";} $len=$_->[0]; $count=0;} | |
| 263 $count++; | |
| 264 } | |
| 265 if ($len>-1) {print BUCKET1 "$len\t$count\n";} # Print length of final length grouping. | |
| 266 close BUCKET1; | |
| 267 =cut | |
| 268 open BUCKET1, ">$output_dir/lengths_hist1_with_zeros.txt" or die "Failed to open file '$output_dir/lengths_hist1_with_zeros.txt' : $!\n"; | |
| 269 print BUCKET1 "Length\tFrequency\n"; | |
| 270 my @bucket=(); # To check the result by using array. | |
| 271 foreach (@sequences) | |
| 272 { | |
| 273 my $len=$_->[0]; | |
| 274 if (defined $bucket[$len]) {$bucket[$len]++;} | |
| 275 else {$bucket[$len]=1;} | |
| 276 } | |
| 277 for (my $i=0; $i<=$#bucket; $i++) | |
| 278 # for (my $i=$#bucket; $i>=0; $i--) # <-- for reverse order | |
| 279 { | |
| 280 if (defined $bucket[$i]) {print BUCKET1 "$i\t$bucket[$i]\n";} | |
| 281 else {print BUCKET1 "$i\t0\n";} # Can uncomment this later if don't want zeros in the output. | |
| 282 } | |
| 283 close BUCKET1; | |
| 284 } | |
| 285 | |
| 286 | |
| 287 my $type_plural=$type.'s'; | |
| 288 print STATS $Lh."Statistics for $type lengths:".$Mhnotab.$Rh."\n"; | |
| 289 print STATS $L."Min $type length:".$M.&commify_if($sorted_by_contig_length[$#sequences][0],$doCommify).$R."\n"; | |
| 290 print STATS $L."Max $type length:".$M.&commify_if($max,$doCommify).$R."\n"; | |
| 291 printf STATS $L."Mean %s length:".$M."%.2f".$R."\n", $type,$mean; # <-- Added by Stephen Bridgett, April 2009. | |
| 292 printf STATS $L."Standard deviation of %s length:".$M."%.2f".$R."\n", $type,$stddev; ## <-- Added by Stephen Bridgett, May 2009. | |
| 293 print STATS $L."Median $type length:".$M.&commify_if($sorted_by_contig_length[int($#sequences/2)][0],$doCommify).$R."\n"; | |
| 294 print STATS $L."N50 $type length:".$M.&commify_if($N50size,$doCommify).$R."\n"; | |
| 295 | |
| 296 print STATS $Lhnewline."Statistics for numbers of $type_plural:".$Mhnotab.$Rh."\n"; | |
| 297 print STATS $L."Number of $type_plural:".$M.&commify_if($#sequences+1,$doCommify).$R."\n"; | |
| 298 print STATS $L."Number of $type_plural >=1kb:".$M.&commify_if($contig1k_count,$doCommify).$R."\n"; | |
| 299 print STATS $L."Number of $type_plural in N50:".$M.&commify_if($N50_contigs,$doCommify).$R."\n"; | |
| 300 | |
| 301 print STATS $Lhnewline."Statistics for bases in the $type_plural:".$Mhnotab.$Rh."\n"; | |
| 302 print STATS $L."Number of bases in all $type_plural:".$M.&commify_if($all_contigs_length,$doCommify).$R."\n"; | |
| 303 print STATS $L."Number of bases in $type_plural >=1kb:".$M.&commify_if($contig1k_length,$doCommify).$R."\n"; | |
| 304 printf STATS $L."GC Content of %s:".$M."%.2f %%".$R."\n", $type_plural,(100*$gc_count/$all_contigs_length); | |
| 305 | |
| 306 if ($cutoff_length>0) | |
| 307 { | |
| 308 print STATS $Lhnewline."Statistics for $type_plural >= $cutoff_length bp in length:".$Mhnotab.$Rh."\n"; | |
| 309 print STATS $L."Number of $type_plural >= $cutoff_length bp:".$M.&commify($num_reads_above_cutoff,$doCommify).$R."\n"; | |
| 310 print STATS $L."\tNumber of bases in $type_plural >= $cutoff_length bp:".$M.&commify($num_of_bases_in_reads_above_cutoff,$doCommify).$R."\n"; | |
| 311 } | |
| 312 | |
| 313 if ($repeats==1) {&countRepeats();} | |
| 314 | |
| 315 print STATS "\n"; | |
| 316 | |
| 317 | |
| 318 # Output random selection of reads if requested on commandline: | |
| 319 my $fastaLineLen=60; # <-- The line length used for 454 sffinfo output, but could use a value read from input file (but be careful not to read a short line) | |
| 320 if ($num_random_reads_to_output>0) | |
| 321 { | |
| 322 my @randlist; | |
| 323 if ($num_random_reads_to_output<($#sequences+1)) | |
| 324 { | |
| 325 print STATS "\nSome randomly selected reads:\n\n"; | |
| 326 @randlist= &getListOfRandomNumbers($#sequences, $num_random_reads_to_output); # Don't use ($#sequences + 1), just ($#sequences) otherwise would be outside the array. | |
| 327 } | |
| 328 else # Just print all the sequences: | |
| 329 { | |
| 330 print STATS "\nAll ".($#sequences+1)." reads:\n\n"; | |
| 331 for (my $i=0;$i<=$#sequences;$i++) {push @randlist,$i;} | |
| 332 } | |
| 333 &print_sequences(\@randlist) | |
| 334 } | |
| 335 | |
| 336 | |
| 337 # Print the longest read: | |
| 338 if ($longest_read>0) | |
| 339 { | |
| 340 my $length_of_longest_read=-1; | |
| 341 my @longest_read=(); | |
| 342 my $i=0; | |
| 343 foreach (@sequences) | |
| 344 { | |
| 345 if ($_->[0]>$length_of_longest_read) {$length_of_longest_read=$_->[0]; $longest_read[0]=$i;} | |
| 346 $i++; | |
| 347 } | |
| 348 if ($length_of_longest_read>0) {print STATS "\nLongest read:\n"} | |
| 349 &print_sequences(\@longest_read); | |
| 350 } | |
| 351 | |
| 352 | |
| 353 =pod | |
| 354 print STATS "\n$type\tSummed\nlength\tlength\n"; # <-- Added by Stephen Bridgett, but better to produce a graph instead. | |
| 355 | |
| 356 my $i=0; | |
| 357 foreach (@summed_contig_data) { | |
| 358 # print STATS $sorted_by_contig_length[$i]->[0]."\t".$summed_contig_data_contigLens[$i]."\t".$_."\t".$summed_contig_data[$i]."\n"; | |
| 359 print STATS $sorted_by_contig_length[$i]->[0]."\t".$_."\n"; | |
| 360 $i++; | |
| 361 } | |
| 362 =cut | |
| 363 | |
| 364 open SUMMED, ">$output_dir/summed_contig_lengths.dat" or die $!; | |
| 365 print SUMMED join "\n",@summed_contig_data; | |
| 366 close SUMMED; | |
| 367 | |
| 368 open HISTOGRAMBINS, ">$output_dir/histogram_bins.dat" or die $!; | |
| 369 my $bin_size_counter = 0; | |
| 370 foreach (@bins) { | |
| 371 print HISTOGRAMBINS eval($bin_size_counter++ * $bin_size + $bin_size/2) . "\t$_\n"; | |
| 372 } | |
| 373 close HISTOGRAMBINS; | |
| 374 | |
| 375 | |
| 376 # Graph of cumulative (summed) number of reads on y-axis, versus length of read (decending order) on x-axis | |
| 377 open SUMREAD_READLEN, ">$output_dir/sum_reads_vs_read_len.dat" or die $!; | |
| 378 #my $read_counter= 0; | |
| 379 my $read_counter= $#sorted_by_contig_length+1; | |
| 380 | |
| 381 foreach (@sorted_by_contig_length) { | |
| 382 # $read_counter++; | |
| 383 $read_counter--; | |
| 384 print SUMREAD_READLEN "$_->[0]\n"; # $read_counter | |
| 385 } | |
| 386 close SUMREAD_READLEN; | |
| 387 | |
| 388 | |
| 389 | |
| 390 | |
| 391 my $properType=ucfirst($type); # Makes the first letter an upper case letter, ie. 'Config' or 'Read' | |
| 392 #if ($type eq 'contig') | |
| 393 # { | |
| 394 # print the outcome of the gnu_plot as may have a write permissions error sometimes. | |
| 395 my $YHistogramScaleType = ($type eq 'read') ? '' : 'log y'; # Not using log scale for reads, just for contig/isotigs. | |
| 396 &plot_graph('histogram', "$output_dir/histogram_bins.dat", "Histogram of $type lengths", "$properType length", "Number of $type_plural", '0.9', $YHistogramScaleType); | |
| 397 &plot_graph('line', "$output_dir/summed_contig_lengths.dat", "Summed $type lengths", "Number of $type_plural", "Summed $type length in bases", '0.9', ''); | |
| 398 &plot_graph('line', "$output_dir/sum_reads_vs_read_len.dat", "X-axis gives the Number of $type_plural that are greater than the $properType-length given on the Y-axis", "$properType length", "Cummulative number of $type_plural", '0.9', ''); | |
| 399 | |
| 400 =pod | |
| 401 # print `gnuplot_histogram.sh $output_dir/histogram_bins.dat`; | |
| 402 | |
| 403 &plot_graph("$output_dir/summed_contig_lengths.dat", 'Summed contig lengths', 'Number of contigs', 'Summed contig length in bases', '0.9', ''); | |
| 404 # print `gnuplot_summedcontigs.sh $output_dir/summed_contig_lengths.dat`; | |
| 405 | |
| 406 &plot_graph('line', "$output_dir/sum_reads_vs_read_len.dat", 'X-axis gives the Number of contigs that are greater than the Contig-length given on the Y-axis', 'Contig length', 'Cummulative number of contigs', '0.9', ''); | |
| 407 # print `gnuplot_sum_contig_vs_contig_len.sh $output_dir/sum_reads_vs_read_len.dat`; | |
| 408 } | |
| 409 elsif ($type eq 'read') | |
| 410 { | |
| 411 | |
| 412 # print `gnuplot_readshistogram_logY.sh $output_dir/histogram_bins.dat`; # There's also optionally a "...._linearY.sh" | |
| 413 | |
| 414 &plot_graph('line', "$output_dir/summed_contig_lengths.dat",'Summed read lengths', 'Number of reads', 'Summed read length in bases', '0.9', ''); | |
| 415 # print `gnuplot_summedreads.sh $output_dir/summed_contig_lengths.dat`; | |
| 416 | |
| 417 &plot_graph('line', "$output_dir/sum_reads_vs_read_len.dat", 'X-axis gives the Number of reads that are greater than the Read-length given on the Y-axis', 'Read length', 'Cummulative number of reads', '0.9', ''); | |
| 418 # print `gnuplot_sum_reads_vs_read_len.sh $output_dir/sum_reads_vs_read_len.dat`; | |
| 419 } | |
| 420 else {die "\n** ERROR: Invalid type='$type' **\n\n";} | |
| 421 =cut | |
| 422 | |
| 423 close SORTED; | |
| 424 close STATS; | |
| 425 | |
| 426 | |
| 427 # Use pipe to plot directly with gnuplot, rather than calling a separate shell script: | |
| 428 # http://www.vioan.ro/wp/2008/09/30/calling-gnuplot-from-perl/ | |
| 429 # http://forums.devshed.com/perl-programming-6/plotting-with-gnuplot-within-perl-script-549682.html | |
| 430 # Another option is the perl module: GnuplotIF: http://lug.fh-swf.de/perl/GnuplotIF.html OR: http://lug.fh-swf.de/perl/ | |
| 431 | |
| 432 # PlPlot: Perl: http://search.cpan.org/~dhunt/PDL-Graphics-PLplot-0.47/plplot.pd | |
| 433 # http://plplot.sourceforge.net/ | |
| 434 # dislin: http://www.mps.mpg.de/dislin/overview.html | |
| 435 # MathGL: http://mathgl.sourceforge.net/index.html | |
| 436 | |
| 437 | |
| 438 sub plot_graph | |
| 439 { | |
| 440 # Plots a histogram or xy-line graph | |
| 441 # Parameters: GraphType (histogram/line) DataFile, Title, X-label, Y-label, Y-range | |
| 442 # Graphfile should end with '.png' | |
| 443 # The $yloglinear is 'log y' for log, or '' for linear | |
| 444 my ($graphtype, $datafile, $title,$xlabel,$ylabel,$yrange,$yloglinear)=@_; # yrange for reads: 0.1, and for contigs: 0.9 | |
| 445 | |
| 446 my $graphstyle=''; | |
| 447 if ($graphtype eq 'histogram') {$graphstyle="plot \"$datafile\" using 1:2 with boxes";} | |
| 448 elsif ($graphtype eq 'line') {$graphstyle="plot \"$datafile\" using 1 with lines";} | |
| 449 else {die "\n** ERROR: Invalid graphtype='$graphtype'\n\n";} | |
| 450 my $yloglinearscale= ($yloglinear eq '') ? '' : "set $yloglinear"; | |
| 451 | |
| 452 # To capture any errors that are normally sent from gnuplot to stderr, could use: open3 pipe interface: | |
| 453 # http://www.clintoneast.com/articles/perl-open3-example.php | |
| 454 # http://hell.org.ua/Docs/oreilly/perl2/prog/ch16_03.htm | |
| 455 # But the following should be fine, as the stderr will display when running the script anyway. | |
| 456 # If needed a simpler way would be to sent the output to a file using eg: open (GNUPLOT, "|gnuplot > gnu_out.txt 2>&1") or die .... The read the resulting file. | |
| 457 open (GNUPLOT, "|gnuplot") or die "\n**ERROR: Failed to open gnuplot : $!\n\n **"; | |
| 458 print GNUPLOT <<ENDPLOT; | |
| 459 set terminal png | |
| 460 set output "$datafile.png" | |
| 461 set nokey | |
| 462 $yloglinearscale | |
| 463 set xlabel "$xlabel" | |
| 464 set ylabel "$ylabel" | |
| 465 set yrange [$yrange:] | |
| 466 set title "$title" | |
| 467 $graphstyle | |
| 468 ENDPLOT | |
| 469 close(GNUPLOT); | |
| 470 if ($? != 0) {print "\n** WARNING: GNUplot pipe returned non-zero status: '$?'\n\n";} # $? is the status returned by the last pipe close (or backtick or system operator) | |
| 471 if (! -e "$datafile.png") {die "\n** ERROR: Failed to create '$datafile.png'**\n\n";} | |
| 472 | |
| 473 =pod | |
| 474 #PNG | |
| 475 set term png small xFFFFFF | |
| 476 set output "$file.png" | |
| 477 set size 1 ,1 | |
| 478 set nokey | |
| 479 set data style line | |
| 480 set xlabel "frequency" font "VeraMono,10" | |
| 481 set title "Fast Fourier Transform" font "VeraMono,10" | |
| 482 set grid xtics ytics | |
| 483 set xtics 100 | |
| 484 plot "$file" using 1:2 w lines 1 | |
| 485 =cut | |
| 486 | |
| 487 #WAS PREVIOUSLY AS .sh script | |
| 488 =pod | |
| 489 # The 'gnuplot_readshistogram_logY.sh' is: | |
| 490 set terminal png | |
| 491 set output "$1.png" | |
| 492 set log y | |
| 493 set xlabel "Read length" | |
| 494 set ylabel "Frequency" | |
| 495 set yrange [0.9:] | |
| 496 set title "Histogram of read lengths" | |
| 497 plot "$1" using 1:2 with boxes | |
| 498 =cut | |
| 499 } | |
| 500 | |
| 501 | |
| 502 # Was previously a separate .sh file: | |
| 503 =pod | |
| 504 #!/bin/sh | |
| 505 gnuplot << EOF | |
| 506 set terminal png | |
| 507 set output "$1.png" | |
| 508 set xlabel "Number of contigs" | |
| 509 set ylabel "Summed contig length in bases" | |
| 510 set yrange [0.9:] | |
| 511 set title "Summed contig lengths" | |
| 512 plot "$1" using 1 with lines | |
| 513 EOF | |
| 514 =cut | |
| 515 | |
| 516 | |
| 517 | |
| 518 | |
| 519 # Added function to count number of simple dinucleotide repeats: | |
| 520 sub countRepeats | |
| 521 { | |
| 522 # To count the number of sequences that contain mostly repeats. | |
| 523 # This would be faster if called a C program on the file. | |
| 524 | |
| 525 # Common simple repeats are listed here: http://www.bioinfo.de/isb/2005/05/0041/ | |
| 526 # Dinucleotide | |
| 527 # AT/TA | |
| 528 # AC/TG | |
| 529 # AG/TC | |
| 530 # CG/GC | |
| 531 # Trinucleotide | |
| 532 # AAT/TTA | |
| 533 # CTA/GAT | |
| 534 # ATG/TAC | |
| 535 # ACT/TGA | |
| 536 # CTC/GAG | |
| 537 # AGG/TCC | |
| 538 # CAG/GTC | |
| 539 # AAG/TTC | |
| 540 # ATA/TAT | |
| 541 # CAA/GTT | |
| 542 # AGC/TCG | |
| 543 # ACA/TGT | |
| 544 # ACG/TGC | |
| 545 # AGA/TCT | |
| 546 # ACC/TGG | |
| 547 # Other | |
| 548 # Tetranucleotide | |
| 549 # AAAT | |
| 550 # AAAC | |
| 551 # CACG | |
| 552 # AACA | |
| 553 # AATA | |
| 554 # AAGA | |
| 555 # TGAA | |
| 556 # AAAG | |
| 557 # ACAT | |
| 558 # AATG | |
| 559 # AGCC | |
| 560 # Other | |
| 561 # Pentanucleotide | |
| 562 # AAAAC | |
| 563 # AATTG | |
| 564 # GCTAA | |
| 565 # ATAAT | |
| 566 # AAAAT | |
| 567 # AAACA | |
| 568 # ATATA | |
| 569 # TTGCC | |
| 570 # Other | |
| 571 | |
| 572 # I also add mono-nucleotide repeats: - ie. just all T's, or A's, etc | |
| 573 # Just consider the dinucleotide repeats for now: | |
| 574 my ($ATseq,$CGseq,$ACseq,$TGseq,$AGseq,$TCseq)=(0,0,0,0,0,0); | |
| 575 my ($AAseq,$TTseq,$CCseq,$GGseq)=(0,0,0,0); | |
| 576 foreach (@sequences) | |
| 577 { | |
| 578 my $seq_len=$_->[0]; | |
| 579 my $seq=$_->[2]; # This copy might be slow, maybe should just stick with using the reference. | |
| 580 my $mnt=0.35*$seq_len; # Mononucleotide threshold: HERE 0.35 also means 70%; 0.4 means 80% dinucleotide repeats, as really one base so 0.5 = 100% | |
| 581 my $dnt=0.35*$seq_len; # Dinucleotide threshold: 0.35 means 70%; 0.4 means 80% dinucleotide repeats, as two bases so 0.5 = 100% | |
| 582 my ($AT,$CG,$AC,$TG,$AG,$TC)=(0,0,0,0,0,0); | |
| 583 my ($AA,$TT,$CC,$GG)=(0,0,0,0); | |
| 584 # See: http://www.allinterview.com/showanswers/76719.html | |
| 585 | |
| 586 # AT/TA seems most common repeat so process it first to save time: | |
| 587 while ($seq=~/AT/g) {$AT++;} if ($AT>$dnt) {$ATseq++; next;} # AT is same as TA. If has 80% AT's then won't have 80% AC etc. | |
| 588 while ($seq=~/CG/g) {$CG++;} if ($CG>$dnt) {$CGseq++; next;} # CG is same as GC. | |
| 589 # AC,TG | |
| 590 while ($seq=~/AC/g) {$AC++;} if ($AC>$dnt) {$ACseq++; next;} | |
| 591 while ($seq=~/TG/g) {$TG++;} if ($TG>$dnt) {$TGseq++; next;} | |
| 592 # AG/TC | |
| 593 while ($seq=~/AG/g) {$AG++;} if ($AG>$dnt) {$AGseq++; next;} | |
| 594 while ($seq=~/TC/g) {$TC++;} if ($TC>$dnt) {$TCseq++; next;} | |
| 595 | |
| 596 # Added my simple mononucleotde repeat count: | |
| 597 while ($seq=~/AA/g) {$AA++;} if ($AA>$mnt) {$AAseq++; next;} | |
| 598 while ($seq=~/TT/g) {$TT++;} if ($TT>$mnt) {$TTseq++; next;} | |
| 599 while ($seq=~/CC/g) {$CC++;} if ($CC>$mnt) {$CCseq++; next;} | |
| 600 while ($seq=~/GG/g) {$GG++;} if ($GG>$mnt) {$GGseq++; next;} | |
| 601 } | |
| 602 | |
| 603 my $num_seq=($#sequences+1); | |
| 604 my $total_din_repeats_seq= $ACseq+$TGseq+$ATseq+$AGseq+$TCseq+$CGseq; | |
| 605 my $percent_din_repeats=100*$total_din_repeats_seq/$num_seq; | |
| 606 print STATS "\nSimple Dinucleotide repeats:\n"; | |
| 607 printf STATS "\tNumber of %s with over 70%% dinucleotode repeats:\t%.2f %% (%d %s)\n", $type_plural, $percent_din_repeats, $total_din_repeats_seq, $type_plural; | |
| 608 printf STATS "\tAT:\t%.2f %% (%d %s)\n", (100*$ATseq/$num_seq),$ATseq,$type_plural; | |
| 609 printf STATS "\tCG:\t%.2f %% (%d %s)\n", (100*$CGseq/$num_seq),$CGseq,$type_plural; | |
| 610 printf STATS "\tAC:\t%.2f %% (%d %s)\n", (100*$ACseq/$num_seq),$ACseq,$type_plural; | |
| 611 printf STATS "\tTG:\t%.2f %% (%d %s)\n", (100*$TGseq/$num_seq),$TGseq,$type_plural; | |
| 612 printf STATS "\tAG:\t%.2f %% (%d %s)\n", (100*$AGseq/$num_seq),$AGseq,$type_plural; | |
| 613 printf STATS "\tTC:\t%.2f %% (%d %s)\n", (100*$TCseq/$num_seq),$TCseq,$type_plural; | |
| 614 | |
| 615 my $total_mono_repeats_seq= $AAseq+$TTseq+$CCseq+$GGseq; | |
| 616 my $percent_mono_repeats=100*$total_mono_repeats_seq/$num_seq; | |
| 617 print STATS "\nSimple mononucleotide repeats:\n"; | |
| 618 printf STATS "\tNumber of %s with over 50%% mononucleotode repeats:\t%.2f %% (%d %s)\n", $type_plural, $percent_mono_repeats, $total_mono_repeats_seq, $type_plural; | |
| 619 printf STATS "\tAA:\t%.2f %% (%d %s)\n", (100*$AAseq/$num_seq),$AAseq,$type_plural; | |
| 620 printf STATS "\tTT:\t%.2f %% (%d %s)\n", (100*$TTseq/$num_seq),$TTseq,$type_plural; | |
| 621 printf STATS "\tCC:\t%.2f %% (%d %s)\n", (100*$CCseq/$num_seq),$CCseq,$type_plural; | |
| 622 printf STATS "\tGG:\t%.2f %% (%d %s)\n", (100*$GGseq/$num_seq),$GGseq,$type_plural; | |
| 623 | |
| 624 return $percent_din_repeats+$percent_mono_repeats; | |
| 625 } | |
| 626 | |
| 627 | |
| 628 sub commify_if | |
| 629 { | |
| 630 # If doCommify is >0 then converts output to commas. | |
| 631 # Formats '1234567890.01' with commas as "1,234,567,890.01 | |
| 632 # Based on: http://www.perlmonks.org/?node_id=110137 | |
| 633 my ($number,$doCommify)=@_; | |
| 634 if ($doCommify > 0) {$number =~ s/(\d)(?=(\d{3})+(\D|$))/$1\,/g;} | |
| 635 return $number; | |
| 636 } | |
| 637 | |
| 638 | |
| 639 #--------------- Produce ordered list of random numbers ------------------- | |
| 640 # This is copied from: my_random_contigs.pl | |
| 641 | |
| 642 sub getListOfRandomNumbers | |
| 643 { | |
| 644 # Use: @list=getListOfRandomNumbers(200,20); to return sorted list of 20 numbers in range from 0 to 200 inclusive. | |
| 645 my %list2=(); | |
| 646 my $i=0; | |
| 647 my $MaxNumber=$_[0]; | |
| 648 my $NumToPick=$_[1]; | |
| 649 while ($i<$NumToPick) | |
| 650 { | |
| 651 my $intRand = int(rand($MaxNumber+1)); # For Zero-based perl-arrays. The +1 means this generates random integers between 0 and $MaxNumber. (See: http://perldoc.perl.org/functions/rand.html ) | |
| 652 if ($intRand>$MaxNumber) {$intRand=$MaxNumber} # Just to be extra sure that don't exceed $MaxNumber. | |
| 653 if ( !exists($list2{$intRand}) ) {$list2{$intRand}=1; $i++;} | |
| 654 } | |
| 655 #foreach my $key(keys %list2) {print "$key ";} | |
| 656 # Sort the list of numbers: | |
| 657 #my @SortedList2 = sort { $a <=> $b } keys(%list2); | |
| 658 #return @SortedList2; | |
| 659 return (sort { $a <=> $b } keys(%list2)); | |
| 660 #print "Sorted list of ".$NumToPick." random numbers:\n"; | |
| 661 #foreach my $num(@SortedList2) {print "$num\n";} | |
| 662 #print "\n\n"; | |
| 663 } | |
| 664 | |
| 665 | |
| 666 sub print_sequences | |
| 667 { | |
| 668 # Print the sequences wrapping sequences using index array, at line length of '$fastaLineLen' characters: | |
| 669 # Uses the global '@sequences' array. | |
| 670 my $sequence_indexes_list=$_[0]; # This is an array reference, not the array itself. | |
| 671 foreach my $num(@{$sequence_indexes_list}) | |
| 672 { | |
| 673 #print "$num (max=$#sequences)\n"; | |
| 674 print STATS $sequences[$num]->[1]; # Prints the header, no "\n" needed after it. | |
| 675 my $pos=0; | |
| 676 my $seqlen=$sequences[$num]->[0]; | |
| 677 while ($pos<$seqlen) | |
| 678 { | |
| 679 print STATS substr($sequences[$num]->[2],$pos,$fastaLineLen)."\n"; | |
| 680 $pos+=$fastaLineLen; | |
| 681 } | |
| 682 print STATS "\n"; | |
| 683 } | |
| 684 } | |
| 685 | |
| 686 | |
| 687 =pod | |
| 688 # Some test runs for mono-nucleotides and dinucelotides: | |
| 689 >FUOMOGO01AQV42DUMMYA length=339 xy=0189_0676 region=1 run=R_2009_04_23_17_54_06_ | |
| 690 AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA | |
| 691 AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA | |
| 692 AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA | |
| 693 AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA | |
| 694 AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA | |
| 695 >FUOMOGO01AQV42DUMMYB length=339 xy=0189_0676 region=1 run=R_2009_04_23_17_54_06_ | |
| 696 AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA | |
| 697 AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA | |
| 698 AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA | |
| 699 AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA | |
| 700 AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA | |
| 701 >FUOMOGO01AQV42DUMMYC length=339 xy=0189_0676 region=1 run=R_2009_04_23_17_54_06_ | |
| 702 TTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTT | |
| 703 TTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTT | |
| 704 TTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTT | |
| 705 TTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTT | |
| 706 >FUOMOGO01AQV42DUMMYD length=339 xy=0189_0676 region=1 run=R_2009_04_23_17_54_06_ | |
| 707 GGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGG | |
| 708 GGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGG | |
| 709 | |
| 710 >FUOMOGO01AQV42 length=339 xy=0189_0676 region=1 run=R_2009_04_23_17_54_06_ | |
| 711 TGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTG | |
| 712 TGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTG | |
| 713 TGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTG | |
| 714 TGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTG | |
| 715 TGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTG | |
| 716 TGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGT | |
| 717 >FUOMOGO01AUK0D length=214 xy=0231_0843 region=1 run=R_2009_04_23_17_54_06_ | |
| 718 ACACACACACACACACACACACACACACACACACACACACACACACACACACACACACAC | |
| 719 ACACACACACACACACACACACACACACACACACACACACACACACACACACACACACAC | |
| 720 ACACACACACACACACACACACACACACACACACACACACACACACACACACACACACAC | |
| 721 ACACACACACACACACACACACGACGACGACGAC | |
| 722 >FUOMOGO01AUB7C length=64 xy=0228_1718 region=1 run=R_2009_04_23_17_54_06_ | |
| 723 ATATATATATATATATATATATATATATATATATATATATATATATATATAGTACGTACG | |
| 724 TACG | |
| 725 >FUOMOGO01AU00B length=213 xy=0236_1097 region=1 run=R_2009_04_23_17_54_06_ | |
| 726 ACACACACACACACACACACACACACACACACACACACACACACACACACACACACACAC | |
| 727 ACACACACACACACACACACACACACACACACACACACACACACACACACACACACACAC | |
| 728 ACACACACACACACACACACACACACACACACACACACACACACACACACACACACACAC | |
| 729 ACACACACACACACACACACGACGACGACGACG | |
| 730 >FUOMOGO01ATYRT length=169 xy=0224_0695 region=1 run=R_2009_04_23_17_54_06_ | |
| 731 TGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTG | |
| 732 TGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTG | |
| 733 TGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGT | |
| 734 >FUOMOGO01ARMLN length=400 xy=0197_2201 region=1 run=R_2009_04_23_17_54_06_ | |
| 735 TATATATATATATATATATATATATATATATATATATATATATATATATATATATATATA | |
| 736 TATAGTAGTAGTAGTATATATATATATATATATATATATATATATATATATATATATATA | |
| 737 TATATATATATATATATATATATATATATATATATATATATATATATATATATATATATA | |
| 738 TATATATATATATATATATATATATATATATATATATATATATATATATATATATATATA | |
| 739 TATATATATATATATATATATATATATATATATATATATATATATATATATATATATATA | |
| 740 TATATATATATATATATATATATATATATATATATATATATATATATATATATATATATA | |
| 741 TATATATATATATATATATATATATATATATATATATATA | |
| 742 >FUOMOGO01AVGRX length=44 xy=0241_1051 region=1 run=R_2009_04_23_17_54_06_ | |
| 743 TATATATATATATATATATATATATATATATATATATATATATA | |
| 744 >FUOMOGO01ASZ6K length=315 xy=0213_0922 region=1 run=R_2009_04_23_17_54_06_ | |
| 745 TGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTG | |
| 746 TGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTG | |
| 747 TGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTG | |
| 748 TGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTG | |
| 749 TGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTG | |
| 750 TGTGTGTGTGTGTGT | |
| 751 >FUOMOGO01ARSZF length=65 xy=0199_2281 region=1 run=R_2009_04_23_17_54_06_ | |
| 752 TATATATATATATATATATATATATATATATATATATATATATATATATATAGTACGTAC | |
| 753 GTACG | |
| 754 >FUOMOGO01AYV8U length=49 xy=0280_1324 region=1 run=R_2009_04_23_17_54_06_ | |
| 755 ATATATATATATATATATATATATATATATATATATATATATATATATA | |
| 756 >FUOMOGO01AYV9X length=40 xy=0280_1363 region=1 run=R_2009_04_23_17_54_06_ | |
| 757 TATATATATATATATATATATATATATATATATATATATA | |
| 758 >FUOMOGO01AUX4M length=40 xy=0235_1460 region=1 run=R_2009_04_23_17_54_06_ | |
| 759 TATATATATATATATATATATATATATATATATATATATA | |
| 760 >FUOMOGO01AWOTU length=54 xy=0255_0800 region=1 run=R_2009_04_23_17_54_06_ | |
| 761 ATATATATATATATATATATATATATATATATATATATATATATATATATAGTA | |
| 762 >FUOMOGO01A11TC length=66 xy=0316_1054 region=1 run=R_2009_04_23_17_54_06_ | |
| 763 ATATATATATATATATATATATATATATATATATATATATATATATATATATAGTACGTA | |
| 764 CGTACG | |
| 765 >FUOMOGO01ASRJP length=401 xy=0210_2019 region=1 run=R_2009_04_23_17_54_06_ | |
| 766 TATATATATATATATATATATATATATATATATATATATATATATATATATATAGTATAT | |
| 767 AGTAGTAGTAGTATATATATATATATATATATATATATATATATATATATATATATATAT | |
| 768 ATATATATATATATATATATATATATATATATATATATATATATATATATATATATATAT | |
| 769 ATATATATATATATATATATATATATATATATATATATATATATATATATATATATATAT | |
| 770 ATATATATATATATATATATATATATATATATATATATATATATATATATATATATATAT | |
| 771 ATATATATATATATATATATATATATATATATATATATATATATATATATATATATATAT | |
| 772 ATATATATATATATATATATATATATATATATATATATATA | |
| 773 >FUOMOGO01AU1ZH length=67 xy=0236_2363 region=1 run=R_2009_04_23_17_54_06_ | |
| 774 TATATATATATATATATATATATATATATATATATATATATATATATATATATAGTACGT | |
| 775 ACGTACG | |
| 776 =cut |
