Mercurial > repos > lsong10 > psiclass
comparison PsiCLASS-1.0.2/samtools-0.1.19/misc/export2sam.pl @ 0:903fc43d6227 draft default tip
Uploaded
author | lsong10 |
---|---|
date | Fri, 26 Mar 2021 16:52:45 +0000 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:903fc43d6227 |
---|---|
1 #!/usr/bin/env perl | |
2 # | |
3 # | |
4 # export2sam.pl converts GERALD export files to SAM format. | |
5 # | |
6 # | |
7 # | |
8 ########## License: | |
9 # | |
10 # The MIT License | |
11 # | |
12 # Original SAMtools work copyright (c) 2008-2009 Genome Research Ltd. | |
13 # Modified SAMtools work copyright (c) 2010 Illumina, Inc. | |
14 # | |
15 # Permission is hereby granted, free of charge, to any person obtaining a copy | |
16 # of this software and associated documentation files (the "Software"), to deal | |
17 # in the Software without restriction, including without limitation the rights | |
18 # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell | |
19 # copies of the Software, and to permit persons to whom the Software is | |
20 # furnished to do so, subject to the following conditions: | |
21 # | |
22 # The above copyright notice and this permission notice shall be included in | |
23 # all copies or substantial portions of the Software. | |
24 # | |
25 # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR | |
26 # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, | |
27 # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE | |
28 # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER | |
29 # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, | |
30 # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN | |
31 # THE SOFTWARE. | |
32 # | |
33 # | |
34 # | |
35 # | |
36 ########## ChangeLog: | |
37 # | |
38 # Version: 2.3.1 (18MAR2011) | |
39 # | |
40 # - Restore file '-' as stdin input. | |
41 # | |
42 # Version: 2.3.0 (24JAN2011) | |
43 # | |
44 # - Add support for export reserved chromosome name "CONTROL", | |
45 # which is translated to optional field "XC:Z:CONTROL". | |
46 # - Check for ".gz" file extension on export files and open | |
47 # these as gzip pipes when the extension is found. | |
48 # | |
49 # Version: 2.2.0 (16NOV2010) | |
50 # | |
51 # - Remove any leading zeros in export fields: RUNNO,LANE,TILE,X,Y | |
52 # - For export records with reserved chromosome name identifiers | |
53 # "QC" and "RM", add the optional field "XC:Z:QC" or "XC:Z:RM" | |
54 # to the SAM record, so that these cases can be distinguished | |
55 # from other unmatched reads. | |
56 # | |
57 # Version: 2.1.0 (21SEP2010) | |
58 # | |
59 # - Additional export record error checking. | |
60 # - Convert export records with chromomsome value of "RM" to unmapped | |
61 # SAM records. | |
62 # | |
63 # Version: 2.0.0 (15FEB2010) | |
64 # | |
65 # Script updated by Illumina in conjunction with CASAVA 1.7.0 | |
66 # release. | |
67 # | |
68 # Major changes are as follows: | |
69 # - The CIGAR string has been updated to include all gaps from | |
70 # ELANDv2 alignments. | |
71 # - The ELAND single read alignment score is always stored in the | |
72 # optional "SM" field and the ELAND paired read alignment score | |
73 # is stored in the optional "AS" field when it exists. | |
74 # - The MAPQ value is set to the higher of the two alignment scores, | |
75 # but no greater than 254, i.e. min(254,max(SM,AS)) | |
76 # - The SAM "proper pair" bit (0x0002) is now set for read pairs | |
77 # meeting ELAND's expected orientation and insert size criteria. | |
78 # - The default quality score translation is set for export files | |
79 # which contain Phread+64 quality values. An option, | |
80 # "--qlogodds", has been added to translate quality values from | |
81 # the Solexa+64 format used in export files prior to Pipeline | |
82 # 1.3 | |
83 # - The export match descriptor is now reverse-complemented when | |
84 # necessary such that it always corresponds to the forward | |
85 # strand of the reference, to be consistent with other | |
86 # information in the SAM record. It is now written to the | |
87 # optional 'XD' field (rather than 'MD') to acknowledge its | |
88 # minor differences from the samtools match descriptor (see | |
89 # additional detail below). | |
90 # - An option, "--nofilter", has been added to include reads which | |
91 # have failed primary analysis quality filtration. Such reads | |
92 # will have the corresponding SAM flag bit (0x0200) set. | |
93 # - Labels in the export 'contig' field are preserved by setting | |
94 # RNAME to "$export_chromosome/$export_contig" when the contig | |
95 # label exists. | |
96 # | |
97 # | |
98 # Contact: lh3 | |
99 # Version: 0.1.2 (03JAN2009) | |
100 # | |
101 # | |
102 # | |
103 ########## Known Conversion Limitations: | |
104 # | |
105 # - Export records for reads that map to a position < 1 (allowed | |
106 # in export format), are converted to unmapped reads in the SAM | |
107 # record. | |
108 # - Export records contain the reserved chromosome names: "NM", | |
109 # "QC","RM" and "CONTROL". "NM" indicates that the aligner could | |
110 # not map the read to the reference sequence set. "QC" means that | |
111 # the aligner did not attempt to map the read due to some | |
112 # technical limitation. "RM" means that the read mapped to a set | |
113 # of 'contaminant' sequences specified in GERALD's RNA-seq | |
114 # workflow. "CONTROL" means that the read is a control. All of | |
115 # these alignment types are collapsed to the single unmapped | |
116 # alignment state in the SAM record, but the optional SAM "XC" | |
117 # field is used to record the original reserved chromosome name of | |
118 # the read for all but the "NM" case. | |
119 # - The export match descriptor is slightly different than the | |
120 # samtools match descriptor. For this reason it is stored in the | |
121 # optional SAM field 'XD' (and not 'MD'). Note that the export | |
122 # match descriptor differs from the samtools version in two | |
123 # respects: (1) indels are explicitly closed with the '$' | |
124 # character and (2) insertions must be enumerated in the match | |
125 # descriptor. For example a 35-base read with a two-base insertion | |
126 # is described as: 20^2$14 | |
127 # | |
128 # | |
129 # | |
130 | |
131 my $version = "2.3.1"; | |
132 | |
133 use strict; | |
134 use warnings; | |
135 | |
136 use Getopt::Long; | |
137 use File::Spec; | |
138 use List::Util qw(min max); | |
139 | |
140 | |
141 use constant { | |
142 EXPORT_MACHINE => 0, | |
143 EXPORT_RUNNO => 1, | |
144 EXPORT_LANE => 2, | |
145 EXPORT_TILE => 3, | |
146 EXPORT_X => 4, | |
147 EXPORT_Y => 5, | |
148 EXPORT_INDEX => 6, | |
149 EXPORT_READNO => 7, | |
150 EXPORT_READ => 8, | |
151 EXPORT_QUAL => 9, | |
152 EXPORT_CHROM => 10, | |
153 EXPORT_CONTIG => 11, | |
154 EXPORT_POS => 12, | |
155 EXPORT_STRAND => 13, | |
156 EXPORT_MD => 14, | |
157 EXPORT_SEMAP => 15, | |
158 EXPORT_PEMAP => 16, | |
159 EXPORT_PASSFILT => 21, | |
160 EXPORT_SIZE => 22, | |
161 }; | |
162 | |
163 | |
164 use constant { | |
165 SAM_QNAME => 0, | |
166 SAM_FLAG => 1, | |
167 SAM_RNAME => 2, | |
168 SAM_POS => 3, | |
169 SAM_MAPQ => 4, | |
170 SAM_CIGAR => 5, | |
171 SAM_MRNM => 6, | |
172 SAM_MPOS => 7, | |
173 SAM_ISIZE => 8, | |
174 SAM_SEQ => 9, | |
175 SAM_QUAL => 10, | |
176 }; | |
177 | |
178 | |
179 # function prototypes for Richard's code | |
180 sub match_desc_to_cigar($); | |
181 sub match_desc_frag_length($); | |
182 sub reverse_compl_match_descriptor($); | |
183 sub write_header($;$;$); | |
184 | |
185 | |
186 &export2sam; | |
187 exit; | |
188 | |
189 | |
190 | |
191 | |
192 sub export2sam { | |
193 | |
194 my $cmdline = $0 . " " . join(" ",@ARGV); | |
195 my $arg_count = scalar @ARGV; | |
196 my $progname = (File::Spec->splitpath($0))[2]; | |
197 | |
198 my $is_logodds_qvals = 0; # if true, assume files contain logodds (i.e. "solexa") quality values | |
199 my $is_nofilter = 0; | |
200 my $read1file; | |
201 my $read2file; | |
202 my $print_version = 0; | |
203 my $help = 0; | |
204 | |
205 my $result = GetOptions( "qlogodds" => \$is_logodds_qvals, | |
206 "nofilter" => \$is_nofilter, | |
207 "read1=s" => \$read1file, | |
208 "read2=s" => \$read2file, | |
209 "version" => \$print_version, | |
210 "help" => \$help ); | |
211 | |
212 my $usage = <<END; | |
213 | |
214 $progname converts GERALD export files to SAM format. | |
215 | |
216 Usage: $progname --read1=FILENAME [ options ] | --version | --help | |
217 | |
218 --read1=FILENAME read1 export file or '-' for stdin (mandatory) | |
219 (file may be gzipped with ".gz" extension) | |
220 --read2=FILENAME read2 export file or '-' for stdin | |
221 (file may be gzipped with ".gz" extension) | |
222 --nofilter include reads that failed the basecaller | |
223 purity filter | |
224 --qlogodds assume export file(s) use logodds quality values | |
225 as reported by OLB (Pipeline) prior to v1.3 | |
226 (default: phred quality values) | |
227 | |
228 END | |
229 | |
230 my $version_msg = <<END; | |
231 | |
232 $progname version: $version | |
233 | |
234 END | |
235 | |
236 if((not $result) or $help or ($arg_count==0)) { | |
237 die($usage); | |
238 } | |
239 | |
240 if(@ARGV) { | |
241 print STDERR "\nERROR: Unrecognized arguments: " . join(" ",@ARGV) . "\n\n"; | |
242 die($usage); | |
243 } | |
244 | |
245 if($print_version) { | |
246 die($version_msg); | |
247 } | |
248 | |
249 if(not defined($read1file)) { | |
250 print STDERR "\nERROR: read1 export file must be specified\n\n"; | |
251 die($usage); | |
252 } | |
253 | |
254 unless((-f $read1file) or ($read1file eq '-')) { | |
255 die("\nERROR: Can't find read1 export file: '$read1file'\n\n"); | |
256 } | |
257 | |
258 if (defined $read2file) { | |
259 unless((-f $read2file) or ($read2file eq '-')) { | |
260 die("\nERROR: Can't find read2 export file: '$read2file'\n\n"); | |
261 } | |
262 if($read1file eq $read2file) { | |
263 die("\nERROR: read1 and read2 export filenames are the same: '$read1file'\n\n"); | |
264 } | |
265 } | |
266 | |
267 my ($fh1, $fh2, $is_paired); | |
268 | |
269 my $read1cmd="$read1file"; | |
270 $read1cmd = "gzip -dc $read1file |" if($read1file =~ /\.gz$/); | |
271 open($fh1, $read1cmd) | |
272 or die("\nERROR: Can't open read1 process: '$read1cmd'\n\n"); | |
273 $is_paired = defined $read2file; | |
274 if ($is_paired) { | |
275 my $read2cmd="$read2file"; | |
276 $read2cmd = "gzip -dc $read2file |" if($read2file =~ /\.gz$/); | |
277 open($fh2, $read2cmd) | |
278 or die("\nERROR: Can't open read2 process: '$read2cmd'\n\n"); | |
279 } | |
280 # quality value conversion table | |
281 my @conv_table; | |
282 if($is_logodds_qvals){ # convert from solexa+64 quality values (pipeline pre-v1.3): | |
283 for (-64..64) { | |
284 $conv_table[$_+64] = int(33 + 10*log(1+10**($_/10.0))/log(10)+.499); | |
285 } | |
286 } else { # convert from phred+64 quality values (pipeline v1.3+): | |
287 for (-64..-1) { | |
288 $conv_table[$_+64] = undef; | |
289 } | |
290 for (0..64) { | |
291 $conv_table[$_+64] = int(33 + $_); | |
292 } | |
293 } | |
294 # write the header | |
295 print write_header( $progname, $version, $cmdline ); | |
296 # core loop | |
297 my $export_line_count = 0; | |
298 while (<$fh1>) { | |
299 $export_line_count++; | |
300 my (@s1, @s2); | |
301 &export2sam_aux($_, $export_line_count, \@s1, \@conv_table, $is_paired, 1, $is_nofilter); | |
302 if ($is_paired) { | |
303 my $read2line = <$fh2>; | |
304 if(not $read2line){ | |
305 die("\nERROR: read1 and read2 export files do not contain the same number of reads.\n Extra reads observed in read1 file at line no: $export_line_count.\n\n"); | |
306 } | |
307 &export2sam_aux($read2line, $export_line_count, \@s2, \@conv_table, $is_paired, 2, $is_nofilter); | |
308 | |
309 if (@s1 && @s2) { # then set mate coordinate | |
310 if($s1[SAM_QNAME] ne $s2[SAM_QNAME]){ | |
311 die("\nERROR: Non-paired reads in export files on line: $export_line_count.\n Read1: $_ Read2: $read2line\n"); | |
312 } | |
313 | |
314 my $isize = 0; | |
315 if ($s1[SAM_RNAME] ne '*' && $s1[SAM_RNAME] eq $s2[SAM_RNAME]) { # then calculate $isize | |
316 my $x1 = ($s1[SAM_FLAG] & 0x10)? $s1[SAM_POS] + length($s1[SAM_SEQ]) : $s1[SAM_POS]; | |
317 my $x2 = ($s2[SAM_FLAG] & 0x10)? $s2[SAM_POS] + length($s2[SAM_SEQ]) : $s2[SAM_POS]; | |
318 $isize = $x2 - $x1; | |
319 } | |
320 | |
321 foreach ([\@s1,\@s2,$isize],[\@s2,\@s1,-$isize]){ | |
322 my ($sa,$sb,$is) = @{$_}; | |
323 if ($sb->[SAM_RNAME] ne '*') { | |
324 $sa->[SAM_MRNM] = ($sb->[SAM_RNAME] eq $sa->[SAM_RNAME]) ? "=" : $sb->[SAM_RNAME]; | |
325 $sa->[SAM_MPOS] = $sb->[SAM_POS]; | |
326 $sa->[SAM_ISIZE] = $is; | |
327 $sa->[SAM_FLAG] |= 0x20 if ($sb->[SAM_FLAG] & 0x10); | |
328 } else { | |
329 $sa->[SAM_FLAG] |= 0x8; | |
330 } | |
331 } | |
332 } | |
333 } | |
334 print join("\t", @s1), "\n" if (@s1); | |
335 print join("\t", @s2), "\n" if (@s2 && $is_paired); | |
336 } | |
337 close($fh1); | |
338 if($is_paired) { | |
339 while(my $read2line = <$fh2>){ | |
340 $export_line_count++; | |
341 die("\nERROR: read1 and read2 export files do not contain the same number of reads.\n Extra reads observed in read2 file at line no: $export_line_count.\n\n"); | |
342 } | |
343 close($fh2); | |
344 } | |
345 } | |
346 | |
347 sub export2sam_aux { | |
348 my ($line, $line_no, $s, $ct, $is_paired, $read_no, $is_nofilter) = @_; | |
349 chomp($line); | |
350 my @t = split("\t", $line); | |
351 if(scalar(@t) < EXPORT_SIZE) { | |
352 my $msg="\nERROR: Unexpected number of fields in export record on line $line_no of read$read_no export file. Found " . scalar(@t) . " fields but expected " . EXPORT_SIZE . ".\n"; | |
353 $msg.="\t...erroneous export record:\n" . $line . "\n\n"; | |
354 die($msg); | |
355 } | |
356 @$s = (); | |
357 my $isPassFilt = ($t[EXPORT_PASSFILT] eq 'Y'); | |
358 return if(not ($isPassFilt or $is_nofilter)); | |
359 # read name | |
360 my $samQnamePrefix = $t[EXPORT_MACHINE] . (($t[EXPORT_RUNNO] ne "") ? "_" . int($t[EXPORT_RUNNO]) : ""); | |
361 $s->[SAM_QNAME] = join(':', $samQnamePrefix, int($t[EXPORT_LANE]), int($t[EXPORT_TILE]), | |
362 int($t[EXPORT_X]), int($t[EXPORT_Y])); | |
363 # initial flag (will be updated later) | |
364 $s->[SAM_FLAG] = 0; | |
365 if($is_paired) { | |
366 if($t[EXPORT_READNO] != $read_no){ | |
367 die("\nERROR: read$read_no export file contains record with read number: " .$t[EXPORT_READNO] . " on line: $line_no\n\n"); | |
368 } | |
369 $s->[SAM_FLAG] |= 1 | 1<<(5 + $read_no); | |
370 } | |
371 $s->[SAM_FLAG] |= 0x200 if (not $isPassFilt); | |
372 | |
373 # read & quality | |
374 my $is_export_rev = ($t[EXPORT_STRAND] eq 'R'); | |
375 if ($is_export_rev) { # then reverse the sequence and quality | |
376 $s->[SAM_SEQ] = reverse($t[EXPORT_READ]); | |
377 $s->[SAM_SEQ] =~ tr/ACGTacgt/TGCAtgca/; | |
378 $s->[SAM_QUAL] = reverse($t[EXPORT_QUAL]); | |
379 } else { | |
380 $s->[SAM_SEQ] = $t[EXPORT_READ]; | |
381 $s->[SAM_QUAL] = $t[EXPORT_QUAL]; | |
382 } | |
383 my @convqual = (); | |
384 foreach (unpack('C*', $s->[SAM_QUAL])){ | |
385 my $val=$ct->[$_]; | |
386 if(not defined $val){ | |
387 my $msg="\nERROR: can't interpret export quality value: " . $_ . " in read$read_no export file, line: $line_no\n"; | |
388 if( $_ < 64 ) { $msg .= " Use --qlogodds flag to translate logodds (solexa) quality values.\n"; } | |
389 die($msg . "\n"); | |
390 } | |
391 push @convqual,$val; | |
392 } | |
393 | |
394 $s->[SAM_QUAL] = pack('C*',@convqual); # change coding | |
395 | |
396 | |
397 # coor | |
398 my $has_coor = 0; | |
399 $s->[SAM_RNAME] = "*"; | |
400 if (($t[EXPORT_CHROM] eq 'NM') or | |
401 ($t[EXPORT_CHROM] eq 'QC') or | |
402 ($t[EXPORT_CHROM] eq 'RM') or | |
403 ($t[EXPORT_CHROM] eq 'CONTROL')) { | |
404 $s->[SAM_FLAG] |= 0x4; # unmapped | |
405 push(@$s,"XC:Z:".$t[EXPORT_CHROM]) if($t[EXPORT_CHROM] ne 'NM'); | |
406 } elsif ($t[EXPORT_CHROM] =~ /(\d+):(\d+):(\d+)/) { | |
407 $s->[SAM_FLAG] |= 0x4; # TODO: should I set BAM_FUNMAP in this case? | |
408 push(@$s, "H0:i:$1", "H1:i:$2", "H2:i:$3") | |
409 } elsif ($t[EXPORT_POS] < 1) { | |
410 $s->[SAM_FLAG] |= 0x4; # unmapped | |
411 } else { | |
412 $s->[SAM_RNAME] = $t[EXPORT_CHROM]; | |
413 $s->[SAM_RNAME] .= "/" . $t[EXPORT_CONTIG] if($t[EXPORT_CONTIG] ne ''); | |
414 $has_coor = 1; | |
415 } | |
416 $s->[SAM_POS] = $has_coor? $t[EXPORT_POS] : 0; | |
417 | |
418 # print STDERR "t[14] = " . $t[14] . "\n"; | |
419 my $matchDesc = ''; | |
420 $s->[SAM_CIGAR] = "*"; | |
421 if($has_coor){ | |
422 $matchDesc = ($is_export_rev) ? reverse_compl_match_descriptor($t[EXPORT_MD]) : $t[EXPORT_MD]; | |
423 | |
424 if($matchDesc =~ /\^/){ | |
425 # construct CIGAR string using Richard's function | |
426 $s->[SAM_CIGAR] = match_desc_to_cigar($matchDesc); # indel processing | |
427 } else { | |
428 $s->[SAM_CIGAR] = length($s->[SAM_SEQ]) . "M"; | |
429 } | |
430 } | |
431 | |
432 # print STDERR "cigar_string = $cigar_string\n"; | |
433 | |
434 $s->[SAM_FLAG] |= 0x10 if ($has_coor && $is_export_rev); | |
435 if($has_coor){ | |
436 my $semap = ($t[EXPORT_SEMAP] ne '') ? $t[EXPORT_SEMAP] : 0; | |
437 my $pemap = 0; | |
438 if($is_paired) { | |
439 $pemap = ($t[EXPORT_PEMAP] ne '') ? $t[EXPORT_PEMAP] : 0; | |
440 | |
441 # set `proper pair' bit if non-blank, non-zero PE alignment score: | |
442 $s->[SAM_FLAG] |= 0x02 if ($pemap > 0); | |
443 } | |
444 $s->[SAM_MAPQ] = min(254,max($semap,$pemap)); | |
445 } else { | |
446 $s->[SAM_MAPQ] = 0; | |
447 } | |
448 # mate coordinate | |
449 $s->[SAM_MRNM] = '*'; | |
450 $s->[SAM_MPOS] = 0; | |
451 $s->[SAM_ISIZE] = 0; | |
452 # aux | |
453 push(@$s, "BC:Z:$t[EXPORT_INDEX]") if ($t[EXPORT_INDEX]); | |
454 if($has_coor){ | |
455 # The export match descriptor differs slightly from the samtools match descriptor. | |
456 # In order for the converted SAM files to be as compliant as possible, | |
457 # we put the export match descriptor in optional field 'XD' rather than 'MD': | |
458 push(@$s, "XD:Z:$matchDesc"); | |
459 push(@$s, "SM:i:$t[EXPORT_SEMAP]") if ($t[EXPORT_SEMAP] ne ''); | |
460 push(@$s, "AS:i:$t[EXPORT_PEMAP]") if ($is_paired and ($t[EXPORT_PEMAP] ne '')); | |
461 } | |
462 } | |
463 | |
464 | |
465 | |
466 # | |
467 # the following code is taken from Richard Shaw's sorted2sam.pl file | |
468 # | |
469 sub reverse_compl_match_descriptor($) | |
470 { | |
471 # print "\nREVERSING THE MATCH DESCRIPTOR!\n"; | |
472 my ($match_desc) = @_; | |
473 my $rev_compl_match_desc = reverse($match_desc); | |
474 $rev_compl_match_desc =~ tr/ACGT\^\$/TGCA\$\^/; | |
475 | |
476 # Unreverse the digits of numbers. | |
477 $rev_compl_match_desc = join('', | |
478 map {($_ =~ /\d+/) | |
479 ? join('', reverse(split('', $_))) | |
480 : $_} split(/(\d+)/, | |
481 $rev_compl_match_desc)); | |
482 | |
483 return $rev_compl_match_desc; | |
484 } | |
485 | |
486 | |
487 | |
488 sub match_desc_to_cigar($) | |
489 { | |
490 my ($match_desc) = @_; | |
491 | |
492 my @match_desc_parts = split(/(\^.*?\$)/, $match_desc); | |
493 my $cigar_str = ''; | |
494 my $cigar_del_ch = 'D'; | |
495 my $cigar_ins_ch = 'I'; | |
496 my $cigar_match_ch = 'M'; | |
497 | |
498 foreach my $match_desc_part (@match_desc_parts) { | |
499 next if (!$match_desc_part); | |
500 | |
501 if ($match_desc_part =~ /^\^([ACGTN]+)\$$/) { | |
502 # Deletion | |
503 $cigar_str .= (length($1) . $cigar_del_ch); | |
504 } elsif ($match_desc_part =~ /^\^(\d+)\$$/) { | |
505 # Insertion | |
506 $cigar_str .= ($1 . $cigar_ins_ch); | |
507 } else { | |
508 $cigar_str .= (match_desc_frag_length($match_desc_part) | |
509 . $cigar_match_ch); | |
510 } | |
511 } | |
512 | |
513 return $cigar_str; | |
514 } | |
515 | |
516 | |
517 #------------------------------------------------------------------------------ | |
518 | |
519 sub match_desc_frag_length($) | |
520 { | |
521 my ($match_desc_str) = @_; | |
522 my $len = 0; | |
523 | |
524 my @match_desc_fields = split(/([ACGTN]+)/, $match_desc_str); | |
525 | |
526 foreach my $match_desc_field (@match_desc_fields) { | |
527 next if ($match_desc_field eq ''); | |
528 | |
529 $len += (($match_desc_field =~ /(\d+)/) | |
530 ? $1 : length($match_desc_field)); | |
531 } | |
532 | |
533 return $len; | |
534 } | |
535 | |
536 | |
537 # argument holds the command line | |
538 sub write_header($;$;$) | |
539 { | |
540 my ($progname,$version,$cl) = @_; | |
541 my $complete_header = ""; | |
542 $complete_header .= "\@PG\tID:$progname\tVN:$version\tCL:$cl\n"; | |
543 | |
544 return $complete_header; | |
545 } |