0
|
1 #!/usr/bin/env perl
|
|
2
|
|
3 # Parses align file, calculates read depth (RD) and genome representation (GR)
|
|
4 # of individual contigs; outputs contig sequences in fasta format with this
|
|
5 # information in the fasta header:
|
|
6 # >CLXConitgY (length-average_RD-GR)
|
|
7 #
|
|
8 # RD profiles along the contig sequences can also be calculated and saved
|
|
9 # in a separate file.
|
|
10 #
|
|
11 # (renamed from align_parsing_2.pl, 2010-05-10)
|
|
12
|
|
13
|
|
14 use Getopt::Std;
|
|
15 use warnings;
|
|
16
|
|
17 getopt('iop');
|
|
18 if ($opt_i) {
|
|
19 $align_file = $opt_i; #
|
|
20 } else {
|
|
21 die "Missing parameter: -i align_file\n";
|
|
22 }
|
|
23 if ($opt_o) {
|
|
24 $out_file = $opt_o; #
|
|
25 } else {
|
|
26 $out_file = $align_file.".info";
|
|
27 print "Output file not set, using \"$out_file\"\n";
|
|
28 }
|
|
29 if ($opt_p) {
|
|
30 $profile_file = $opt_p; # RD profiles will be recorded
|
|
31 } else {
|
|
32 $profile_file = 0; # will be used below to switch profile output off
|
|
33 print "Parameter -p not set, RD profiles will not be saved\n";
|
|
34 }
|
|
35
|
|
36 open (ALIGN,$align_file) or die "Could not read from $align_file\n";
|
|
37 open (OUT,">$out_file") or die "Could not write to $out_file\n";
|
|
38 if ($profile_file) {
|
|
39 open (PROF,">$profile_file") or die "Could not write to $profile_file\n";
|
|
40 }
|
|
41
|
|
42 while ($radek = <ALIGN>) {
|
|
43 if ($radek =~/^DETAILED DISPLAY OF CONTIGS/) { # start of alignment section
|
|
44 # print $radek;
|
|
45 $radek = <ALIGN>;
|
|
46 while ($radek =~/\*\*\* (CL\d+) {0,1}Contig (\d+) \*\*\*/) { # parsing individual contigs
|
|
47 # print $radek;
|
|
48 $contig_id = $1."Contig".$2;
|
|
49 @pokryti = (); # array ve ktere bude pro kazdou pozici v sekvenci pokryti
|
|
50 $seq = ""; # consensus sekvence contigu
|
|
51 # print "$contig_id\n";
|
|
52 # print $radek;
|
|
53 $radek = <ALIGN>;
|
|
54 while ($radek =~/\: \. \:/) { # -> nasleduje dalsi blok
|
|
55 @blok = (); # radky z aktualniho bloku alignmentu
|
|
56 do {
|
|
57 $radek = <ALIGN>;
|
|
58 push(@blok,$radek);
|
|
59 } while (not $radek =~/^consensus/);
|
|
60 $cons_line = $radek;
|
|
61 $radek = <ALIGN>;
|
|
62 $radek = <ALIGN>; # toto posledni nacteni radku slouzi v podmince tohoto i nadrazeneho cyklu while
|
|
63 if (not $radek) { $radek = " "; }; # aby to nehlasilo chyby na konci souboru kde ty radky muzou chybet
|
|
64
|
|
65 pop(@blok); pop(@blok); # odstrani posledni dva radky (mezera a consensus)
|
|
66
|
|
67 for ($f=10;$f<=length($cons_line);$f++) {
|
|
68 $suma_pozice = 0;
|
|
69 if (substr($cons_line,$f,1) =~/([A,T,G,C,N])/) {
|
|
70 $seq .= $1;
|
|
71 foreach $cteni (@blok) {
|
|
72 if (substr($cteni,$f,1) =~/[A,T,G,C,N]/) {
|
|
73 $suma_pozice++;
|
|
74 }
|
|
75 }
|
|
76 push(@pokryti,$suma_pozice);
|
|
77 }
|
|
78 }
|
|
79
|
|
80 }
|
|
81
|
|
82 $delka_cons = @pokryti;
|
|
83 $soucet = 0;
|
|
84 $prumer = 0;
|
|
85 foreach $suma_pozice (@pokryti) {
|
|
86 $soucet += $suma_pozice;
|
|
87 }
|
|
88 $prumer = sprintf("%0.1f",$soucet/$delka_cons);
|
|
89
|
|
90 print OUT ">$contig_id ($delka_cons-$prumer-$soucet)\n";
|
|
91 while ($seq_line = substr($seq,0,60,"")) {
|
|
92 print OUT "$seq_line\n";
|
|
93 }
|
|
94 if ($profile_file) {
|
|
95 print PROF ">$contig_id ($delka_cons-$prumer-$soucet)\n";
|
|
96 foreach $suma_pozice (@pokryti) {
|
|
97 print PROF "$suma_pozice ";
|
|
98 }
|
|
99 print PROF "\n";
|
|
100 }
|
|
101
|
|
102 }
|
|
103 }
|
|
104 }
|
|
105
|
|
106 close ALIGN;
|
|
107 close OUT;
|
|
108 if ($profile_file) {
|
|
109 close PROF;
|
|
110 }
|
|
111
|