annotate bin/select_and_sort_contigs.pl @ 0:1d1b9e1b2e2f draft

Uploaded
author petr-novak
date Thu, 19 Dec 2019 10:24:45 -0500
parents
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
1 #!/usr/bin/env perl
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
2
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
3 # Selects contigs exceeding the specified RD
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
4 # and sort them based on their properties
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
5 # which have to be provided in FASTA ID line:
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
6 # >CLXContigY (lenght-RD-lenght*RD)
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
7 # (RD = average Read Depth of the contig)
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
8 # (GR = genome representation (also marked as "pxd") = lenght*RD)
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
9 #
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
10 # ver. 03 - requires input filename {fn}.info.fasta
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
11 # - output saved to {fn}.info.minRDxxx.fasta
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
12 #
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
13 use warnings;
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
14
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
15 $jm_contigs = $ARGV[0];
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
16 if ($ARGV[1]) {
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
17 $min_pokryti = $ARGV[1];
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
18 } else {
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
19 die "Missing argument - minRD\n";
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
20 }
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
21 if ($jm_contigs =~/(\S+).fasta$/) {
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
22 $jm_vystup_zakladni = $1.".minRD".$min_pokryti;
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
23 } else {
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
24 $jm_vystup_zakladni = $jm_contigs.".minRD".$min_pokryti;
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
25 }
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
26 $jm_vystup_sort_RD = $jm_vystup_zakladni."_sort-RD.fasta"; # min. read depth
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
27 $jm_vystup_sort_delka = $jm_vystup_zakladni."_sort-length.fasta";
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
28 $jm_vystup_sort_pxd = $jm_vystup_zakladni."_sort-GR.fasta";
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
29
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
30 %pokryti = (); # prum. pokryti contigu ctenimi
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
31 %delka_cont = (); # delka contigu (jeho consensu)
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
32 %pxd = (); # pxd = pokryti x delka
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
33 %sekvence = (); # sekvence contigu
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
34
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
35 open (CONT, $jm_contigs) or die "Cannot open $jm_contigs\n";
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
36 while ($radek = <CONT>) {
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
37 if ($radek =~/^>(CL\d+Contig\d+) \((\d+)\-(\S+)\-(\d+)\)/) {
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
38 if ($3>=$min_pokryti) {
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
39 # print "$1 : delka $2, prum. pokryti $3, pxd $4\n";
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
40 $delka_cont{$1} = $2;
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
41 $pokryti{$1} = $3;
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
42 $pxd{$1} = $4;
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
43 }
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
44 }
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
45 }
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
46 close CONT;
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
47
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
48 # ze souboru contigu se nactou sekvence tech, vybranych v predchozi fazi
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
49 open (CONT, $jm_contigs) or die "Cannot open $jm_contigs\n";
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
50 $ukladat = 0;
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
51 while ($radek = <CONT>) {
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
52 if ($radek =~/>(CL\d+Contig\d+)/) {
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
53 if (exists($pokryti{$1})) {
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
54 $ukladat = 1;
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
55 $jmeno = $1;
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
56 } else {
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
57 $ukladat = 0;
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
58 }
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
59 } elsif ($ukladat) {
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
60 $sekvence{$jmeno} .= $radek;
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
61 }
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
62 }
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
63 close CONT;
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
64
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
65 # vygeneruji se soubory, kde jsou contigy co prosly pres >= $min_pokryti setrideny
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
66 # podle pokryti, delky, nebo delky x pokryti
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
67 open (VYSTUP, ">$jm_vystup_sort_RD") or die "Cannot write to $jm_vystup_sort_RD\n";
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
68 foreach $klic (sort {$pokryti{$b} <=> $pokryti{$a}} keys(%pokryti)) {
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
69 print VYSTUP ">$klic ($delka_cont{$klic}\-$pokryti{$klic}\-$pxd{$klic})\n";
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
70 print VYSTUP "$sekvence{$klic}";
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
71 }
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
72 close VYSTUP;
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
73 open (VYSTUP, ">$jm_vystup_sort_delka") or die "Cannot write to $jm_vystup_sort_delka\n";
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
74 foreach $klic (sort {$delka_cont{$b} <=> $delka_cont{$a}} keys(%delka_cont)) {
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
75 print VYSTUP ">$klic ($delka_cont{$klic}\-$pokryti{$klic}\-$pxd{$klic})\n";
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
76 print VYSTUP "$sekvence{$klic}";
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
77 }
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
78 close VYSTUP;
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
79 open (VYSTUP, ">$jm_vystup_sort_pxd") or die "Cannot write to $jm_vystup_sort_pxd\n";
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
80 foreach $klic (sort {$pxd{$b} <=> $pxd{$a}} keys(%pxd)) {
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
81 print VYSTUP ">$klic ($delka_cont{$klic}\-$pokryti{$klic}\-$pxd{$klic})\n";
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
82 print VYSTUP "$sekvence{$klic}";
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
83 }
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
84 close VYSTUP;
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
85
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
86 $celkem_pxd = 0;
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
87 foreach $hodnota (values(%pxd)) {
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
88 # print "H: $hodnota\n";
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
89 $celkem_pxd += $hodnota;
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
90 }
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
91 print "\nTotal GR: $celkem_pxd\n\n";