Mercurial > repos > petr-novak > repeatrxplorer
view bin/select_and_sort_contigs.pl @ 0:1d1b9e1b2e2f draft
Uploaded
author | petr-novak |
---|---|
date | Thu, 19 Dec 2019 10:24:45 -0500 |
parents | |
children |
line wrap: on
line source
#!/usr/bin/env perl # Selects contigs exceeding the specified RD # and sort them based on their properties # which have to be provided in FASTA ID line: # >CLXContigY (lenght-RD-lenght*RD) # (RD = average Read Depth of the contig) # (GR = genome representation (also marked as "pxd") = lenght*RD) # # ver. 03 - requires input filename {fn}.info.fasta # - output saved to {fn}.info.minRDxxx.fasta # use warnings; $jm_contigs = $ARGV[0]; if ($ARGV[1]) { $min_pokryti = $ARGV[1]; } else { die "Missing argument - minRD\n"; } if ($jm_contigs =~/(\S+).fasta$/) { $jm_vystup_zakladni = $1.".minRD".$min_pokryti; } else { $jm_vystup_zakladni = $jm_contigs.".minRD".$min_pokryti; } $jm_vystup_sort_RD = $jm_vystup_zakladni."_sort-RD.fasta"; # min. read depth $jm_vystup_sort_delka = $jm_vystup_zakladni."_sort-length.fasta"; $jm_vystup_sort_pxd = $jm_vystup_zakladni."_sort-GR.fasta"; %pokryti = (); # prum. pokryti contigu ctenimi %delka_cont = (); # delka contigu (jeho consensu) %pxd = (); # pxd = pokryti x delka %sekvence = (); # sekvence contigu open (CONT, $jm_contigs) or die "Cannot open $jm_contigs\n"; while ($radek = <CONT>) { if ($radek =~/^>(CL\d+Contig\d+) \((\d+)\-(\S+)\-(\d+)\)/) { if ($3>=$min_pokryti) { # print "$1 : delka $2, prum. pokryti $3, pxd $4\n"; $delka_cont{$1} = $2; $pokryti{$1} = $3; $pxd{$1} = $4; } } } close CONT; # ze souboru contigu se nactou sekvence tech, vybranych v predchozi fazi open (CONT, $jm_contigs) or die "Cannot open $jm_contigs\n"; $ukladat = 0; while ($radek = <CONT>) { if ($radek =~/>(CL\d+Contig\d+)/) { if (exists($pokryti{$1})) { $ukladat = 1; $jmeno = $1; } else { $ukladat = 0; } } elsif ($ukladat) { $sekvence{$jmeno} .= $radek; } } close CONT; # vygeneruji se soubory, kde jsou contigy co prosly pres >= $min_pokryti setrideny # podle pokryti, delky, nebo delky x pokryti open (VYSTUP, ">$jm_vystup_sort_RD") or die "Cannot write to $jm_vystup_sort_RD\n"; foreach $klic (sort {$pokryti{$b} <=> $pokryti{$a}} keys(%pokryti)) { print VYSTUP ">$klic ($delka_cont{$klic}\-$pokryti{$klic}\-$pxd{$klic})\n"; print VYSTUP "$sekvence{$klic}"; } close VYSTUP; open (VYSTUP, ">$jm_vystup_sort_delka") or die "Cannot write to $jm_vystup_sort_delka\n"; foreach $klic (sort {$delka_cont{$b} <=> $delka_cont{$a}} keys(%delka_cont)) { print VYSTUP ">$klic ($delka_cont{$klic}\-$pokryti{$klic}\-$pxd{$klic})\n"; print VYSTUP "$sekvence{$klic}"; } close VYSTUP; open (VYSTUP, ">$jm_vystup_sort_pxd") or die "Cannot write to $jm_vystup_sort_pxd\n"; foreach $klic (sort {$pxd{$b} <=> $pxd{$a}} keys(%pxd)) { print VYSTUP ">$klic ($delka_cont{$klic}\-$pokryti{$klic}\-$pxd{$klic})\n"; print VYSTUP "$sekvence{$klic}"; } close VYSTUP; $celkem_pxd = 0; foreach $hodnota (values(%pxd)) { # print "H: $hodnota\n"; $celkem_pxd += $hodnota; } print "\nTotal GR: $celkem_pxd\n\n";