Mercurial > repos > mvdbeek > mismatch_frequencies
annotate mismatch_frequencies.py @ 2:2974c382105c draft default tip
planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/mismatch_frequencies commit 10a7e3877c2568d9c23de53fc97dc1c902ff0524-dirty
author | mvdbeek |
---|---|
date | Sat, 22 Dec 2018 04:15:47 -0500 |
parents | 77de5fc623f9 |
children |
rev | line source |
---|---|
2
2974c382105c
planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/mismatch_frequencies commit 10a7e3877c2568d9c23de53fc97dc1c902ff0524-dirty
mvdbeek
parents:
0
diff
changeset
|
1 import re |
2974c382105c
planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/mismatch_frequencies commit 10a7e3877c2568d9c23de53fc97dc1c902ff0524-dirty
mvdbeek
parents:
0
diff
changeset
|
2 import string |
2974c382105c
planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/mismatch_frequencies commit 10a7e3877c2568d9c23de53fc97dc1c902ff0524-dirty
mvdbeek
parents:
0
diff
changeset
|
3 import pysam |
2974c382105c
planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/mismatch_frequencies commit 10a7e3877c2568d9c23de53fc97dc1c902ff0524-dirty
mvdbeek
parents:
0
diff
changeset
|
4 import matplotlib |
0
77de5fc623f9
planemo upload for repository https://bitbucket.org/drosofff/gedtools/
mvdbeek
parents:
diff
changeset
|
5 import pandas as pd |
77de5fc623f9
planemo upload for repository https://bitbucket.org/drosofff/gedtools/
mvdbeek
parents:
diff
changeset
|
6 from collections import defaultdict |
77de5fc623f9
planemo upload for repository https://bitbucket.org/drosofff/gedtools/
mvdbeek
parents:
diff
changeset
|
7 from collections import OrderedDict |
77de5fc623f9
planemo upload for repository https://bitbucket.org/drosofff/gedtools/
mvdbeek
parents:
diff
changeset
|
8 import argparse |
77de5fc623f9
planemo upload for repository https://bitbucket.org/drosofff/gedtools/
mvdbeek
parents:
diff
changeset
|
9 import itertools |
77de5fc623f9
planemo upload for repository https://bitbucket.org/drosofff/gedtools/
mvdbeek
parents:
diff
changeset
|
10 |
2
2974c382105c
planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/mismatch_frequencies commit 10a7e3877c2568d9c23de53fc97dc1c902ff0524-dirty
mvdbeek
parents:
0
diff
changeset
|
11 matplotlib.use('pdf') |
2974c382105c
planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/mismatch_frequencies commit 10a7e3877c2568d9c23de53fc97dc1c902ff0524-dirty
mvdbeek
parents:
0
diff
changeset
|
12 import matplotlib.pyplot as plt # noqa: E402 |
2974c382105c
planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/mismatch_frequencies commit 10a7e3877c2568d9c23de53fc97dc1c902ff0524-dirty
mvdbeek
parents:
0
diff
changeset
|
13 |
2974c382105c
planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/mismatch_frequencies commit 10a7e3877c2568d9c23de53fc97dc1c902ff0524-dirty
mvdbeek
parents:
0
diff
changeset
|
14 |
0
77de5fc623f9
planemo upload for repository https://bitbucket.org/drosofff/gedtools/
mvdbeek
parents:
diff
changeset
|
15 class MismatchFrequencies: |
77de5fc623f9
planemo upload for repository https://bitbucket.org/drosofff/gedtools/
mvdbeek
parents:
diff
changeset
|
16 '''Iterate over a SAM/BAM alignment file, collecting reads with mismatches. One |
77de5fc623f9
planemo upload for repository https://bitbucket.org/drosofff/gedtools/
mvdbeek
parents:
diff
changeset
|
17 class instance per alignment file. The result_dict attribute will contain a |
77de5fc623f9
planemo upload for repository https://bitbucket.org/drosofff/gedtools/
mvdbeek
parents:
diff
changeset
|
18 nested dictionary with name, readlength and mismatch count.''' |
2
2974c382105c
planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/mismatch_frequencies commit 10a7e3877c2568d9c23de53fc97dc1c902ff0524-dirty
mvdbeek
parents:
0
diff
changeset
|
19 def __init__(self, result_dict={}, alignment_file=None, name="name", minimal_readlength=21, |
0
77de5fc623f9
planemo upload for repository https://bitbucket.org/drosofff/gedtools/
mvdbeek
parents:
diff
changeset
|
20 maximal_readlength=21, |
2
2974c382105c
planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/mismatch_frequencies commit 10a7e3877c2568d9c23de53fc97dc1c902ff0524-dirty
mvdbeek
parents:
0
diff
changeset
|
21 number_of_allowed_mismatches=1, |
2974c382105c
planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/mismatch_frequencies commit 10a7e3877c2568d9c23de53fc97dc1c902ff0524-dirty
mvdbeek
parents:
0
diff
changeset
|
22 ignore_5p_nucleotides=0, |
0
77de5fc623f9
planemo upload for repository https://bitbucket.org/drosofff/gedtools/
mvdbeek
parents:
diff
changeset
|
23 ignore_3p_nucleotides=0, |
2
2974c382105c
planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/mismatch_frequencies commit 10a7e3877c2568d9c23de53fc97dc1c902ff0524-dirty
mvdbeek
parents:
0
diff
changeset
|
24 possible_mismatches=[ |
0
77de5fc623f9
planemo upload for repository https://bitbucket.org/drosofff/gedtools/
mvdbeek
parents:
diff
changeset
|
25 'AC', 'AG', 'AT', |
77de5fc623f9
planemo upload for repository https://bitbucket.org/drosofff/gedtools/
mvdbeek
parents:
diff
changeset
|
26 'CA', 'CG', 'CT', |
77de5fc623f9
planemo upload for repository https://bitbucket.org/drosofff/gedtools/
mvdbeek
parents:
diff
changeset
|
27 'GA', 'GC', 'GT', |
77de5fc623f9
planemo upload for repository https://bitbucket.org/drosofff/gedtools/
mvdbeek
parents:
diff
changeset
|
28 'TA', 'TC', 'TG' |
77de5fc623f9
planemo upload for repository https://bitbucket.org/drosofff/gedtools/
mvdbeek
parents:
diff
changeset
|
29 ]): |
2
2974c382105c
planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/mismatch_frequencies commit 10a7e3877c2568d9c23de53fc97dc1c902ff0524-dirty
mvdbeek
parents:
0
diff
changeset
|
30 |
0
77de5fc623f9
planemo upload for repository https://bitbucket.org/drosofff/gedtools/
mvdbeek
parents:
diff
changeset
|
31 self.result_dict = result_dict |
77de5fc623f9
planemo upload for repository https://bitbucket.org/drosofff/gedtools/
mvdbeek
parents:
diff
changeset
|
32 self.name = name |
77de5fc623f9
planemo upload for repository https://bitbucket.org/drosofff/gedtools/
mvdbeek
parents:
diff
changeset
|
33 self.minimal_readlength = minimal_readlength |
77de5fc623f9
planemo upload for repository https://bitbucket.org/drosofff/gedtools/
mvdbeek
parents:
diff
changeset
|
34 self.maximal_readlength = maximal_readlength |
77de5fc623f9
planemo upload for repository https://bitbucket.org/drosofff/gedtools/
mvdbeek
parents:
diff
changeset
|
35 self.number_of_allowed_mismatches = number_of_allowed_mismatches |
77de5fc623f9
planemo upload for repository https://bitbucket.org/drosofff/gedtools/
mvdbeek
parents:
diff
changeset
|
36 self.ignore_5p_nucleotides = ignore_5p_nucleotides |
77de5fc623f9
planemo upload for repository https://bitbucket.org/drosofff/gedtools/
mvdbeek
parents:
diff
changeset
|
37 self.ignore_3p_nucleotides = ignore_3p_nucleotides |
77de5fc623f9
planemo upload for repository https://bitbucket.org/drosofff/gedtools/
mvdbeek
parents:
diff
changeset
|
38 self.possible_mismatches = possible_mismatches |
2
2974c382105c
planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/mismatch_frequencies commit 10a7e3877c2568d9c23de53fc97dc1c902ff0524-dirty
mvdbeek
parents:
0
diff
changeset
|
39 |
0
77de5fc623f9
planemo upload for repository https://bitbucket.org/drosofff/gedtools/
mvdbeek
parents:
diff
changeset
|
40 if alignment_file: |
77de5fc623f9
planemo upload for repository https://bitbucket.org/drosofff/gedtools/
mvdbeek
parents:
diff
changeset
|
41 self.pysam_alignment = pysam.Samfile(alignment_file) |
2
2974c382105c
planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/mismatch_frequencies commit 10a7e3877c2568d9c23de53fc97dc1c902ff0524-dirty
mvdbeek
parents:
0
diff
changeset
|
42 self.references = self.pysam_alignment.references # names of fasta reference sequences |
2974c382105c
planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/mismatch_frequencies commit 10a7e3877c2568d9c23de53fc97dc1c902ff0524-dirty
mvdbeek
parents:
0
diff
changeset
|
43 result_dict[name] = self.get_mismatches( |
2974c382105c
planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/mismatch_frequencies commit 10a7e3877c2568d9c23de53fc97dc1c902ff0524-dirty
mvdbeek
parents:
0
diff
changeset
|
44 self.pysam_alignment, |
2974c382105c
planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/mismatch_frequencies commit 10a7e3877c2568d9c23de53fc97dc1c902ff0524-dirty
mvdbeek
parents:
0
diff
changeset
|
45 minimal_readlength, |
0
77de5fc623f9
planemo upload for repository https://bitbucket.org/drosofff/gedtools/
mvdbeek
parents:
diff
changeset
|
46 maximal_readlength, |
77de5fc623f9
planemo upload for repository https://bitbucket.org/drosofff/gedtools/
mvdbeek
parents:
diff
changeset
|
47 possible_mismatches |
77de5fc623f9
planemo upload for repository https://bitbucket.org/drosofff/gedtools/
mvdbeek
parents:
diff
changeset
|
48 ) |
2
2974c382105c
planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/mismatch_frequencies commit 10a7e3877c2568d9c23de53fc97dc1c902ff0524-dirty
mvdbeek
parents:
0
diff
changeset
|
49 |
2974c382105c
planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/mismatch_frequencies commit 10a7e3877c2568d9c23de53fc97dc1c902ff0524-dirty
mvdbeek
parents:
0
diff
changeset
|
50 def get_mismatches(self, pysam_alignment, minimal_readlength, |
0
77de5fc623f9
planemo upload for repository https://bitbucket.org/drosofff/gedtools/
mvdbeek
parents:
diff
changeset
|
51 maximal_readlength, possible_mismatches): |
77de5fc623f9
planemo upload for repository https://bitbucket.org/drosofff/gedtools/
mvdbeek
parents:
diff
changeset
|
52 rec_dd = lambda: defaultdict(rec_dd) |
77de5fc623f9
planemo upload for repository https://bitbucket.org/drosofff/gedtools/
mvdbeek
parents:
diff
changeset
|
53 len_dict = rec_dd() |
77de5fc623f9
planemo upload for repository https://bitbucket.org/drosofff/gedtools/
mvdbeek
parents:
diff
changeset
|
54 for alignedread in pysam_alignment: |
77de5fc623f9
planemo upload for repository https://bitbucket.org/drosofff/gedtools/
mvdbeek
parents:
diff
changeset
|
55 if self.read_is_valid(alignedread, minimal_readlength, maximal_readlength): |
77de5fc623f9
planemo upload for repository https://bitbucket.org/drosofff/gedtools/
mvdbeek
parents:
diff
changeset
|
56 chromosome = pysam_alignment.getrname(alignedread.rname) |
77de5fc623f9
planemo upload for repository https://bitbucket.org/drosofff/gedtools/
mvdbeek
parents:
diff
changeset
|
57 try: |
77de5fc623f9
planemo upload for repository https://bitbucket.org/drosofff/gedtools/
mvdbeek
parents:
diff
changeset
|
58 len_dict[int(alignedread.rlen)][chromosome]['total valid reads'] += 1 |
77de5fc623f9
planemo upload for repository https://bitbucket.org/drosofff/gedtools/
mvdbeek
parents:
diff
changeset
|
59 except TypeError: |
77de5fc623f9
planemo upload for repository https://bitbucket.org/drosofff/gedtools/
mvdbeek
parents:
diff
changeset
|
60 len_dict[int(alignedread.rlen)][chromosome]['total valid reads'] = 1 |
77de5fc623f9
planemo upload for repository https://bitbucket.org/drosofff/gedtools/
mvdbeek
parents:
diff
changeset
|
61 MD = alignedread.opt('MD') |
77de5fc623f9
planemo upload for repository https://bitbucket.org/drosofff/gedtools/
mvdbeek
parents:
diff
changeset
|
62 if self.read_has_mismatch(alignedread, self.number_of_allowed_mismatches): |
2
2974c382105c
planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/mismatch_frequencies commit 10a7e3877c2568d9c23de53fc97dc1c902ff0524-dirty
mvdbeek
parents:
0
diff
changeset
|
63 (ref_base, mismatch_base) = self.read_to_reference_mismatch(MD, alignedread.seq, alignedread.is_reverse) |
2974c382105c
planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/mismatch_frequencies commit 10a7e3877c2568d9c23de53fc97dc1c902ff0524-dirty
mvdbeek
parents:
0
diff
changeset
|
64 if not ref_base: |
0
77de5fc623f9
planemo upload for repository https://bitbucket.org/drosofff/gedtools/
mvdbeek
parents:
diff
changeset
|
65 continue |
77de5fc623f9
planemo upload for repository https://bitbucket.org/drosofff/gedtools/
mvdbeek
parents:
diff
changeset
|
66 else: |
77de5fc623f9
planemo upload for repository https://bitbucket.org/drosofff/gedtools/
mvdbeek
parents:
diff
changeset
|
67 for i, base in enumerate(ref_base): |
77de5fc623f9
planemo upload for repository https://bitbucket.org/drosofff/gedtools/
mvdbeek
parents:
diff
changeset
|
68 if not ref_base[i]+mismatch_base[i] in possible_mismatches: |
77de5fc623f9
planemo upload for repository https://bitbucket.org/drosofff/gedtools/
mvdbeek
parents:
diff
changeset
|
69 continue |
77de5fc623f9
planemo upload for repository https://bitbucket.org/drosofff/gedtools/
mvdbeek
parents:
diff
changeset
|
70 try: |
77de5fc623f9
planemo upload for repository https://bitbucket.org/drosofff/gedtools/
mvdbeek
parents:
diff
changeset
|
71 len_dict[int(alignedread.rlen)][chromosome][ref_base[i]+mismatch_base[i]] += 1 |
77de5fc623f9
planemo upload for repository https://bitbucket.org/drosofff/gedtools/
mvdbeek
parents:
diff
changeset
|
72 except TypeError: |
77de5fc623f9
planemo upload for repository https://bitbucket.org/drosofff/gedtools/
mvdbeek
parents:
diff
changeset
|
73 len_dict[int(alignedread.rlen)][chromosome][ref_base[i]+mismatch_base[i]] = 1 |
77de5fc623f9
planemo upload for repository https://bitbucket.org/drosofff/gedtools/
mvdbeek
parents:
diff
changeset
|
74 return len_dict |
2
2974c382105c
planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/mismatch_frequencies commit 10a7e3877c2568d9c23de53fc97dc1c902ff0524-dirty
mvdbeek
parents:
0
diff
changeset
|
75 |
0
77de5fc623f9
planemo upload for repository https://bitbucket.org/drosofff/gedtools/
mvdbeek
parents:
diff
changeset
|
76 def read_is_valid(self, read, min_readlength, max_readlength): |
77de5fc623f9
planemo upload for repository https://bitbucket.org/drosofff/gedtools/
mvdbeek
parents:
diff
changeset
|
77 '''Filter out reads that are unmatched, too short or |
77de5fc623f9
planemo upload for repository https://bitbucket.org/drosofff/gedtools/
mvdbeek
parents:
diff
changeset
|
78 too long or that contian insertions''' |
77de5fc623f9
planemo upload for repository https://bitbucket.org/drosofff/gedtools/
mvdbeek
parents:
diff
changeset
|
79 if read.is_unmapped: |
77de5fc623f9
planemo upload for repository https://bitbucket.org/drosofff/gedtools/
mvdbeek
parents:
diff
changeset
|
80 return False |
77de5fc623f9
planemo upload for repository https://bitbucket.org/drosofff/gedtools/
mvdbeek
parents:
diff
changeset
|
81 if read.rlen < min_readlength: |
77de5fc623f9
planemo upload for repository https://bitbucket.org/drosofff/gedtools/
mvdbeek
parents:
diff
changeset
|
82 return False |
77de5fc623f9
planemo upload for repository https://bitbucket.org/drosofff/gedtools/
mvdbeek
parents:
diff
changeset
|
83 if read.rlen > max_readlength: |
77de5fc623f9
planemo upload for repository https://bitbucket.org/drosofff/gedtools/
mvdbeek
parents:
diff
changeset
|
84 return False |
77de5fc623f9
planemo upload for repository https://bitbucket.org/drosofff/gedtools/
mvdbeek
parents:
diff
changeset
|
85 else: |
77de5fc623f9
planemo upload for repository https://bitbucket.org/drosofff/gedtools/
mvdbeek
parents:
diff
changeset
|
86 return True |
2
2974c382105c
planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/mismatch_frequencies commit 10a7e3877c2568d9c23de53fc97dc1c902ff0524-dirty
mvdbeek
parents:
0
diff
changeset
|
87 |
0
77de5fc623f9
planemo upload for repository https://bitbucket.org/drosofff/gedtools/
mvdbeek
parents:
diff
changeset
|
88 def read_has_mismatch(self, read, number_of_allowed_mismatches=1): |
77de5fc623f9
planemo upload for repository https://bitbucket.org/drosofff/gedtools/
mvdbeek
parents:
diff
changeset
|
89 '''keep only reads with one mismatch. Could be simplified''' |
2
2974c382105c
planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/mismatch_frequencies commit 10a7e3877c2568d9c23de53fc97dc1c902ff0524-dirty
mvdbeek
parents:
0
diff
changeset
|
90 NM = read.opt('NM') |
2974c382105c
planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/mismatch_frequencies commit 10a7e3877c2568d9c23de53fc97dc1c902ff0524-dirty
mvdbeek
parents:
0
diff
changeset
|
91 if NM < 1: # filter out reads with no mismatch |
0
77de5fc623f9
planemo upload for repository https://bitbucket.org/drosofff/gedtools/
mvdbeek
parents:
diff
changeset
|
92 return False |
2
2974c382105c
planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/mismatch_frequencies commit 10a7e3877c2568d9c23de53fc97dc1c902ff0524-dirty
mvdbeek
parents:
0
diff
changeset
|
93 if NM > number_of_allowed_mismatches: # filter out reads with more than 1 mismtach |
0
77de5fc623f9
planemo upload for repository https://bitbucket.org/drosofff/gedtools/
mvdbeek
parents:
diff
changeset
|
94 return False |
77de5fc623f9
planemo upload for repository https://bitbucket.org/drosofff/gedtools/
mvdbeek
parents:
diff
changeset
|
95 else: |
77de5fc623f9
planemo upload for repository https://bitbucket.org/drosofff/gedtools/
mvdbeek
parents:
diff
changeset
|
96 return True |
2
2974c382105c
planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/mismatch_frequencies commit 10a7e3877c2568d9c23de53fc97dc1c902ff0524-dirty
mvdbeek
parents:
0
diff
changeset
|
97 |
0
77de5fc623f9
planemo upload for repository https://bitbucket.org/drosofff/gedtools/
mvdbeek
parents:
diff
changeset
|
98 def mismatch_in_allowed_region(self, readseq, mismatch_position): |
77de5fc623f9
planemo upload for repository https://bitbucket.org/drosofff/gedtools/
mvdbeek
parents:
diff
changeset
|
99 ''' |
77de5fc623f9
planemo upload for repository https://bitbucket.org/drosofff/gedtools/
mvdbeek
parents:
diff
changeset
|
100 >>> M = MismatchFrequencies() |
77de5fc623f9
planemo upload for repository https://bitbucket.org/drosofff/gedtools/
mvdbeek
parents:
diff
changeset
|
101 >>> readseq = 'AAAAAA' |
77de5fc623f9
planemo upload for repository https://bitbucket.org/drosofff/gedtools/
mvdbeek
parents:
diff
changeset
|
102 >>> mismatch_position = 2 |
77de5fc623f9
planemo upload for repository https://bitbucket.org/drosofff/gedtools/
mvdbeek
parents:
diff
changeset
|
103 >>> M.mismatch_in_allowed_region(readseq, mismatch_position) |
77de5fc623f9
planemo upload for repository https://bitbucket.org/drosofff/gedtools/
mvdbeek
parents:
diff
changeset
|
104 True |
77de5fc623f9
planemo upload for repository https://bitbucket.org/drosofff/gedtools/
mvdbeek
parents:
diff
changeset
|
105 >>> M = MismatchFrequencies(ignore_3p_nucleotides=2, ignore_5p_nucleotides=2) |
77de5fc623f9
planemo upload for repository https://bitbucket.org/drosofff/gedtools/
mvdbeek
parents:
diff
changeset
|
106 >>> readseq = 'AAAAAA' |
77de5fc623f9
planemo upload for repository https://bitbucket.org/drosofff/gedtools/
mvdbeek
parents:
diff
changeset
|
107 >>> mismatch_position = 1 |
77de5fc623f9
planemo upload for repository https://bitbucket.org/drosofff/gedtools/
mvdbeek
parents:
diff
changeset
|
108 >>> M.mismatch_in_allowed_region(readseq, mismatch_position) |
77de5fc623f9
planemo upload for repository https://bitbucket.org/drosofff/gedtools/
mvdbeek
parents:
diff
changeset
|
109 False |
77de5fc623f9
planemo upload for repository https://bitbucket.org/drosofff/gedtools/
mvdbeek
parents:
diff
changeset
|
110 >>> readseq = 'AAAAAA' |
77de5fc623f9
planemo upload for repository https://bitbucket.org/drosofff/gedtools/
mvdbeek
parents:
diff
changeset
|
111 >>> mismatch_position = 4 |
77de5fc623f9
planemo upload for repository https://bitbucket.org/drosofff/gedtools/
mvdbeek
parents:
diff
changeset
|
112 >>> M.mismatch_in_allowed_region(readseq, mismatch_position) |
77de5fc623f9
planemo upload for repository https://bitbucket.org/drosofff/gedtools/
mvdbeek
parents:
diff
changeset
|
113 False |
77de5fc623f9
planemo upload for repository https://bitbucket.org/drosofff/gedtools/
mvdbeek
parents:
diff
changeset
|
114 ''' |
2
2974c382105c
planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/mismatch_frequencies commit 10a7e3877c2568d9c23de53fc97dc1c902ff0524-dirty
mvdbeek
parents:
0
diff
changeset
|
115 mismatch_position += 1 # To compensate for starting the count at 0 |
0
77de5fc623f9
planemo upload for repository https://bitbucket.org/drosofff/gedtools/
mvdbeek
parents:
diff
changeset
|
116 five_p = self.ignore_5p_nucleotides |
77de5fc623f9
planemo upload for repository https://bitbucket.org/drosofff/gedtools/
mvdbeek
parents:
diff
changeset
|
117 three_p = self.ignore_3p_nucleotides |
77de5fc623f9
planemo upload for repository https://bitbucket.org/drosofff/gedtools/
mvdbeek
parents:
diff
changeset
|
118 if any([five_p > 0, three_p > 0]): |
2
2974c382105c
planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/mismatch_frequencies commit 10a7e3877c2568d9c23de53fc97dc1c902ff0524-dirty
mvdbeek
parents:
0
diff
changeset
|
119 if any([mismatch_position <= five_p, |
2974c382105c
planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/mismatch_frequencies commit 10a7e3877c2568d9c23de53fc97dc1c902ff0524-dirty
mvdbeek
parents:
0
diff
changeset
|
120 mismatch_position >= (len(readseq) + 1 - three_p)]): # Again compensate for starting the count at 0 |
0
77de5fc623f9
planemo upload for repository https://bitbucket.org/drosofff/gedtools/
mvdbeek
parents:
diff
changeset
|
121 return False |
77de5fc623f9
planemo upload for repository https://bitbucket.org/drosofff/gedtools/
mvdbeek
parents:
diff
changeset
|
122 else: |
77de5fc623f9
planemo upload for repository https://bitbucket.org/drosofff/gedtools/
mvdbeek
parents:
diff
changeset
|
123 return True |
77de5fc623f9
planemo upload for repository https://bitbucket.org/drosofff/gedtools/
mvdbeek
parents:
diff
changeset
|
124 else: |
77de5fc623f9
planemo upload for repository https://bitbucket.org/drosofff/gedtools/
mvdbeek
parents:
diff
changeset
|
125 return True |
2
2974c382105c
planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/mismatch_frequencies commit 10a7e3877c2568d9c23de53fc97dc1c902ff0524-dirty
mvdbeek
parents:
0
diff
changeset
|
126 |
0
77de5fc623f9
planemo upload for repository https://bitbucket.org/drosofff/gedtools/
mvdbeek
parents:
diff
changeset
|
127 def read_to_reference_mismatch(self, MD, readseq, is_reverse): |
77de5fc623f9
planemo upload for repository https://bitbucket.org/drosofff/gedtools/
mvdbeek
parents:
diff
changeset
|
128 ''' |
77de5fc623f9
planemo upload for repository https://bitbucket.org/drosofff/gedtools/
mvdbeek
parents:
diff
changeset
|
129 This is where the magic happens. The MD tag contains SNP and indel information, |
77de5fc623f9
planemo upload for repository https://bitbucket.org/drosofff/gedtools/
mvdbeek
parents:
diff
changeset
|
130 without looking to the genome sequence. This is a typical MD tag: 3C0G2A6. |
77de5fc623f9
planemo upload for repository https://bitbucket.org/drosofff/gedtools/
mvdbeek
parents:
diff
changeset
|
131 3 bases of the read align to the reference, followed by a mismatch, where the |
2
2974c382105c
planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/mismatch_frequencies commit 10a7e3877c2568d9c23de53fc97dc1c902ff0524-dirty
mvdbeek
parents:
0
diff
changeset
|
132 reference base is C, followed by 10 bases aligned to the reference. |
0
77de5fc623f9
planemo upload for repository https://bitbucket.org/drosofff/gedtools/
mvdbeek
parents:
diff
changeset
|
133 suppose a reference 'CTTCGATAATCCTT' |
77de5fc623f9
planemo upload for repository https://bitbucket.org/drosofff/gedtools/
mvdbeek
parents:
diff
changeset
|
134 ||| || |||||| |
2
2974c382105c
planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/mismatch_frequencies commit 10a7e3877c2568d9c23de53fc97dc1c902ff0524-dirty
mvdbeek
parents:
0
diff
changeset
|
135 and a read 'CTTATATTATCCTT'. |
2974c382105c
planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/mismatch_frequencies commit 10a7e3877c2568d9c23de53fc97dc1c902ff0524-dirty
mvdbeek
parents:
0
diff
changeset
|
136 This situation is represented by the above MD tag. |
2974c382105c
planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/mismatch_frequencies commit 10a7e3877c2568d9c23de53fc97dc1c902ff0524-dirty
mvdbeek
parents:
0
diff
changeset
|
137 Given MD tag and read sequence this function returns the reference base C, G and A, |
0
77de5fc623f9
planemo upload for repository https://bitbucket.org/drosofff/gedtools/
mvdbeek
parents:
diff
changeset
|
138 and the mismatched base A, T, T. |
77de5fc623f9
planemo upload for repository https://bitbucket.org/drosofff/gedtools/
mvdbeek
parents:
diff
changeset
|
139 >>> M = MismatchFrequencies() |
77de5fc623f9
planemo upload for repository https://bitbucket.org/drosofff/gedtools/
mvdbeek
parents:
diff
changeset
|
140 >>> MD='3C0G2A7' |
77de5fc623f9
planemo upload for repository https://bitbucket.org/drosofff/gedtools/
mvdbeek
parents:
diff
changeset
|
141 >>> seq='CTTATATTATCCTT' |
77de5fc623f9
planemo upload for repository https://bitbucket.org/drosofff/gedtools/
mvdbeek
parents:
diff
changeset
|
142 >>> result=M.read_to_reference_mismatch(MD, seq, is_reverse=False) |
77de5fc623f9
planemo upload for repository https://bitbucket.org/drosofff/gedtools/
mvdbeek
parents:
diff
changeset
|
143 >>> result[0]=="CGA" |
77de5fc623f9
planemo upload for repository https://bitbucket.org/drosofff/gedtools/
mvdbeek
parents:
diff
changeset
|
144 True |
77de5fc623f9
planemo upload for repository https://bitbucket.org/drosofff/gedtools/
mvdbeek
parents:
diff
changeset
|
145 >>> result[1]=="ATT" |
77de5fc623f9
planemo upload for repository https://bitbucket.org/drosofff/gedtools/
mvdbeek
parents:
diff
changeset
|
146 True |
2
2974c382105c
planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/mismatch_frequencies commit 10a7e3877c2568d9c23de53fc97dc1c902ff0524-dirty
mvdbeek
parents:
0
diff
changeset
|
147 >>> |
0
77de5fc623f9
planemo upload for repository https://bitbucket.org/drosofff/gedtools/
mvdbeek
parents:
diff
changeset
|
148 ''' |
2
2974c382105c
planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/mismatch_frequencies commit 10a7e3877c2568d9c23de53fc97dc1c902ff0524-dirty
mvdbeek
parents:
0
diff
changeset
|
149 search = re.finditer('[ATGC]', MD) |
0
77de5fc623f9
planemo upload for repository https://bitbucket.org/drosofff/gedtools/
mvdbeek
parents:
diff
changeset
|
150 if '^' in MD: |
77de5fc623f9
planemo upload for repository https://bitbucket.org/drosofff/gedtools/
mvdbeek
parents:
diff
changeset
|
151 print 'WARNING insertion detected, mismatch calling skipped for this read!!!' |
77de5fc623f9
planemo upload for repository https://bitbucket.org/drosofff/gedtools/
mvdbeek
parents:
diff
changeset
|
152 return (None, None) |
2
2974c382105c
planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/mismatch_frequencies commit 10a7e3877c2568d9c23de53fc97dc1c902ff0524-dirty
mvdbeek
parents:
0
diff
changeset
|
153 start_index = 0 # refers to the leading integer of the MD string before an edited base |
2974c382105c
planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/mismatch_frequencies commit 10a7e3877c2568d9c23de53fc97dc1c902ff0524-dirty
mvdbeek
parents:
0
diff
changeset
|
154 current_position = 0 # position of the mismatched nucleotide in the MD tag string |
2974c382105c
planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/mismatch_frequencies commit 10a7e3877c2568d9c23de53fc97dc1c902ff0524-dirty
mvdbeek
parents:
0
diff
changeset
|
155 mismatch_position = 0 # position of edited base in current read |
2974c382105c
planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/mismatch_frequencies commit 10a7e3877c2568d9c23de53fc97dc1c902ff0524-dirty
mvdbeek
parents:
0
diff
changeset
|
156 reference_base = "" |
2974c382105c
planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/mismatch_frequencies commit 10a7e3877c2568d9c23de53fc97dc1c902ff0524-dirty
mvdbeek
parents:
0
diff
changeset
|
157 mismatched_base = "" |
0
77de5fc623f9
planemo upload for repository https://bitbucket.org/drosofff/gedtools/
mvdbeek
parents:
diff
changeset
|
158 for result in search: |
2
2974c382105c
planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/mismatch_frequencies commit 10a7e3877c2568d9c23de53fc97dc1c902ff0524-dirty
mvdbeek
parents:
0
diff
changeset
|
159 current_position = result.start() |
2974c382105c
planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/mismatch_frequencies commit 10a7e3877c2568d9c23de53fc97dc1c902ff0524-dirty
mvdbeek
parents:
0
diff
changeset
|
160 mismatch_position = mismatch_position + 1 + int(MD[start_index:current_position]) # converts the leading characters before an edited base into integers |
2974c382105c
planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/mismatch_frequencies commit 10a7e3877c2568d9c23de53fc97dc1c902ff0524-dirty
mvdbeek
parents:
0
diff
changeset
|
161 start_index = result.end() |
2974c382105c
planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/mismatch_frequencies commit 10a7e3877c2568d9c23de53fc97dc1c902ff0524-dirty
mvdbeek
parents:
0
diff
changeset
|
162 reference_base += MD[result.end() - 1] |
2974c382105c
planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/mismatch_frequencies commit 10a7e3877c2568d9c23de53fc97dc1c902ff0524-dirty
mvdbeek
parents:
0
diff
changeset
|
163 mismatched_base += readseq[mismatch_position - 1] |
0
77de5fc623f9
planemo upload for repository https://bitbucket.org/drosofff/gedtools/
mvdbeek
parents:
diff
changeset
|
164 if is_reverse: |
2
2974c382105c
planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/mismatch_frequencies commit 10a7e3877c2568d9c23de53fc97dc1c902ff0524-dirty
mvdbeek
parents:
0
diff
changeset
|
165 reference_base = reverseComplement(reference_base) |
2974c382105c
planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/mismatch_frequencies commit 10a7e3877c2568d9c23de53fc97dc1c902ff0524-dirty
mvdbeek
parents:
0
diff
changeset
|
166 mismatched_base = reverseComplement(mismatched_base) |
2974c382105c
planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/mismatch_frequencies commit 10a7e3877c2568d9c23de53fc97dc1c902ff0524-dirty
mvdbeek
parents:
0
diff
changeset
|
167 mismatch_position = len(readseq)-mismatch_position-1 |
2974c382105c
planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/mismatch_frequencies commit 10a7e3877c2568d9c23de53fc97dc1c902ff0524-dirty
mvdbeek
parents:
0
diff
changeset
|
168 if mismatched_base == 'N': |
0
77de5fc623f9
planemo upload for repository https://bitbucket.org/drosofff/gedtools/
mvdbeek
parents:
diff
changeset
|
169 return (None, None) |
77de5fc623f9
planemo upload for repository https://bitbucket.org/drosofff/gedtools/
mvdbeek
parents:
diff
changeset
|
170 if self.mismatch_in_allowed_region(readseq, mismatch_position): |
77de5fc623f9
planemo upload for repository https://bitbucket.org/drosofff/gedtools/
mvdbeek
parents:
diff
changeset
|
171 return (reference_base, mismatched_base) |
77de5fc623f9
planemo upload for repository https://bitbucket.org/drosofff/gedtools/
mvdbeek
parents:
diff
changeset
|
172 else: |
77de5fc623f9
planemo upload for repository https://bitbucket.org/drosofff/gedtools/
mvdbeek
parents:
diff
changeset
|
173 return (None, None) |
77de5fc623f9
planemo upload for repository https://bitbucket.org/drosofff/gedtools/
mvdbeek
parents:
diff
changeset
|
174 |
2
2974c382105c
planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/mismatch_frequencies commit 10a7e3877c2568d9c23de53fc97dc1c902ff0524-dirty
mvdbeek
parents:
0
diff
changeset
|
175 |
0
77de5fc623f9
planemo upload for repository https://bitbucket.org/drosofff/gedtools/
mvdbeek
parents:
diff
changeset
|
176 def reverseComplement(sequence): |
77de5fc623f9
planemo upload for repository https://bitbucket.org/drosofff/gedtools/
mvdbeek
parents:
diff
changeset
|
177 '''do a reverse complement of DNA base. |
77de5fc623f9
planemo upload for repository https://bitbucket.org/drosofff/gedtools/
mvdbeek
parents:
diff
changeset
|
178 >>> reverseComplement('ATGC')=='GCAT' |
77de5fc623f9
planemo upload for repository https://bitbucket.org/drosofff/gedtools/
mvdbeek
parents:
diff
changeset
|
179 True |
2
2974c382105c
planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/mismatch_frequencies commit 10a7e3877c2568d9c23de53fc97dc1c902ff0524-dirty
mvdbeek
parents:
0
diff
changeset
|
180 >>> |
0
77de5fc623f9
planemo upload for repository https://bitbucket.org/drosofff/gedtools/
mvdbeek
parents:
diff
changeset
|
181 ''' |
2
2974c382105c
planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/mismatch_frequencies commit 10a7e3877c2568d9c23de53fc97dc1c902ff0524-dirty
mvdbeek
parents:
0
diff
changeset
|
182 sequence = sequence.upper() |
0
77de5fc623f9
planemo upload for repository https://bitbucket.org/drosofff/gedtools/
mvdbeek
parents:
diff
changeset
|
183 complement = string.maketrans('ATCGN', 'TAGCN') |
77de5fc623f9
planemo upload for repository https://bitbucket.org/drosofff/gedtools/
mvdbeek
parents:
diff
changeset
|
184 return sequence.upper().translate(complement)[::-1] |
77de5fc623f9
planemo upload for repository https://bitbucket.org/drosofff/gedtools/
mvdbeek
parents:
diff
changeset
|
185 |
2
2974c382105c
planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/mismatch_frequencies commit 10a7e3877c2568d9c23de53fc97dc1c902ff0524-dirty
mvdbeek
parents:
0
diff
changeset
|
186 |
0
77de5fc623f9
planemo upload for repository https://bitbucket.org/drosofff/gedtools/
mvdbeek
parents:
diff
changeset
|
187 def barplot(df, library, axes): |
2
2974c382105c
planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/mismatch_frequencies commit 10a7e3877c2568d9c23de53fc97dc1c902ff0524-dirty
mvdbeek
parents:
0
diff
changeset
|
188 df.plot(kind='bar', ax=axes, subplots=False, |
2974c382105c
planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/mismatch_frequencies commit 10a7e3877c2568d9c23de53fc97dc1c902ff0524-dirty
mvdbeek
parents:
0
diff
changeset
|
189 stacked=False, legend='test', |
0
77de5fc623f9
planemo upload for repository https://bitbucket.org/drosofff/gedtools/
mvdbeek
parents:
diff
changeset
|
190 title='Mismatch frequencies for {0}'.format(library)) |
2
2974c382105c
planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/mismatch_frequencies commit 10a7e3877c2568d9c23de53fc97dc1c902ff0524-dirty
mvdbeek
parents:
0
diff
changeset
|
191 |
2974c382105c
planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/mismatch_frequencies commit 10a7e3877c2568d9c23de53fc97dc1c902ff0524-dirty
mvdbeek
parents:
0
diff
changeset
|
192 |
0
77de5fc623f9
planemo upload for repository https://bitbucket.org/drosofff/gedtools/
mvdbeek
parents:
diff
changeset
|
193 def df_to_tab(df, output): |
77de5fc623f9
planemo upload for repository https://bitbucket.org/drosofff/gedtools/
mvdbeek
parents:
diff
changeset
|
194 df.to_csv(output, sep='\t') |
77de5fc623f9
planemo upload for repository https://bitbucket.org/drosofff/gedtools/
mvdbeek
parents:
diff
changeset
|
195 |
2
2974c382105c
planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/mismatch_frequencies commit 10a7e3877c2568d9c23de53fc97dc1c902ff0524-dirty
mvdbeek
parents:
0
diff
changeset
|
196 |
0
77de5fc623f9
planemo upload for repository https://bitbucket.org/drosofff/gedtools/
mvdbeek
parents:
diff
changeset
|
197 def reduce_result(df, possible_mismatches): |
77de5fc623f9
planemo upload for repository https://bitbucket.org/drosofff/gedtools/
mvdbeek
parents:
diff
changeset
|
198 '''takes a pandas dataframe with full mismatch details and |
77de5fc623f9
planemo upload for repository https://bitbucket.org/drosofff/gedtools/
mvdbeek
parents:
diff
changeset
|
199 summarises the results for plotting.''' |
77de5fc623f9
planemo upload for repository https://bitbucket.org/drosofff/gedtools/
mvdbeek
parents:
diff
changeset
|
200 alignments = df['Alignment_file'].unique() |
77de5fc623f9
planemo upload for repository https://bitbucket.org/drosofff/gedtools/
mvdbeek
parents:
diff
changeset
|
201 readlengths = df['Readlength'].unique() |
2
2974c382105c
planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/mismatch_frequencies commit 10a7e3877c2568d9c23de53fc97dc1c902ff0524-dirty
mvdbeek
parents:
0
diff
changeset
|
202 combinations = itertools.product(*[alignments, readlengths]) # generate all possible combinations of readlength and alignment files |
0
77de5fc623f9
planemo upload for repository https://bitbucket.org/drosofff/gedtools/
mvdbeek
parents:
diff
changeset
|
203 reduced_dict = {} |
2
2974c382105c
planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/mismatch_frequencies commit 10a7e3877c2568d9c23de53fc97dc1c902ff0524-dirty
mvdbeek
parents:
0
diff
changeset
|
204 last_column = 3 + len(possible_mismatches) |
0
77de5fc623f9
planemo upload for repository https://bitbucket.org/drosofff/gedtools/
mvdbeek
parents:
diff
changeset
|
205 for combination in combinations: |
77de5fc623f9
planemo upload for repository https://bitbucket.org/drosofff/gedtools/
mvdbeek
parents:
diff
changeset
|
206 library_subset = df[df['Alignment_file'] == combination[0]] |
77de5fc623f9
planemo upload for repository https://bitbucket.org/drosofff/gedtools/
mvdbeek
parents:
diff
changeset
|
207 library_readlength_subset = library_subset[library_subset['Readlength'] == combination[1]] |
2
2974c382105c
planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/mismatch_frequencies commit 10a7e3877c2568d9c23de53fc97dc1c902ff0524-dirty
mvdbeek
parents:
0
diff
changeset
|
208 sum_of_library_and_readlength = library_readlength_subset.iloc[:, 3:last_column+1].sum() |
2974c382105c
planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/mismatch_frequencies commit 10a7e3877c2568d9c23de53fc97dc1c902ff0524-dirty
mvdbeek
parents:
0
diff
changeset
|
209 if combination[0] not in reduced_dict: |
0
77de5fc623f9
planemo upload for repository https://bitbucket.org/drosofff/gedtools/
mvdbeek
parents:
diff
changeset
|
210 reduced_dict[combination[0]] = {} |
77de5fc623f9
planemo upload for repository https://bitbucket.org/drosofff/gedtools/
mvdbeek
parents:
diff
changeset
|
211 reduced_dict[combination[0]][combination[1]] = sum_of_library_and_readlength.to_dict() |
77de5fc623f9
planemo upload for repository https://bitbucket.org/drosofff/gedtools/
mvdbeek
parents:
diff
changeset
|
212 return reduced_dict |
77de5fc623f9
planemo upload for repository https://bitbucket.org/drosofff/gedtools/
mvdbeek
parents:
diff
changeset
|
213 |
2
2974c382105c
planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/mismatch_frequencies commit 10a7e3877c2568d9c23de53fc97dc1c902ff0524-dirty
mvdbeek
parents:
0
diff
changeset
|
214 |
0
77de5fc623f9
planemo upload for repository https://bitbucket.org/drosofff/gedtools/
mvdbeek
parents:
diff
changeset
|
215 def plot_result(reduced_dict, args): |
2
2974c382105c
planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/mismatch_frequencies commit 10a7e3877c2568d9c23de53fc97dc1c902ff0524-dirty
mvdbeek
parents:
0
diff
changeset
|
216 names = reduced_dict.keys() |
2974c382105c
planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/mismatch_frequencies commit 10a7e3877c2568d9c23de53fc97dc1c902ff0524-dirty
mvdbeek
parents:
0
diff
changeset
|
217 nrows = len(names) / 2 + 1 |
2974c382105c
planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/mismatch_frequencies commit 10a7e3877c2568d9c23de53fc97dc1c902ff0524-dirty
mvdbeek
parents:
0
diff
changeset
|
218 fig = plt.figure(figsize=(16, 32)) |
2974c382105c
planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/mismatch_frequencies commit 10a7e3877c2568d9c23de53fc97dc1c902ff0524-dirty
mvdbeek
parents:
0
diff
changeset
|
219 for i, library in enumerate(names): |
2974c382105c
planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/mismatch_frequencies commit 10a7e3877c2568d9c23de53fc97dc1c902ff0524-dirty
mvdbeek
parents:
0
diff
changeset
|
220 axes = fig.add_subplot(nrows, 2, i+1) |
2974c382105c
planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/mismatch_frequencies commit 10a7e3877c2568d9c23de53fc97dc1c902ff0524-dirty
mvdbeek
parents:
0
diff
changeset
|
221 library_dict = reduced_dict[library] |
2974c382105c
planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/mismatch_frequencies commit 10a7e3877c2568d9c23de53fc97dc1c902ff0524-dirty
mvdbeek
parents:
0
diff
changeset
|
222 df = pd.DataFrame(library_dict) |
0
77de5fc623f9
planemo upload for repository https://bitbucket.org/drosofff/gedtools/
mvdbeek
parents:
diff
changeset
|
223 df.drop(['total aligned reads'], inplace=True) |
77de5fc623f9
planemo upload for repository https://bitbucket.org/drosofff/gedtools/
mvdbeek
parents:
diff
changeset
|
224 barplot(df, library, axes), |
77de5fc623f9
planemo upload for repository https://bitbucket.org/drosofff/gedtools/
mvdbeek
parents:
diff
changeset
|
225 axes.set_ylabel('Mismatch count / all valid reads * readlength') |
2
2974c382105c
planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/mismatch_frequencies commit 10a7e3877c2568d9c23de53fc97dc1c902ff0524-dirty
mvdbeek
parents:
0
diff
changeset
|
226 fig.savefig(args.output_pdf, format='pdf') |
2974c382105c
planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/mismatch_frequencies commit 10a7e3877c2568d9c23de53fc97dc1c902ff0524-dirty
mvdbeek
parents:
0
diff
changeset
|
227 |
0
77de5fc623f9
planemo upload for repository https://bitbucket.org/drosofff/gedtools/
mvdbeek
parents:
diff
changeset
|
228 |
77de5fc623f9
planemo upload for repository https://bitbucket.org/drosofff/gedtools/
mvdbeek
parents:
diff
changeset
|
229 def format_result_dict(result_dict, chromosomes, possible_mismatches): |
77de5fc623f9
planemo upload for repository https://bitbucket.org/drosofff/gedtools/
mvdbeek
parents:
diff
changeset
|
230 '''Turn nested dictionary into preformatted tab seperated lines''' |
77de5fc623f9
planemo upload for repository https://bitbucket.org/drosofff/gedtools/
mvdbeek
parents:
diff
changeset
|
231 header = "Reference sequence\tAlignment_file\tReadlength\t" + "\t".join( |
77de5fc623f9
planemo upload for repository https://bitbucket.org/drosofff/gedtools/
mvdbeek
parents:
diff
changeset
|
232 possible_mismatches) + "\ttotal aligned reads" |
77de5fc623f9
planemo upload for repository https://bitbucket.org/drosofff/gedtools/
mvdbeek
parents:
diff
changeset
|
233 libraries = result_dict.keys() |
77de5fc623f9
planemo upload for repository https://bitbucket.org/drosofff/gedtools/
mvdbeek
parents:
diff
changeset
|
234 readlengths = result_dict[libraries[0]].keys() |
77de5fc623f9
planemo upload for repository https://bitbucket.org/drosofff/gedtools/
mvdbeek
parents:
diff
changeset
|
235 result = [] |
77de5fc623f9
planemo upload for repository https://bitbucket.org/drosofff/gedtools/
mvdbeek
parents:
diff
changeset
|
236 for chromosome in chromosomes: |
77de5fc623f9
planemo upload for repository https://bitbucket.org/drosofff/gedtools/
mvdbeek
parents:
diff
changeset
|
237 for library in libraries: |
77de5fc623f9
planemo upload for repository https://bitbucket.org/drosofff/gedtools/
mvdbeek
parents:
diff
changeset
|
238 for readlength in readlengths: |
77de5fc623f9
planemo upload for repository https://bitbucket.org/drosofff/gedtools/
mvdbeek
parents:
diff
changeset
|
239 line = [] |
77de5fc623f9
planemo upload for repository https://bitbucket.org/drosofff/gedtools/
mvdbeek
parents:
diff
changeset
|
240 line.extend([chromosome, library, readlength]) |
77de5fc623f9
planemo upload for repository https://bitbucket.org/drosofff/gedtools/
mvdbeek
parents:
diff
changeset
|
241 try: |
77de5fc623f9
planemo upload for repository https://bitbucket.org/drosofff/gedtools/
mvdbeek
parents:
diff
changeset
|
242 line.extend([result_dict[library][readlength][chromosome].get(mismatch, 0) for mismatch in possible_mismatches]) |
77de5fc623f9
planemo upload for repository https://bitbucket.org/drosofff/gedtools/
mvdbeek
parents:
diff
changeset
|
243 line.extend([result_dict[library][readlength][chromosome].get(u'total valid reads', 0)]) |
77de5fc623f9
planemo upload for repository https://bitbucket.org/drosofff/gedtools/
mvdbeek
parents:
diff
changeset
|
244 except KeyError: |
77de5fc623f9
planemo upload for repository https://bitbucket.org/drosofff/gedtools/
mvdbeek
parents:
diff
changeset
|
245 line.extend([0 for mismatch in possible_mismatches]) |
77de5fc623f9
planemo upload for repository https://bitbucket.org/drosofff/gedtools/
mvdbeek
parents:
diff
changeset
|
246 line.extend([0]) |
77de5fc623f9
planemo upload for repository https://bitbucket.org/drosofff/gedtools/
mvdbeek
parents:
diff
changeset
|
247 result.append(line) |
77de5fc623f9
planemo upload for repository https://bitbucket.org/drosofff/gedtools/
mvdbeek
parents:
diff
changeset
|
248 df = pd.DataFrame(result, columns=header.split('\t')) |
2
2974c382105c
planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/mismatch_frequencies commit 10a7e3877c2568d9c23de53fc97dc1c902ff0524-dirty
mvdbeek
parents:
0
diff
changeset
|
249 last_column = 3 + len(possible_mismatches) |
2974c382105c
planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/mismatch_frequencies commit 10a7e3877c2568d9c23de53fc97dc1c902ff0524-dirty
mvdbeek
parents:
0
diff
changeset
|
250 df['mismatches/per aligned nucleotides'] = df.iloc[:, 3:last_column].sum(1)/(df.iloc[:, last_column] * df['Readlength']) |
0
77de5fc623f9
planemo upload for repository https://bitbucket.org/drosofff/gedtools/
mvdbeek
parents:
diff
changeset
|
251 return df |
2
2974c382105c
planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/mismatch_frequencies commit 10a7e3877c2568d9c23de53fc97dc1c902ff0524-dirty
mvdbeek
parents:
0
diff
changeset
|
252 |
2974c382105c
planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/mismatch_frequencies commit 10a7e3877c2568d9c23de53fc97dc1c902ff0524-dirty
mvdbeek
parents:
0
diff
changeset
|
253 |
0
77de5fc623f9
planemo upload for repository https://bitbucket.org/drosofff/gedtools/
mvdbeek
parents:
diff
changeset
|
254 def setup_MismatchFrequencies(args): |
2
2974c382105c
planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/mismatch_frequencies commit 10a7e3877c2568d9c23de53fc97dc1c902ff0524-dirty
mvdbeek
parents:
0
diff
changeset
|
255 resultDict = OrderedDict() |
2974c382105c
planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/mismatch_frequencies commit 10a7e3877c2568d9c23de53fc97dc1c902ff0524-dirty
mvdbeek
parents:
0
diff
changeset
|
256 kw_list = [{'result_dict': resultDict, |
2974c382105c
planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/mismatch_frequencies commit 10a7e3877c2568d9c23de53fc97dc1c902ff0524-dirty
mvdbeek
parents:
0
diff
changeset
|
257 'alignment_file': alignment_file, |
2974c382105c
planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/mismatch_frequencies commit 10a7e3877c2568d9c23de53fc97dc1c902ff0524-dirty
mvdbeek
parents:
0
diff
changeset
|
258 'name': name, |
2974c382105c
planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/mismatch_frequencies commit 10a7e3877c2568d9c23de53fc97dc1c902ff0524-dirty
mvdbeek
parents:
0
diff
changeset
|
259 'minimal_readlength': args.min, |
2974c382105c
planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/mismatch_frequencies commit 10a7e3877c2568d9c23de53fc97dc1c902ff0524-dirty
mvdbeek
parents:
0
diff
changeset
|
260 'maximal_readlength': args.max, |
2974c382105c
planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/mismatch_frequencies commit 10a7e3877c2568d9c23de53fc97dc1c902ff0524-dirty
mvdbeek
parents:
0
diff
changeset
|
261 'number_of_allowed_mismatches': args.n_mm, |
2974c382105c
planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/mismatch_frequencies commit 10a7e3877c2568d9c23de53fc97dc1c902ff0524-dirty
mvdbeek
parents:
0
diff
changeset
|
262 'ignore_5p_nucleotides': args.five_p, |
2974c382105c
planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/mismatch_frequencies commit 10a7e3877c2568d9c23de53fc97dc1c902ff0524-dirty
mvdbeek
parents:
0
diff
changeset
|
263 'ignore_3p_nucleotides': args.three_p, |
2974c382105c
planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/mismatch_frequencies commit 10a7e3877c2568d9c23de53fc97dc1c902ff0524-dirty
mvdbeek
parents:
0
diff
changeset
|
264 'possible_mismatches': args.possible_mismatches} |
2974c382105c
planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/mismatch_frequencies commit 10a7e3877c2568d9c23de53fc97dc1c902ff0524-dirty
mvdbeek
parents:
0
diff
changeset
|
265 for alignment_file, name in zip(args.input, args.name)] |
0
77de5fc623f9
planemo upload for repository https://bitbucket.org/drosofff/gedtools/
mvdbeek
parents:
diff
changeset
|
266 return (kw_list, resultDict) |
77de5fc623f9
planemo upload for repository https://bitbucket.org/drosofff/gedtools/
mvdbeek
parents:
diff
changeset
|
267 |
2
2974c382105c
planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/mismatch_frequencies commit 10a7e3877c2568d9c23de53fc97dc1c902ff0524-dirty
mvdbeek
parents:
0
diff
changeset
|
268 |
0
77de5fc623f9
planemo upload for repository https://bitbucket.org/drosofff/gedtools/
mvdbeek
parents:
diff
changeset
|
269 def nested_dict_to_df(dictionary): |
77de5fc623f9
planemo upload for repository https://bitbucket.org/drosofff/gedtools/
mvdbeek
parents:
diff
changeset
|
270 dictionary = {(outerKey, innerKey): values for outerKey, innerDict in dictionary.iteritems() for innerKey, values in innerDict.iteritems()} |
2
2974c382105c
planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/mismatch_frequencies commit 10a7e3877c2568d9c23de53fc97dc1c902ff0524-dirty
mvdbeek
parents:
0
diff
changeset
|
271 df = pd.DataFrame.from_dict(dictionary).transpose() |
0
77de5fc623f9
planemo upload for repository https://bitbucket.org/drosofff/gedtools/
mvdbeek
parents:
diff
changeset
|
272 df.index.names = ['Library', 'Readlength'] |
77de5fc623f9
planemo upload for repository https://bitbucket.org/drosofff/gedtools/
mvdbeek
parents:
diff
changeset
|
273 return df |
77de5fc623f9
planemo upload for repository https://bitbucket.org/drosofff/gedtools/
mvdbeek
parents:
diff
changeset
|
274 |
2
2974c382105c
planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/mismatch_frequencies commit 10a7e3877c2568d9c23de53fc97dc1c902ff0524-dirty
mvdbeek
parents:
0
diff
changeset
|
275 |
0
77de5fc623f9
planemo upload for repository https://bitbucket.org/drosofff/gedtools/
mvdbeek
parents:
diff
changeset
|
276 def run_MismatchFrequencies(args): |
2
2974c382105c
planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/mismatch_frequencies commit 10a7e3877c2568d9c23de53fc97dc1c902ff0524-dirty
mvdbeek
parents:
0
diff
changeset
|
277 kw_list, resultDict = setup_MismatchFrequencies(args) |
0
77de5fc623f9
planemo upload for repository https://bitbucket.org/drosofff/gedtools/
mvdbeek
parents:
diff
changeset
|
278 references = [MismatchFrequencies(**kw_dict).references for kw_dict in kw_list] |
77de5fc623f9
planemo upload for repository https://bitbucket.org/drosofff/gedtools/
mvdbeek
parents:
diff
changeset
|
279 return (resultDict, references[0]) |
77de5fc623f9
planemo upload for repository https://bitbucket.org/drosofff/gedtools/
mvdbeek
parents:
diff
changeset
|
280 |
2
2974c382105c
planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/mismatch_frequencies commit 10a7e3877c2568d9c23de53fc97dc1c902ff0524-dirty
mvdbeek
parents:
0
diff
changeset
|
281 |
0
77de5fc623f9
planemo upload for repository https://bitbucket.org/drosofff/gedtools/
mvdbeek
parents:
diff
changeset
|
282 def main(): |
77de5fc623f9
planemo upload for repository https://bitbucket.org/drosofff/gedtools/
mvdbeek
parents:
diff
changeset
|
283 result_dict, references = run_MismatchFrequencies(args) |
77de5fc623f9
planemo upload for repository https://bitbucket.org/drosofff/gedtools/
mvdbeek
parents:
diff
changeset
|
284 df = format_result_dict(result_dict, references, args.possible_mismatches) |
77de5fc623f9
planemo upload for repository https://bitbucket.org/drosofff/gedtools/
mvdbeek
parents:
diff
changeset
|
285 reduced_dict = reduce_result(df, args.possible_mismatches) |
77de5fc623f9
planemo upload for repository https://bitbucket.org/drosofff/gedtools/
mvdbeek
parents:
diff
changeset
|
286 plot_result(reduced_dict, args) |
77de5fc623f9
planemo upload for repository https://bitbucket.org/drosofff/gedtools/
mvdbeek
parents:
diff
changeset
|
287 reduced_df = nested_dict_to_df(reduced_dict) |
77de5fc623f9
planemo upload for repository https://bitbucket.org/drosofff/gedtools/
mvdbeek
parents:
diff
changeset
|
288 df_to_tab(reduced_df, args.output_tab) |
2
2974c382105c
planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/mismatch_frequencies commit 10a7e3877c2568d9c23de53fc97dc1c902ff0524-dirty
mvdbeek
parents:
0
diff
changeset
|
289 if args.expanded_output_tab: |
0
77de5fc623f9
planemo upload for repository https://bitbucket.org/drosofff/gedtools/
mvdbeek
parents:
diff
changeset
|
290 df_to_tab(df, args.expanded_output_tab) |
77de5fc623f9
planemo upload for repository https://bitbucket.org/drosofff/gedtools/
mvdbeek
parents:
diff
changeset
|
291 return reduced_dict |
77de5fc623f9
planemo upload for repository https://bitbucket.org/drosofff/gedtools/
mvdbeek
parents:
diff
changeset
|
292 |
77de5fc623f9
planemo upload for repository https://bitbucket.org/drosofff/gedtools/
mvdbeek
parents:
diff
changeset
|
293 if __name__ == "__main__": |
2
2974c382105c
planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/mismatch_frequencies commit 10a7e3877c2568d9c23de53fc97dc1c902ff0524-dirty
mvdbeek
parents:
0
diff
changeset
|
294 |
0
77de5fc623f9
planemo upload for repository https://bitbucket.org/drosofff/gedtools/
mvdbeek
parents:
diff
changeset
|
295 parser = argparse.ArgumentParser(description='Produce mismatch statistics for BAM/SAM alignment files.') |
77de5fc623f9
planemo upload for repository https://bitbucket.org/drosofff/gedtools/
mvdbeek
parents:
diff
changeset
|
296 parser.add_argument('--input', nargs='*', help='Input files in SAM/BAM format') |
77de5fc623f9
planemo upload for repository https://bitbucket.org/drosofff/gedtools/
mvdbeek
parents:
diff
changeset
|
297 parser.add_argument('--name', nargs='*', help='Name for input file to display in output file. Should have same length as the number of inputs') |
77de5fc623f9
planemo upload for repository https://bitbucket.org/drosofff/gedtools/
mvdbeek
parents:
diff
changeset
|
298 parser.add_argument('--output_pdf', help='Output filename for graph') |
77de5fc623f9
planemo upload for repository https://bitbucket.org/drosofff/gedtools/
mvdbeek
parents:
diff
changeset
|
299 parser.add_argument('--output_tab', help='Output filename for table') |
77de5fc623f9
planemo upload for repository https://bitbucket.org/drosofff/gedtools/
mvdbeek
parents:
diff
changeset
|
300 parser.add_argument('--expanded_output_tab', default=None, help='Output filename for table') |
77de5fc623f9
planemo upload for repository https://bitbucket.org/drosofff/gedtools/
mvdbeek
parents:
diff
changeset
|
301 parser.add_argument('--possible_mismatches', default=[ |
2
2974c382105c
planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/mismatch_frequencies commit 10a7e3877c2568d9c23de53fc97dc1c902ff0524-dirty
mvdbeek
parents:
0
diff
changeset
|
302 'AC', 'AG', 'AT', 'CA', 'CG', 'CT', 'GA', 'GC', 'GT', 'TA', 'TC', 'TG' |
0
77de5fc623f9
planemo upload for repository https://bitbucket.org/drosofff/gedtools/
mvdbeek
parents:
diff
changeset
|
303 ], nargs='+', help='specify mismatches that should be counted for the mismatch frequency. The format is Reference base -> observed base, eg AG for A to G mismatches.') |
77de5fc623f9
planemo upload for repository https://bitbucket.org/drosofff/gedtools/
mvdbeek
parents:
diff
changeset
|
304 parser.add_argument('--min', '--minimal_readlength', type=int, help='minimum readlength') |
77de5fc623f9
planemo upload for repository https://bitbucket.org/drosofff/gedtools/
mvdbeek
parents:
diff
changeset
|
305 parser.add_argument('--max', '--maximal_readlength', type=int, help='maximum readlength') |
77de5fc623f9
planemo upload for repository https://bitbucket.org/drosofff/gedtools/
mvdbeek
parents:
diff
changeset
|
306 parser.add_argument('--n_mm', '--number_allowed_mismatches', type=int, default=1, help='discard reads with more than n mismatches') |
77de5fc623f9
planemo upload for repository https://bitbucket.org/drosofff/gedtools/
mvdbeek
parents:
diff
changeset
|
307 parser.add_argument('--five_p', '--ignore_5p_nucleotides', type=int, default=0, help='when calculating nucleotide mismatch frequencies ignore the first N nucleotides of the read') |
77de5fc623f9
planemo upload for repository https://bitbucket.org/drosofff/gedtools/
mvdbeek
parents:
diff
changeset
|
308 parser.add_argument('--three_p', '--ignore_3p_nucleotides', type=int, default=1, help='when calculating nucleotide mismatch frequencies ignore the last N nucleotides of the read') |
2
2974c382105c
planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/mismatch_frequencies commit 10a7e3877c2568d9c23de53fc97dc1c902ff0524-dirty
mvdbeek
parents:
0
diff
changeset
|
309 # args = parser.parse_args(['--input', '3mismatches_ago2ip_s2.bam', '3mismatches_ago2ip_ovary.bam','--possible_mismatches','AC','AG', 'CG', 'TG', 'CT','--name', 'Siomi1', 'Siomi2' , '--five_p', '3','--three_p','3','--output_pdf', 'out.pdf', '--output_tab', 'out.tab', '--expanded_output_tab', 'expanded.tab', '--min', '20', '--max', '22']) |
0
77de5fc623f9
planemo upload for repository https://bitbucket.org/drosofff/gedtools/
mvdbeek
parents:
diff
changeset
|
310 args = parser.parse_args() |
77de5fc623f9
planemo upload for repository https://bitbucket.org/drosofff/gedtools/
mvdbeek
parents:
diff
changeset
|
311 reduced_dict = main() |