comparison scripts/ReMatCh/utils/strip_alignment.py @ 3:0cbed1c0a762 draft default tip

planemo upload commit 15239f1674081ab51ab8dd75a9a40cf1bfaa93e8
author cstrittmatter
date Tue, 28 Jan 2020 10:42:31 -0500
parents 965517909457
children
comparison
equal deleted inserted replaced
2:6837f733b4aa 3:0cbed1c0a762
1 #!/usr/bin/env python 1 #!/usr/bin/env python3
2 2
3 # -*- coding: utf-8 -*- 3 # -*- coding: utf-8 -*-
4 4
5 """ 5 """
6 strip_alignment.py - Strip alignment positions containing gaps, 6 strip_alignment.py - Strip alignment positions containing gaps,
7 missing data and invariable positions 7 missing data and invariable positions
8 <https://github.com/B-UMMI/ReMatCh/> 8 <https://github.com/B-UMMI/ReMatCh/>
9 9
10 Copyright (C) 2017 Miguel Machado <mpmachado@medicina.ulisboa.pt> 10 Copyright (C) 2018 Miguel Machado <mpmachado@medicina.ulisboa.pt>
11 11
12 Last modified: March 20, 2017 12 Last modified: October 15, 2018
13 13
14 This program is free software: you can redistribute it and/or modify 14 This program is free software: you can redistribute it and/or modify
15 it under the terms of the GNU General Public License as published by 15 it under the terms of the GNU General Public License as published by
16 the Free Software Foundation, either version 3 of the License, or 16 the Free Software Foundation, either version 3 of the License, or
17 (at your option) any later version. 17 (at your option) any later version.
29 import os 29 import os
30 import argparse 30 import argparse
31 import sys 31 import sys
32 32
33 33
34 version = '0.1' 34 version = '0.2'
35 35
36 36
37 def get_sequences(infile): 37 def get_sequences(infile):
38 print 'Getting sequences' 38 print('Getting sequences')
39 sequences_SeqIO = list(SeqIO.parse(infile, 'fasta')) 39 sequences_seq_io = list(SeqIO.parse(infile, 'fasta'))
40 40
41 sequence_length = None 41 sequence_length = None
42 sequences_dict = {} 42 sequences_dict = {}
43 all_executed_printed = False 43 all_executed_printed = False
44 for x, sequence in enumerate(sequences_SeqIO): 44 for x, sequence in enumerate(sequences_seq_io):
45 if sequence_length is None: 45 if sequence_length is None:
46 sequence_length = len(sequence.seq) 46 sequence_length = len(sequence.seq)
47 if sequence_length != len(sequence.seq): 47 if sequence_length != len(sequence.seq):
48 sys.exit('Sequences with different length!') 48 sys.exit('Sequences with different length!')
49 sequences_dict[sequence.id] = list(sequence.seq) 49 sequences_dict[sequence.id] = list(sequence.seq)
50 50
51 if (x + 1) % 10 == 0: 51 if (x + 1) % 10 == 0:
52 print '\n' + str(round((float(x + 1) / len(sequences_SeqIO)) * 100, 2)) + '% of sequences already processed (getting sequences)' 52 print('\n' + str(round((float(x + 1) / len(sequences_seq_io)) * 100, 2)) + '% of sequences already processed (getting sequences)')
53 if x + 1 == len(sequences_SeqIO): 53 if x + 1 == len(sequences_seq_io):
54 all_executed_printed = True 54 all_executed_printed = True
55 if not all_executed_printed: 55 if not all_executed_printed:
56 print '\n' + str(round((float(x + 1) / len(sequences_SeqIO)) * 100, 2)) + '% of sequences already processed (getting sequences)' 56 print('\n' + str(round((float(x + 1) / len(sequences_seq_io)) * 100, 2)) + '% of sequences already processed (getting sequences)')
57 57
58 return sequences_dict, sequence_length 58 return sequences_dict, sequence_length
59 59
60 60
61 def positions_type(sequences_dict, sequence_length, notGAPs, notMissing, notInvariable): 61 def positions_type(sequences_dict, sequence_length, not_gaps, not_missing, not_invariable):
62 print 'Determining positions type' 62 print('Determining positions type')
63 positions_2_keep = [] 63 positions_2_keep = []
64 invariable = [] 64 invariable = []
65 missing = [] 65 missing = []
66 gaps = [] 66 gaps = []
67 gaps_missing = 0 67 gaps_missing = 0
81 gaps_missing += 1 81 gaps_missing += 1
82 if len(possibilities) > 1 and len(possibilities.intersection(set(['N', '-']))) == 0: 82 if len(possibilities) > 1 and len(possibilities.intersection(set(['N', '-']))) == 0:
83 positions_2_keep.append(i) 83 positions_2_keep.append(i)
84 84
85 if (i + 1) % 10000 == 0: 85 if (i + 1) % 10000 == 0:
86 print '\n' + str(round((float(i + 1) / sequence_length) * 100, 2)) + '% of positions already processed (determining positions type)' 86 print('\n' + str(round((float(i + 1) / sequence_length) * 100, 2)) + '% of positions already'
87 ' processed (determining positions'
88 ' type)')
87 if i + 1 == len(sequences_dict): 89 if i + 1 == len(sequences_dict):
88 all_executed_printed = True 90 all_executed_printed = True
89 if not all_executed_printed: 91 if not all_executed_printed:
90 print '\n' + str(round((float(i + 1) / sequence_length) * 100, 2)) + '% of positions already processed (determining positions type)' 92 print('\n' + str(round((float(i + 1) / sequence_length) * 100, 2)) + '% of positions already'
93 ' processed (determining positions'
94 ' type)')
91 95
92 print 'Positions to keep (no matter): ' + str(len(positions_2_keep)) 96 print('Positions to keep (no matter): ' + str(len(positions_2_keep)))
93 print 'Invariable sites: ' + str(len(invariable)) 97 print('Invariable sites: ' + str(len(invariable)))
94 print 'Positions with missing data ("N"): ' + str(len(missing)) 98 print('Positions with missing data ("N"): ' + str(len(missing)))
95 print 'Positions with GAPs ("-"): ' + str(len(gaps)) 99 print('Positions with GAPs ("-"): ' + str(len(gaps)))
96 print 'Positions with GAPs or missing data: ' + str(gaps_missing) 100 print('Positions with GAPs or missing data: ' + str(gaps_missing))
97 101
98 if notGAPs: 102 if not_gaps:
99 positions_2_keep.extend(gaps) 103 positions_2_keep.extend(gaps)
100 if notMissing: 104 if not_missing:
101 positions_2_keep.extend(missing) 105 positions_2_keep.extend(missing)
102 if notInvariable: 106 if not_invariable:
103 positions_2_keep.extend(invariable) 107 positions_2_keep.extend(invariable)
104 108
105 positions_2_keep = sorted(set(positions_2_keep)) 109 positions_2_keep = sorted(set(positions_2_keep))
106 110
107 print 'Positions to keep (final): ' + str(len(positions_2_keep)) 111 print('Positions to keep (final): ' + str(len(positions_2_keep)))
108 112
109 return positions_2_keep 113 return positions_2_keep
110 114
111 115
112 def chunkstring(string, length): 116 def chunkstring(string, length):
113 return (string[0 + i:length + i] for i in range(0, len(string), length)) 117 return (string[0 + i:length + i] for i in range(0, len(string), length))
114 118
115 119
116 def write_fasta(sequences_dict, positions_2_keep, outfile): 120 def write_fasta(sequences_dict, positions_2_keep, outfile):
117 print 'Writing stripped sequences' 121 print('Writing stripped sequences')
118 all_executed_printed = False 122 all_executed_printed = False
119 with open(outfile, 'wt') as writer: 123 with open(outfile, 'wt') as writer:
120 for x, sample in enumerate(sequences_dict): 124 for x, sample in enumerate(sequences_dict):
121 writer.write('>' + sample + '\n') 125 writer.write('>' + sample + '\n')
122 fasta_sequence_lines = chunkstring(''.join([sequences_dict[sample][i] for i in positions_2_keep]), 80) 126 fasta_sequence_lines = chunkstring(''.join([sequences_dict[sample][i] for i in positions_2_keep]), 80)
123 for line in fasta_sequence_lines: 127 for line in fasta_sequence_lines:
124 writer.write(line + '\n') 128 writer.write(line + '\n')
125 129
126 if (x + 1) % 100 == 0: 130 if (x + 1) % 100 == 0:
127 print '\n' + str(round((float(x + 1) / len(sequences_dict)) * 100, 2)) + '% of sequences already processed (writing stripped sequences)' 131 print('\n' + str(round((float(x + 1) / len(sequences_dict)) * 100, 2)) + '% of sequences already'
132 ' processed (writing stripped'
133 ' sequences)')
128 if x + 1 == len(sequences_dict): 134 if x + 1 == len(sequences_dict):
129 all_executed_printed = True 135 all_executed_printed = True
130 if not all_executed_printed: 136 if not all_executed_printed:
131 print '\n' + str(round((float(x + 1) / len(sequences_dict)) * 100, 2)) + '% of sequences already processed (writing stripped sequences)' 137 print('\n' + str(round((float(x + 1) / len(sequences_dict)) * 100, 2)) + '% of sequences already'
138 ' processed (writing stripped'
139 ' sequences)')
132 140
133 141
134 def strip_alignment(args): 142 def strip_alignment(args):
135 outdir = os.path.dirname(os.path.abspath(args.outfile)) 143 outdir = os.path.dirname(os.path.abspath(args.outfile))
136 if not os.path.isdir(outdir): 144 if not os.path.isdir(outdir):
139 outfile = os.path.abspath(args.outfile) 147 outfile = os.path.abspath(args.outfile)
140 148
141 infile = os.path.abspath(args.infile.name) 149 infile = os.path.abspath(args.infile.name)
142 150
143 sequences_dict, sequence_length = get_sequences(infile) 151 sequences_dict, sequence_length = get_sequences(infile)
144 positions_2_keep = positions_type(sequences_dict, sequence_length, args.notGAPs, args.notMissing, args.notInvariable) 152 positions_2_keep = positions_type(sequences_dict, sequence_length, args.notGAPs, args.notMissing,
153 args.notInvariable)
145 write_fasta(sequences_dict, positions_2_keep, outfile) 154 write_fasta(sequences_dict, positions_2_keep, outfile)
146 155
147 156
148 def main(): 157 def main():
149 parser = argparse.ArgumentParser(prog='strip_alignment.py', description='Strip alignment positions containing gaps, missing data and invariable positions', formatter_class=argparse.ArgumentDefaultsHelpFormatter) 158 parser = argparse.ArgumentParser(prog='strip_alignment.py',
159 description='Strip alignment positions containing gaps, missing data and'
160 ' invariable positions',
161 formatter_class=argparse.ArgumentDefaultsHelpFormatter)
150 parser.add_argument('--version', help='Version information', action='version', version=str('%(prog)s v' + version)) 162 parser.add_argument('--version', help='Version information', action='version', version=str('%(prog)s v' + version))
151 163
152 parser_required = parser.add_argument_group('Required options') 164 parser_required = parser.add_argument_group('Required options')
153 parser_required.add_argument('-i', '--infile', type=argparse.FileType('r'), metavar='/path/to/aligned/input/file.fasta', help='Path to the aligned fasta file', required=True) 165 parser_required.add_argument('-i', '--infile', type=argparse.FileType('r'),
154 parser_required.add_argument('-o', '--outfile', type=str, metavar='/path/to/stripped/output/file.fasta', help='Stripped output fasta file', required=True, default='alignment_stripped.fasta') 166 metavar='/path/to/aligned/input/file.fasta', help='Path to the aligned fasta file',
167 required=True)
168 parser_required.add_argument('-o', '--outfile', type=str, metavar='/path/to/stripped/output/file.fasta',
169 help='Stripped output fasta file', required=True, default='alignment_stripped.fasta')
155 170
156 parser_optional_general = parser.add_argument_group('General facultative options') 171 parser_optional_general = parser.add_argument_group('General facultative options')
157 parser_optional_general.add_argument('--notGAPs', action='store_true', help='Not strip positions with GAPs') 172 parser_optional_general.add_argument('--notGAPs', action='store_true', help='Not strip positions with GAPs')
158 parser_optional_general.add_argument('--notMissing', action='store_true', help='Not strip positions with missing data') 173 parser_optional_general.add_argument('--notMissing', action='store_true',
174 help='Not strip positions with missing data')
159 parser_optional_general.add_argument('--notInvariable', action='store_true', help='Not strip invariable sites') 175 parser_optional_general.add_argument('--notInvariable', action='store_true', help='Not strip invariable sites')
160 176
161 args = parser.parse_args() 177 args = parser.parse_args()
162 178
163 strip_alignment(args) 179 strip_alignment(args)