Mercurial > repos > cstrittmatter > test_eurl_vtec_wgs_pt
diff scripts/ReMatCh/utils/strip_alignment.py @ 3:0cbed1c0a762 draft default tip
planemo upload commit 15239f1674081ab51ab8dd75a9a40cf1bfaa93e8
author | cstrittmatter |
---|---|
date | Tue, 28 Jan 2020 10:42:31 -0500 |
parents | 965517909457 |
children |
line wrap: on
line diff
--- a/scripts/ReMatCh/utils/strip_alignment.py Wed Jan 22 09:10:12 2020 -0500 +++ b/scripts/ReMatCh/utils/strip_alignment.py Tue Jan 28 10:42:31 2020 -0500 @@ -1,4 +1,4 @@ -#!/usr/bin/env python +#!/usr/bin/env python3 # -*- coding: utf-8 -*- @@ -7,9 +7,9 @@ missing data and invariable positions <https://github.com/B-UMMI/ReMatCh/> -Copyright (C) 2017 Miguel Machado <mpmachado@medicina.ulisboa.pt> +Copyright (C) 2018 Miguel Machado <mpmachado@medicina.ulisboa.pt> -Last modified: March 20, 2017 +Last modified: October 15, 2018 This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -31,17 +31,17 @@ import sys -version = '0.1' +version = '0.2' def get_sequences(infile): - print 'Getting sequences' - sequences_SeqIO = list(SeqIO.parse(infile, 'fasta')) + print('Getting sequences') + sequences_seq_io = list(SeqIO.parse(infile, 'fasta')) sequence_length = None sequences_dict = {} all_executed_printed = False - for x, sequence in enumerate(sequences_SeqIO): + for x, sequence in enumerate(sequences_seq_io): if sequence_length is None: sequence_length = len(sequence.seq) if sequence_length != len(sequence.seq): @@ -49,17 +49,17 @@ sequences_dict[sequence.id] = list(sequence.seq) if (x + 1) % 10 == 0: - print '\n' + str(round((float(x + 1) / len(sequences_SeqIO)) * 100, 2)) + '% of sequences already processed (getting sequences)' - if x + 1 == len(sequences_SeqIO): + print('\n' + str(round((float(x + 1) / len(sequences_seq_io)) * 100, 2)) + '% of sequences already processed (getting sequences)') + if x + 1 == len(sequences_seq_io): all_executed_printed = True if not all_executed_printed: - print '\n' + str(round((float(x + 1) / len(sequences_SeqIO)) * 100, 2)) + '% of sequences already processed (getting sequences)' + print('\n' + str(round((float(x + 1) / len(sequences_seq_io)) * 100, 2)) + '% of sequences already processed (getting sequences)') return sequences_dict, sequence_length -def positions_type(sequences_dict, sequence_length, notGAPs, notMissing, notInvariable): - print 'Determining positions type' +def positions_type(sequences_dict, sequence_length, not_gaps, not_missing, not_invariable): + print('Determining positions type') positions_2_keep = [] invariable = [] missing = [] @@ -83,28 +83,32 @@ positions_2_keep.append(i) if (i + 1) % 10000 == 0: - print '\n' + str(round((float(i + 1) / sequence_length) * 100, 2)) + '% of positions already processed (determining positions type)' + print('\n' + str(round((float(i + 1) / sequence_length) * 100, 2)) + '% of positions already' + ' processed (determining positions' + ' type)') if i + 1 == len(sequences_dict): all_executed_printed = True if not all_executed_printed: - print '\n' + str(round((float(i + 1) / sequence_length) * 100, 2)) + '% of positions already processed (determining positions type)' + print('\n' + str(round((float(i + 1) / sequence_length) * 100, 2)) + '% of positions already' + ' processed (determining positions' + ' type)') - print 'Positions to keep (no matter): ' + str(len(positions_2_keep)) - print 'Invariable sites: ' + str(len(invariable)) - print 'Positions with missing data ("N"): ' + str(len(missing)) - print 'Positions with GAPs ("-"): ' + str(len(gaps)) - print 'Positions with GAPs or missing data: ' + str(gaps_missing) + print('Positions to keep (no matter): ' + str(len(positions_2_keep))) + print('Invariable sites: ' + str(len(invariable))) + print('Positions with missing data ("N"): ' + str(len(missing))) + print('Positions with GAPs ("-"): ' + str(len(gaps))) + print('Positions with GAPs or missing data: ' + str(gaps_missing)) - if notGAPs: + if not_gaps: positions_2_keep.extend(gaps) - if notMissing: + if not_missing: positions_2_keep.extend(missing) - if notInvariable: + if not_invariable: positions_2_keep.extend(invariable) positions_2_keep = sorted(set(positions_2_keep)) - print 'Positions to keep (final): ' + str(len(positions_2_keep)) + print('Positions to keep (final): ' + str(len(positions_2_keep))) return positions_2_keep @@ -114,7 +118,7 @@ def write_fasta(sequences_dict, positions_2_keep, outfile): - print 'Writing stripped sequences' + print('Writing stripped sequences') all_executed_printed = False with open(outfile, 'wt') as writer: for x, sample in enumerate(sequences_dict): @@ -124,11 +128,15 @@ writer.write(line + '\n') if (x + 1) % 100 == 0: - print '\n' + str(round((float(x + 1) / len(sequences_dict)) * 100, 2)) + '% of sequences already processed (writing stripped sequences)' + print('\n' + str(round((float(x + 1) / len(sequences_dict)) * 100, 2)) + '% of sequences already' + ' processed (writing stripped' + ' sequences)') if x + 1 == len(sequences_dict): all_executed_printed = True if not all_executed_printed: - print '\n' + str(round((float(x + 1) / len(sequences_dict)) * 100, 2)) + '% of sequences already processed (writing stripped sequences)' + print('\n' + str(round((float(x + 1) / len(sequences_dict)) * 100, 2)) + '% of sequences already' + ' processed (writing stripped' + ' sequences)') def strip_alignment(args): @@ -141,21 +149,29 @@ infile = os.path.abspath(args.infile.name) sequences_dict, sequence_length = get_sequences(infile) - positions_2_keep = positions_type(sequences_dict, sequence_length, args.notGAPs, args.notMissing, args.notInvariable) + positions_2_keep = positions_type(sequences_dict, sequence_length, args.notGAPs, args.notMissing, + args.notInvariable) write_fasta(sequences_dict, positions_2_keep, outfile) def main(): - parser = argparse.ArgumentParser(prog='strip_alignment.py', description='Strip alignment positions containing gaps, missing data and invariable positions', formatter_class=argparse.ArgumentDefaultsHelpFormatter) + parser = argparse.ArgumentParser(prog='strip_alignment.py', + description='Strip alignment positions containing gaps, missing data and' + ' invariable positions', + formatter_class=argparse.ArgumentDefaultsHelpFormatter) parser.add_argument('--version', help='Version information', action='version', version=str('%(prog)s v' + version)) parser_required = parser.add_argument_group('Required options') - parser_required.add_argument('-i', '--infile', type=argparse.FileType('r'), metavar='/path/to/aligned/input/file.fasta', help='Path to the aligned fasta file', required=True) - parser_required.add_argument('-o', '--outfile', type=str, metavar='/path/to/stripped/output/file.fasta', help='Stripped output fasta file', required=True, default='alignment_stripped.fasta') + parser_required.add_argument('-i', '--infile', type=argparse.FileType('r'), + metavar='/path/to/aligned/input/file.fasta', help='Path to the aligned fasta file', + required=True) + parser_required.add_argument('-o', '--outfile', type=str, metavar='/path/to/stripped/output/file.fasta', + help='Stripped output fasta file', required=True, default='alignment_stripped.fasta') parser_optional_general = parser.add_argument_group('General facultative options') parser_optional_general.add_argument('--notGAPs', action='store_true', help='Not strip positions with GAPs') - parser_optional_general.add_argument('--notMissing', action='store_true', help='Not strip positions with missing data') + parser_optional_general.add_argument('--notMissing', action='store_true', + help='Not strip positions with missing data') parser_optional_general.add_argument('--notInvariable', action='store_true', help='Not strip invariable sites') args = parser.parse_args()