Mercurial > repos > petr-novak > long_reads_sampling
comparison long_reads_sampling.py @ 0:dd46956ff61f draft
Uploaded
| author | petr-novak |
|---|---|
| date | Fri, 08 Dec 2017 09:57:17 -0500 |
| parents | |
| children | ccaedca97e5e |
comparison
equal
deleted
inserted
replaced
| -1:000000000000 | 0:dd46956ff61f |
|---|---|
| 1 #!/usr/bin/env python3 | |
| 2 import sys | |
| 3 import argparse | |
| 4 import random | |
| 5 from argparse import ArgumentDefaultsHelpFormatter | |
| 6 from Bio import SeqIO | |
| 7 from long2short import get_sequences_summary | |
| 8 | |
| 9 | |
| 10 def get_args(): | |
| 11 '''Parses command line arguments ''' | |
| 12 description = ("Create sample of long reads, instead" | |
| 13 " of setting number of reads to be sampled," | |
| 14 "total length of all sampled sequences is defined") | |
| 15 | |
| 16 parser = argparse.ArgumentParser( | |
| 17 description=description, | |
| 18 formatter_class=ArgumentDefaultsHelpFormatter) | |
| 19 parser.add_argument('-i', | |
| 20 '--input', | |
| 21 type=argparse.FileType('r'), | |
| 22 help="file with long reads in fasta format") | |
| 23 parser.add_argument('-o', | |
| 24 '--output', | |
| 25 type=argparse.FileType('w'), | |
| 26 help="Output file name") | |
| 27 parser.add_argument("-l", | |
| 28 "--total_length", | |
| 29 type=int, | |
| 30 help="total length of sampled output") | |
| 31 | |
| 32 parser.add_argument("-s", | |
| 33 "--seed", | |
| 34 type=int, | |
| 35 default=123, | |
| 36 help="random number generator seed") | |
| 37 | |
| 38 args = parser.parse_args() | |
| 39 if len(sys.argv) == 1: | |
| 40 parser.print_help() | |
| 41 sys.exit(1) | |
| 42 | |
| 43 return args | |
| 44 | |
| 45 | |
| 46 def make_sequence_sample(args, seq_summary): | |
| 47 ''' | |
| 48 create output sequence of required length or larger, save to fasta | |
| 49 ''' | |
| 50 random.seed(args.seed) | |
| 51 print(args.seed) | |
| 52 selected_seq_id = set() | |
| 53 cummulative_length = 0 | |
| 54 for seq_id in random.sample(seq_summary.id_length.keys(), | |
| 55 seq_summary.number_of_sequence): | |
| 56 selected_seq_id.add(seq_id) | |
| 57 cummulative_length += seq_summary.id_length[seq_id] | |
| 58 if cummulative_length >= args.total_length: | |
| 59 break | |
| 60 print(selected_seq_id) | |
| 61 # to make it efficient same orger is necesary | |
| 62 selected_seq_id_ordered = [i | |
| 63 for i in seq_summary.id_length | |
| 64 if i in selected_seq_id] | |
| 65 curent_id = selected_seq_id_ordered.pop(0) | |
| 66 with open(args.output.name, 'w') as output_seq_file: | |
| 67 for seqs in SeqIO.parse(args.input.name, "fasta"): | |
| 68 if seqs.id == curent_id: | |
| 69 SeqIO.write(seqs, output_seq_file, "fasta") | |
| 70 try: | |
| 71 curent_id = selected_seq_id_ordered.pop(0) | |
| 72 except IndexError: | |
| 73 break | |
| 74 | |
| 75 | |
| 76 def main(): | |
| 77 ''' | |
| 78 Create sample of longn reads, sample size is defined as total length | |
| 79 ''' | |
| 80 args = get_args() | |
| 81 seq_summary = get_sequences_summary(args.input) | |
| 82 # check that total length if larger than required sampling | |
| 83 if seq_summary.total_length < args.total_length: | |
| 84 print("Input sequences total length: ", | |
| 85 seq_summary.total_length, | |
| 86 file=sys.stderr) | |
| 87 print("Required output total length: ", | |
| 88 args.total_length, | |
| 89 file=sys.stderr) | |
| 90 print("Required sequence length is larger than input data, exiting", | |
| 91 file=sys.stderr) | |
| 92 sys.exit(1) | |
| 93 | |
| 94 make_sequence_sample(args, seq_summary) | |
| 95 | |
| 96 | |
| 97 if __name__ == "__main__": | |
| 98 main() |
