Mercurial > repos > petr-novak > long_reads_sampling
comparison long2short.py @ 0:dd46956ff61f draft
Uploaded
| author | petr-novak |
|---|---|
| date | Fri, 08 Dec 2017 09:57:17 -0500 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| -1:000000000000 | 0:dd46956ff61f |
|---|---|
| 1 #!/usr/bin/env python3 | |
| 2 import argparse | |
| 3 import sys | |
| 4 from argparse import ArgumentDefaultsHelpFormatter | |
| 5 from collections import namedtuple | |
| 6 from collections import OrderedDict | |
| 7 from Bio import SeqIO | |
| 8 | |
# Sequence file format handed to Bio.SeqIO for both parsing and writing.
SEQ_FORMAT = "fasta"

# Sampling defaults; they are the default values of the argparse options.
DEFAULT_READ_LENGTH = 200
DEFAULT_INSERT_LENGTH = 700
DEFAULT_COVERAGE = 0.1

# Summary of one sequence file as built by get_sequences_summary():
# summed length of all records, record count, mapping id -> length,
# path of the file and the format it was parsed with.
Sequences_summary = namedtuple(
    'Fasta_summary',
    ['total_length', 'number_of_sequence',
     'id_length', 'file_path', 'format'],
)

# One sampled read pair on the sequence `id`: the forward read is cut from
# [start1:end1], the read that gets reverse complemented from [start2:end2].
Coordinates = namedtuple(
    'Coordinates',
    "id start1 end1 start2 end2",
)
| 20 | |
| 21 | |
def get_sequences_summary(seq_file):
    '''Collect basic statistics about the sequences stored in seq_file.

    The file is parsed once with Bio.SeqIO using SEQ_FORMAT.

    Return value:
    Sequences_summary holding the summed length of all records, the
    number of records, an OrderedDict mapping record id -> record length
    (in file order; a repeated id keeps only its last length), the path
    that was passed in and the format used for parsing.
    '''
    lengths_by_id = OrderedDict()
    record_count = 0
    summed_length = 0
    for record in SeqIO.parse(seq_file, SEQ_FORMAT):
        record_length = len(record)
        lengths_by_id[record.id] = record_length
        summed_length += record_length
        record_count += 1
    return Sequences_summary(
        total_length=summed_length,
        number_of_sequence=record_count,
        id_length=lengths_by_id,
        file_path=seq_file,
        format=SEQ_FORMAT,
    )
| 32 | |
| 33 | |
def get_short_pseudoreads_position(fasta_summary, sampling_options):
    """Yield the positions of pseudo read pairs on the long read sequences

    Arguments:
    fasta_summary - namedtuple Sequences_summary; only its id_length
                    mapping (sequence id -> length) is used
    sampling_options - object with read_length, insert_length and coverage
                       attributes, specifies how sequences are sampled
    Yields:
    Coordinates(id, start1, end1, start2, end2) for every sampling window,
    sequence by sequence in the order of fasta_summary.id_length.
    [start1, end1] is the first read, [start2, end2] the read at the other
    end of the insert. Windows are NOT clipped to the sequence length, the
    consumer has to discard pairs which run past the end of the sequence.
    """
    read_length = sampling_options.read_length
    insert_length = sampling_options.insert_length
    coverage = sampling_options.coverage

    # Every window yields two reads of read_length bases, so one window per
    # 2 * read_length / coverage bases gives the requested coverage.
    interval = int(2 * read_length / coverage)
    if interval == 0 and read_length > 0 and coverage > 0:
        # coverage > 2 * read_length truncates the step to zero and range()
        # would raise ValueError; a window at every position is the densest
        # sampling possible.
        interval = 1

    for seqname, length in fasta_summary.id_length.items():
        for start in range(1, length, interval):
            yield Coordinates(seqname,
                              start,
                              start + read_length,
                              start + insert_length - read_length,
                              start + insert_length)
| 51 | |
| 52 | |
def extract_short_reads(summary, args):
    '''Yield short reads sampled from long reads

    Arguments:
    summary .. named tuple which specifies sequences properties; file_path
               and format are used to parse the sequences again, the whole
               tuple is passed to get_short_pseudoreads_position
    args ..... defines how short sequences should be generated; read_length
               is used here, the whole object is passed on as sampling
               options
    Yields:
    for every sampling window which fits into its sequence the forward
    read followed by the reverse complemented read from the other end of
    the insert. Read ids are "<record number>_<start1>_<end1>_f" and "..._r"
    (both reads of a pair carry the coordinates of the forward read),
    descriptions are empty.
    '''
    pos = get_short_pseudoreads_position(summary, args)
    # next() with a default: a bare next() raising StopIteration inside
    # a generator is converted to RuntimeError since Python 3.7 (PEP 479).
    coords = next(pos, None)
    if coords is None:
        return
    records = SeqIO.parse(summary.file_path, summary.format)
    for index, record in enumerate(records, start=1):
        # Coordinates come grouped by sequence, consume all windows which
        # belong to the current record before moving to the next one.
        while coords.id == record.id:
            # forward read
            subseq_f = record[coords.start1:coords.end1]
            subseq_f.id = "{}_{}_{}_f".format(index, coords.start1,
                                              coords.end1)
            subseq_f.description = ""
            # reverse complement read
            subseq_r = record[coords.start2:coords.end2].reverse_complement()
            subseq_r.id = "{}_{}_{}_r".format(index, coords.start1,
                                              coords.end1)
            subseq_r.description = ""
            # return only if sequences are long enough; only the second
            # read is measured, a short one means that the window runs
            # past the end of the record
            if len(subseq_r) == args.read_length:
                yield subseq_f
                yield subseq_r
            coords = next(pos, None)
            if coords is None:
                # all sampling windows were used, nothing more to extract
                return
| 83 | |
| 84 | |
def _release_handle(handle):
    '''Close file object opened by argparse.FileType, keep std streams open'''
    if handle is sys.stdin or handle is sys.stdout:
        return
    close = getattr(handle, 'close', None)
    if close is not None:
        close()


def long2short(args):
    '''Sample short reads from long sequences
    args contain these attributes::
    ------------
    input - object with attribute name, path to file in fasta format; the
            file is parsed twice (summary, extraction) so it must be a
            real file
    output - object with attribute name, path to output file, fasta format;
             sys.stdout (argparse.FileType result for "-") is written to
             directly
    read_length, insert_length, coverage - sampling options, passed on to
            extract_short_reads

    '''
    # Sequences are read by path, the handle opened by argparse is not
    # needed and would stay open otherwise.
    _release_handle(args.input)
    summary = get_sequences_summary(args.input.name)
    reads = extract_short_reads(summary, args)

    if args.output is sys.stdout:
        # name of stdout is "<stdout>", opening it by name would create
        # a file with that name instead of writing to the stream
        for record in reads:
            SeqIO.write(record, sys.stdout, SEQ_FORMAT)
        return

    # Close the handle from argparse first so that the output file is not
    # opened for writing twice.
    _release_handle(args.output)
    with open(args.output.name, 'w') as f:
        for record in reads:
            SeqIO.write(record, f, SEQ_FORMAT)
| 99 | |
| 100 | |
def get_args():
    '''Parses command line arguments

    Prints help and exits with status 1 when the script is run without any
    argument. Input and output are required, coverage and read length must
    be positive; violations end with the argparse usage error.

    Return value:
    argparse.Namespace with attributes input and output (file objects
    opened by argparse.FileType), coverage, insert_length and read_length
    '''
    description = "Creates pseudo short reads from long oxford nanopore reads"
    parser = argparse.ArgumentParser(
        description=description,
        formatter_class=ArgumentDefaultsHelpFormatter)
    parser.add_argument('-i',
                        '--input',
                        required=True,
                        type=argparse.FileType('r'),
                        help="file with long reads in fasta format")
    parser.add_argument('-o',
                        '--output',
                        required=True,
                        type=argparse.FileType('w'),
                        help="Output file name")
    parser.add_argument("-cov",
                        "--coverage",
                        type=float,
                        default=DEFAULT_COVERAGE,
                        help="samplig coverage")
    parser.add_argument(
        "-L",
        "--insert_length",
        type=int,
        default=DEFAULT_INSERT_LENGTH,
        help="length of insert, must be longer than read length")
    parser.add_argument("-l",
                        "--read_length",
                        type=int,
                        default=DEFAULT_READ_LENGTH,
                        help="read length")

    # Checked before parsing: with required options parse_args() would
    # end with usage error (status 2) instead of the full help.
    if len(sys.argv) == 1:
        parser.print_help()
        sys.exit(1)
    args = parser.parse_args()

    # Sampling step is 2 * read_length / coverage, non positive values
    # would fail later with ZeroDivisionError or ValueError traceback.
    if not args.coverage > 0:
        parser.error("coverage must be greater than zero")
    if args.read_length <= 0:
        parser.error("read length must be greater than zero")
    # NOTE: help of --insert_length says that insert must be longer than
    # read length, but this was never enforced (the check was disabled in
    # the original code) and it is still not enforced here.
    return args
| 139 | |
| 140 | |
def main():
    '''Command line entry point

    Parse the command line options with get_args and pass the result to
    long2short, which samples the short reads from the long sequences.
    '''
    long2short(get_args())


# run only when executed as a script, not when imported as a module
if __name__ == "__main__":
    main()
