| 0 | 1 #!/usr/bin/env python3 | 
|  | 2 import argparse | 
|  | 3 import sys | 
|  | 4 from argparse import ArgumentDefaultsHelpFormatter | 
|  | 5 from collections import namedtuple | 
|  | 6 from collections import OrderedDict | 
|  | 7 from Bio import SeqIO | 
|  | 8 | 
|  | 9 SEQ_FORMAT = "fasta" | 
|  | 10 # Default sampling, used on argparse: | 
|  | 11 DEFAULT_READ_LENGTH = 200 | 
|  | 12 DEFAULT_INSERT_LENGTH = 700 | 
|  | 13 DEFAULT_COVERAGE = 0.1 | 
|  | 14 | 
|  | 15 Sequences_summary = namedtuple('Fasta_summary', | 
|  | 16                                ['total_length', 'number_of_sequence', | 
|  | 17                                 'id_length', 'file_path', 'format']) | 
|  | 18 | 
|  | 19 Coordinates = namedtuple('Coordinates', "id start1 end1 start2 end2") | 
|  | 20 | 
|  | 21 | 
|  | 22 def get_sequences_summary(seq_file): | 
|  | 23     ''' return basic characteristic of sequences ''' | 
|  | 24     id_length = OrderedDict() | 
|  | 25     totat_length = 0 | 
|  | 26     N = 0 | 
|  | 27     for seqs in SeqIO.parse(seq_file, SEQ_FORMAT): | 
|  | 28         id_length[seqs.id] = len(seqs) | 
|  | 29         totat_length += len(seqs) | 
|  | 30         N += 1 | 
|  | 31     return Sequences_summary(totat_length, N, id_length, seq_file, SEQ_FORMAT) | 
|  | 32 | 
|  | 33 | 
|  | 34 def get_short_pseudoreads_position(fasta_summary, sampling_options): | 
|  | 35     """Return selected position on long read sequences | 
|  | 36     Arguments: | 
|  | 37     fasta_summary - namedtuple Fasta_summaty containing information about sequences | 
|  | 38     sampling options - namedtuple, specified how sequences should be sampled | 
|  | 39     Return value: | 
|  | 40     (sequence_id, start1, end1, start2, end2) | 
|  | 41     """ | 
|  | 42     interval = int(2 * sampling_options.read_length / | 
|  | 43                    sampling_options.coverage) | 
|  | 44     for seqname, length in fasta_summary.id_length.items(): | 
|  | 45         start_positions = range(1, length, interval) | 
|  | 46         for s in start_positions: | 
|  | 47             yield Coordinates(seqname, s, s + sampling_options.read_length, | 
|  | 48                               s + sampling_options.insert_length - | 
|  | 49                               sampling_options.read_length, | 
|  | 50                               s + sampling_options.insert_length) | 
|  | 51 | 
|  | 52 | 
|  | 53 def extract_short_reads(summary, args): | 
|  | 54     '''yield short reades sampled from long reads | 
|  | 55     Arguments: | 
|  | 56     summary.. named tuple specifie sequences properties, path, length, idslist | 
|  | 57     args ..... Define how short sequences should be generated | 
|  | 58     ''' | 
|  | 59     pos = get_short_pseudoreads_position(summary, args) | 
|  | 60     coords = next(pos) | 
|  | 61     index = 0 | 
|  | 62     for i in SeqIO.parse(summary.file_path, summary.format): | 
|  | 63         index += 1 | 
|  | 64         while True: | 
|  | 65             if coords.id == i.id: | 
|  | 66                 # forward read | 
|  | 67                 subseq_f = i[coords.start1:coords.end1] | 
|  | 68                 subseq_f.id = "{}_{}_{}_f".format(index, coords.start1, | 
|  | 69                                                   coords.end1) | 
|  | 70                 subseq_f.description = "" | 
|  | 71                 # reverse complement read | 
|  | 72                 subseq_r = i[coords.start2:coords.end2].reverse_complement() | 
|  | 73                 subseq_r.id = "{}_{}_{}_r".format(index, coords.start1, | 
|  | 74                                                   coords.end1) | 
|  | 75                 subseq_r.description = "" | 
|  | 76                 # return only if sequences are long enough | 
|  | 77                 if len(subseq_r) == args.read_length: | 
|  | 78                     yield subseq_f | 
|  | 79                     yield subseq_r | 
|  | 80                 coords = next(pos) | 
|  | 81             else: | 
|  | 82                 break | 
|  | 83 | 
|  | 84 | 
|  | 85 def long2short(args): | 
|  | 86     '''Sample short reads from long sequences | 
|  | 87     args contain these attributes:: | 
|  | 88     ------------ | 
|  | 89     input_file   - path to file in fasta format | 
|  | 90     output_file  - path to output file, fasta format | 
|  | 91     options      - options is named tuple and specifies read length | 
|  | 92                    coverage, insert length, max number of sequences which will be return | 
|  | 93 | 
|  | 94     ''' | 
|  | 95     summary = get_sequences_summary(args.input.name) | 
|  | 96     with open(args.output.name, 'w') as f: | 
|  | 97         for i in extract_short_reads(summary, args): | 
|  | 98             SeqIO.write(i, f, SEQ_FORMAT) | 
|  | 99 | 
|  | 100 | 
|  | 101 def get_args(): | 
|  | 102     '''Parses command line arguments ''' | 
|  | 103     description = "Creates pseudo short reads from long oxford nanopore reads" | 
|  | 104     parser = argparse.ArgumentParser( | 
|  | 105         description=description, | 
|  | 106         formatter_class=ArgumentDefaultsHelpFormatter) | 
|  | 107     parser.add_argument('-i', | 
|  | 108                         '--input', | 
|  | 109                         type=argparse.FileType('r'), | 
|  | 110                         help="file with long reads in fasta format") | 
|  | 111     parser.add_argument('-o', | 
|  | 112                         '--output', | 
|  | 113                         type=argparse.FileType('w'), | 
|  | 114                         help="Output file name") | 
|  | 115     parser.add_argument("-cov", | 
|  | 116                         "--coverage", | 
|  | 117                         type=float, | 
|  | 118                         default=DEFAULT_COVERAGE, | 
|  | 119                         help="samplig coverage") | 
|  | 120     parser.add_argument( | 
|  | 121         "-L", | 
|  | 122         "--insert_length", | 
|  | 123         type=int, | 
|  | 124         default=DEFAULT_INSERT_LENGTH, | 
|  | 125         help="length of insert, must be longer than read length") | 
|  | 126     parser.add_argument("-l", | 
|  | 127                         "--read_length", | 
|  | 128                         type=int, | 
|  | 129                         default=DEFAULT_READ_LENGTH, | 
|  | 130                         help="read length") | 
|  | 131 | 
|  | 132     args = parser.parse_args() | 
|  | 133     if len(sys.argv) == 1: | 
|  | 134         parser.print_help() | 
|  | 135         sys.exit(1) | 
|  | 136 | 
|  | 137     #hassert args.insert_length > args.read_length, "read length must be shorter than insert length" | 
|  | 138     return args | 
|  | 139 | 
|  | 140 | 
|  | 141 def main(): | 
|  | 142     '''Sample short reads from long sequences | 
|  | 143     Files path are passed as command line positional arguments | 
|  | 144                         ''' | 
|  | 145     args = get_args() | 
|  | 146     long2short(args) | 
|  | 147 | 
|  | 148 | 
|  | 149 if __name__ == "__main__": | 
|  | 150     main() |