| 
0
 | 
     1 #!/usr/bin/env python3
 | 
| 
 | 
     2 import argparse
 | 
| 
 | 
     3 import sys
 | 
| 
 | 
     4 from argparse import ArgumentDefaultsHelpFormatter
 | 
| 
 | 
     5 from collections import namedtuple
 | 
| 
 | 
     6 from collections import OrderedDict
 | 
| 
 | 
     7 from Bio import SeqIO
 | 
| 
 | 
     8 
 | 
| 
 | 
     9 SEQ_FORMAT = "fasta"
 | 
| 
 | 
    10 # Default sampling, used on argparse:
 | 
| 
 | 
    11 DEFAULT_READ_LENGTH = 200
 | 
| 
 | 
    12 DEFAULT_INSERT_LENGTH = 700
 | 
| 
 | 
    13 DEFAULT_COVERAGE = 0.1
 | 
| 
 | 
    14 
 | 
| 
 | 
    15 Sequences_summary = namedtuple('Fasta_summary',
 | 
| 
 | 
    16                                ['total_length', 'number_of_sequence',
 | 
| 
 | 
    17                                 'id_length', 'file_path', 'format'])
 | 
| 
 | 
    18 
 | 
| 
 | 
    19 Coordinates = namedtuple('Coordinates', "id start1 end1 start2 end2")
 | 
| 
 | 
    20 
 | 
| 
 | 
    21 
 | 
| 
 | 
    22 def get_sequences_summary(seq_file):
 | 
| 
 | 
    23     ''' return basic characteristic of sequences '''
 | 
| 
 | 
    24     id_length = OrderedDict()
 | 
| 
 | 
    25     totat_length = 0
 | 
| 
 | 
    26     N = 0
 | 
| 
 | 
    27     for seqs in SeqIO.parse(seq_file, SEQ_FORMAT):
 | 
| 
 | 
    28         id_length[seqs.id] = len(seqs)
 | 
| 
 | 
    29         totat_length += len(seqs)
 | 
| 
 | 
    30         N += 1
 | 
| 
 | 
    31     return Sequences_summary(totat_length, N, id_length, seq_file, SEQ_FORMAT)
 | 
| 
 | 
    32 
 | 
| 
 | 
    33 
 | 
| 
 | 
    34 def get_short_pseudoreads_position(fasta_summary, sampling_options):
 | 
| 
 | 
    35     """Return selected position on long read sequences
 | 
| 
 | 
    36     Arguments:
 | 
| 
 | 
    37     fasta_summary - namedtuple Fasta_summaty containing information about sequences
 | 
| 
 | 
    38     sampling options - namedtuple, specified how sequences should be sampled
 | 
| 
 | 
    39     Return value:
 | 
| 
 | 
    40     (sequence_id, start1, end1, start2, end2)
 | 
| 
 | 
    41     """
 | 
| 
 | 
    42     interval = int(2 * sampling_options.read_length /
 | 
| 
 | 
    43                    sampling_options.coverage)
 | 
| 
 | 
    44     for seqname, length in fasta_summary.id_length.items():
 | 
| 
 | 
    45         start_positions = range(1, length, interval)
 | 
| 
 | 
    46         for s in start_positions:
 | 
| 
 | 
    47             yield Coordinates(seqname, s, s + sampling_options.read_length,
 | 
| 
 | 
    48                               s + sampling_options.insert_length -
 | 
| 
 | 
    49                               sampling_options.read_length,
 | 
| 
 | 
    50                               s + sampling_options.insert_length)
 | 
| 
 | 
    51 
 | 
| 
 | 
    52 
 | 
| 
 | 
    53 def extract_short_reads(summary, args):
 | 
| 
 | 
    54     '''yield short reades sampled from long reads
 | 
| 
 | 
    55     Arguments:
 | 
| 
 | 
    56     summary.. named tuple specifie sequences properties, path, length, idslist
 | 
| 
 | 
    57     args ..... Define how short sequences should be generated
 | 
| 
 | 
    58     '''
 | 
| 
 | 
    59     pos = get_short_pseudoreads_position(summary, args)
 | 
| 
 | 
    60     coords = next(pos)
 | 
| 
 | 
    61     index = 0
 | 
| 
 | 
    62     for i in SeqIO.parse(summary.file_path, summary.format):
 | 
| 
 | 
    63         index += 1
 | 
| 
 | 
    64         while True:
 | 
| 
 | 
    65             if coords.id == i.id:
 | 
| 
 | 
    66                 # forward read
 | 
| 
 | 
    67                 subseq_f = i[coords.start1:coords.end1]
 | 
| 
 | 
    68                 subseq_f.id = "{}_{}_{}_f".format(index, coords.start1,
 | 
| 
 | 
    69                                                   coords.end1)
 | 
| 
 | 
    70                 subseq_f.description = ""
 | 
| 
 | 
    71                 # reverse complement read
 | 
| 
 | 
    72                 subseq_r = i[coords.start2:coords.end2].reverse_complement()
 | 
| 
 | 
    73                 subseq_r.id = "{}_{}_{}_r".format(index, coords.start1,
 | 
| 
 | 
    74                                                   coords.end1)
 | 
| 
 | 
    75                 subseq_r.description = ""
 | 
| 
 | 
    76                 # return only if sequences are long enough
 | 
| 
 | 
    77                 if len(subseq_r) == args.read_length:
 | 
| 
 | 
    78                     yield subseq_f
 | 
| 
 | 
    79                     yield subseq_r
 | 
| 
 | 
    80                 coords = next(pos)
 | 
| 
 | 
    81             else:
 | 
| 
 | 
    82                 break
 | 
| 
 | 
    83 
 | 
| 
 | 
    84 
 | 
| 
 | 
    85 def long2short(args):
 | 
| 
 | 
    86     '''Sample short reads from long sequences
 | 
| 
 | 
    87     args contain these attributes::
 | 
| 
 | 
    88     ------------
 | 
| 
 | 
    89     input_file   - path to file in fasta format
 | 
| 
 | 
    90     output_file  - path to output file, fasta format
 | 
| 
 | 
    91     options      - options is named tuple and specifies read length
 | 
| 
 | 
    92                    coverage, insert length, max number of sequences which will be return
 | 
| 
 | 
    93 
 | 
| 
 | 
    94     '''
 | 
| 
 | 
    95     summary = get_sequences_summary(args.input.name)
 | 
| 
 | 
    96     with open(args.output.name, 'w') as f:
 | 
| 
 | 
    97         for i in extract_short_reads(summary, args):
 | 
| 
 | 
    98             SeqIO.write(i, f, SEQ_FORMAT)
 | 
| 
 | 
    99 
 | 
| 
 | 
   100 
 | 
| 
 | 
   101 def get_args():
 | 
| 
 | 
   102     '''Parses command line arguments '''
 | 
| 
 | 
   103     description = "Creates pseudo short reads from long oxford nanopore reads"
 | 
| 
 | 
   104     parser = argparse.ArgumentParser(
 | 
| 
 | 
   105         description=description,
 | 
| 
 | 
   106         formatter_class=ArgumentDefaultsHelpFormatter)
 | 
| 
 | 
   107     parser.add_argument('-i',
 | 
| 
 | 
   108                         '--input',
 | 
| 
 | 
   109                         type=argparse.FileType('r'),
 | 
| 
 | 
   110                         help="file with long reads in fasta format")
 | 
| 
 | 
   111     parser.add_argument('-o',
 | 
| 
 | 
   112                         '--output',
 | 
| 
 | 
   113                         type=argparse.FileType('w'),
 | 
| 
 | 
   114                         help="Output file name")
 | 
| 
 | 
   115     parser.add_argument("-cov",
 | 
| 
 | 
   116                         "--coverage",
 | 
| 
 | 
   117                         type=float,
 | 
| 
 | 
   118                         default=DEFAULT_COVERAGE,
 | 
| 
 | 
   119                         help="samplig coverage")
 | 
| 
 | 
   120     parser.add_argument(
 | 
| 
 | 
   121         "-L",
 | 
| 
 | 
   122         "--insert_length",
 | 
| 
 | 
   123         type=int,
 | 
| 
 | 
   124         default=DEFAULT_INSERT_LENGTH,
 | 
| 
 | 
   125         help="length of insert, must be longer than read length")
 | 
| 
 | 
   126     parser.add_argument("-l",
 | 
| 
 | 
   127                         "--read_length",
 | 
| 
 | 
   128                         type=int,
 | 
| 
 | 
   129                         default=DEFAULT_READ_LENGTH,
 | 
| 
 | 
   130                         help="read length")
 | 
| 
 | 
   131 
 | 
| 
 | 
   132     args = parser.parse_args()
 | 
| 
 | 
   133     if len(sys.argv) == 1:
 | 
| 
 | 
   134         parser.print_help()
 | 
| 
 | 
   135         sys.exit(1)
 | 
| 
 | 
   136 
 | 
| 
 | 
   137     #hassert args.insert_length > args.read_length, "read length must be shorter than insert length"
 | 
| 
 | 
   138     return args
 | 
| 
 | 
   139 
 | 
| 
 | 
   140 
 | 
| 
 | 
   141 def main():
 | 
| 
 | 
   142     '''Sample short reads from long sequences
 | 
| 
 | 
   143     Files path are passed as command line positional arguments
 | 
| 
 | 
   144                         '''
 | 
| 
 | 
   145     args = get_args()
 | 
| 
 | 
   146     long2short(args)
 | 
| 
 | 
   147 
 | 
| 
 | 
   148 
 | 
| 
 | 
   149 if __name__ == "__main__":
 | 
| 
 | 
   150     main()
 |