| 
5
 | 
     1 import gzip
 | 
| 
 | 
     2 from mimetypes import guess_type
 | 
| 
 | 
     3 from functools import partial
 | 
| 
 | 
     4 from sys import argv, exit
 | 
| 
 | 
     5 import itertools
 | 
| 
 | 
     6 from itertools import zip_longest
 | 
| 
 | 
     7 
 | 
| 
 | 
     8 
 | 
| 
 | 
     9 def grouper(iterable, n, fillvalue=None):
 | 
| 
 | 
    10     args = [iter(iterable)] * n
 | 
| 
 | 
    11     return zip_longest(*args, fillvalue=fillvalue)
 | 
| 
 | 
    12 
 | 
| 
 | 
    13 
 | 
| 
 | 
    14 chunk_size=4
 | 
| 
 | 
    15 
 | 
| 
 | 
    16 def copy_UMI_to_header_and_output_trimmed_read(pathToFastaFile, output):
 | 
| 
 | 
    17     # find wheather its plain or gzipped fastq
 | 
| 
 | 
    18     encoding = guess_type(pathToFastaFile)[1]  # uses file extension
 | 
| 
 | 
    19     _open = partial(gzip.open, mode='rt') if encoding == 'gzip' else open
 | 
| 
 | 
    20     # output file will be in gz format
 | 
| 
 | 
    21     output = gzip.open(output,"wt")
 | 
| 
 | 
    22     # open and parse
 | 
| 
 | 
    23     with _open(pathToFastaFile) as f:
 | 
| 
 | 
    24         for lines in grouper(f, chunk_size, ""):
 | 
| 
 | 
    25             #lines = record.format('fastq').split('\n') # list of each record: id, seq, '+', quality
 | 
| 
 | 
    26             header = lines[0]
 | 
| 
 | 
    27             seq = lines[1]
 | 
| 
 | 
    28             sep = lines[2]
 | 
| 
 | 
    29             qual = lines[3]
 | 
| 
 | 
    30             trimmed_seq = seq[2:-6]+"\n" # fooprint + barcode
 | 
| 
 | 
    31             UMI = seq[0:2]+seq.rstrip()[-5:] #7nt in total; 5'NN and last 3'NNNNN
 | 
| 
 | 
    32             split_header = header.split(" ")
 | 
| 
 | 
    33             new_header = split_header[0]+"_"+UMI+" "+split_header[1]
 | 
| 
 | 
    34             if qual[-1:] == "\n":
 | 
| 
 | 
    35                 new_qual = qual[2:-6]+"\n"
 | 
| 
 | 
    36             else:
 | 
| 
 | 
    37                 new_qual = qual[2:-6]
 | 
| 
 | 
    38             output.write(new_header)
 | 
| 
 | 
    39             output.write(trimmed_seq)
 | 
| 
 | 
    40             output.write(sep)
 | 
| 
 | 
    41             output.write(new_qual)
 | 
| 
 | 
    42 
 | 
| 
 | 
    43     output.close()
 | 
| 
 | 
    44 
 | 
| 
 | 
    45     
 | 
| 
 | 
    46 def main():
 | 
| 
 | 
    47     if len(argv) != 3:
 | 
| 
 | 
    48         exit("Usage: 2 arguments required\n1: Path to fasta file \n2: name of output file")
 | 
| 
 | 
    49 
 | 
| 
 | 
    50     # Get paths
 | 
| 
 | 
    51     pathToFastaFile = argv[1]
 | 
| 
 | 
    52     output = argv[2]
 | 
| 
 | 
    53     copy_UMI_to_header_and_output_trimmed_read(pathToFastaFile, output)
 | 
| 
 | 
    54 
 | 
| 
 | 
    55 if __name__ == "__main__":
 | 
| 
 | 
    56     main()
 |