6
|
1 import gzip
|
|
2 from mimetypes import guess_type
|
|
3 from functools import partial
|
|
4 from sys import argv, exit
|
|
5 import itertools
|
|
6 from itertools import zip_longest
|
8
|
7 import subprocess
|
|
8 from subprocess import call
|
9
|
9 import Bio
|
|
10 from Bio import SeqIO
|
6
|
11
|
|
12 def grouper(iterable, n, fillvalue=None):
|
|
13 args = [iter(iterable)] * n
|
|
14 return zip_longest(*args, fillvalue=fillvalue)
|
|
15
|
|
16
|
8
|
17 def is_gz_file(filepath):
|
|
18 with open(filepath, 'rb') as test_f:
|
|
19 return test_f.read(2) == b'\x1f\x8b'
|
|
20
|
6
|
21
|
9
|
22 def UMI_processing(pathToFastaFile, output_path):
|
|
23
|
8
|
24 output = open(output_path,"w")
|
9
|
25
|
|
26 if is_gz_file(pathToFastaFile) == True:
|
|
27 print ('file is gzipped fastq')
|
|
28 with gzip.open(pathToFastaFile, "rt") as handle:
|
|
29 for i, record in enumerate(SeqIO.parse(handle, "fastq")):
|
|
30 lines = record.format('fastq').split('\n') # all 4 lines
|
|
31 header = lines[0]
|
|
32 if i % 100000 == 0:
|
|
33 print ('read number %s' % i)
|
|
34 seq = lines[1]
|
|
35 sep = lines[2]
|
|
36 qual = lines[3]
|
|
37 if (header.startswith('@')):
|
|
38 trimmed_seq = seq[2:-6] # fooprint + barcode
|
|
39 UMI = seq[0:2]+seq.rstrip()[-5:len(seq)] #7nt in total; 5'NN and last 3'NNNNN
|
|
40 split_header = header.split(" ")
|
|
41 new_header = split_header[0]+"_"+UMI+" "+split_header[1]
|
|
42 new_qual = qual[2:-6]
|
|
43 output.write(new_header+'\n')
|
|
44 output.write(trimmed_seq+'\n')
|
|
45 output.write(sep+'\n')
|
|
46 output.write(new_qual+'\n')
|
|
47
|
|
48 else:
|
|
49 for record in SeqIO.parse(pathToFastaFile, 'fastq'):
|
|
50 lines = record.format('fastq').split('\n') # list of each record: id, seq, '+', quality
|
|
51 header = lines[0]
|
|
52 seq = lines[1]
|
|
53 sep = lines[2]
|
|
54 qual = lines[3]
|
|
55 trimmed_seq = seq[2:-6] # fooprint + barcode
|
|
56 UMI = seq[0:2]+seq.rstrip()[-5:len(seq)] #7nt in total; 5'NN and last 3'NNNNN
|
8
|
57 split_header = header.split(" ")
|
|
58 new_header = split_header[0]+"_"+UMI+" "+split_header[1]
|
9
|
59 new_qual = qual[2:-6]
|
|
60 output.write(new_header+'\n')
|
|
61 output.write(trimmed_seq+'\n')
|
|
62 output.write(sep+'\n')
|
|
63 output.write(new_qual+'\n')
|
|
64
|
6
|
65 output.close()
|
9
|
66
|
8
|
67
|
6
|
68
|
8
|
69
|
6
|
70 def main():
|
|
71 if len(argv) != 3:
|
|
72 exit("Usage: 2 arguments required\n1: Path to fasta file \n2: name of output file")
|
|
73
|
|
74 # Get paths
|
|
75 pathToFastaFile = argv[1]
|
|
76 output = argv[2]
|
8
|
77 UMI_processing(pathToFastaFile, output)
|
6
|
78
|
|
79 if __name__ == "__main__":
|
|
80 main()
|