4
|
1 import gzip
|
|
2 from mimetypes import guess_type
|
|
3 from functools import partial
|
|
4 from Bio import SeqIO
|
0
|
5 from sys import argv, exit
|
|
6
|
4
|
7 def copy_UMI_to_header_and_output_trimmed_read(pathToFastaFile, output):
|
|
8 # find wheather its plain or gzipped fastq
|
|
9 encoding = guess_type(pathToFastaFile)[1] # uses file extension
|
|
10 _open = partial(gzip.open, mode='rt') if encoding == 'gzip' else open
|
|
11 # output file will be in gz format
|
|
12 output = gzip.open(output,"wt")
|
|
13 # open and parse
|
|
14 with _open(pathToFastaFile) as f:
|
|
15 for record in SeqIO.parse(f, 'fastq'):
|
|
16 lines = record.format('fastq').split('\n') # list of each record: id, seq, '+', quality
|
0
|
17 header = lines[0]
|
|
18 seq = lines[1]
|
|
19 sep = lines[2]
|
|
20 qual = lines[3]
|
3
|
21 trimmed_seq = seq[2:-6]+"\n" # fooprint + barcode
|
4
|
22 UMI = seq[0:2]+seq.rstrip()[-5:] #7nt in total; 5'NN and last 3'NNNNN
|
0
|
23 split_header = header.split(" ")
|
|
24 new_header = split_header[0]+"_"+UMI+" "+split_header[1]
|
|
25 if qual[-1:] == "\n":
|
3
|
26 new_qual = qual[2:-6]+"\n"
|
0
|
27 else:
|
3
|
28 new_qual = qual[2:-6]
|
4
|
29 output.write(new_header+'\n')
|
|
30 output.write(trimmed_seq)
|
|
31 output.write(sep+'\n')
|
|
32 output.write(new_qual+'\n')
|
0
|
33
|
4
|
34 output.close()
|
0
|
35
|
|
36 def main():
|
4
|
37 if len(argv) != 3:
|
0
|
38 exit("Usage: 2 arguments required\n1: Path to fasta file \n2: name of output file")
|
|
39
|
|
40 # Get paths
|
|
41 pathToFastaFile = argv[1]
|
|
42 output = argv[2]
|
4
|
43 copy_UMI_to_header_and_output_trimmed_read(pathToFastaFile, output)
|
0
|
44
|
|
45 if __name__ == "__main__":
|
|
46 main()
|