annotate UMI_riboseq_processing/UMI.py @ 4:a580e700aac3 draft

Uploaded
author triasteran
date Tue, 21 Jun 2022 08:32:44 +0000
parents d27375bc4a1c
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
4
a580e700aac3 Uploaded
triasteran
parents: 3
diff changeset
1 import gzip
a580e700aac3 Uploaded
triasteran
parents: 3
diff changeset
2 from mimetypes import guess_type
a580e700aac3 Uploaded
triasteran
parents: 3
diff changeset
3 from functools import partial
a580e700aac3 Uploaded
triasteran
parents: 3
diff changeset
4 from Bio import SeqIO
0
ef98c6fad2a2 Uploaded
triasteran
parents:
diff changeset
5 from sys import argv, exit
ef98c6fad2a2 Uploaded
triasteran
parents:
diff changeset
6
4
a580e700aac3 Uploaded
triasteran
parents: 3
diff changeset
def copy_UMI_to_header_and_output_trimmed_read(pathToFastaFile, output):
    """Move each read's UMI into its header and write the trimmed reads.

    The UMI is 7 nt long: the first 2 nt of the read followed by its last
    5 nt. It is appended to the read id as ``<id>_<UMI>``, and exactly those
    7 positions are removed from both the sequence and the quality string,
    so the two stay the same length.

    Reads shorter than the UMI are not special-cased: they are written with
    an empty (or overlapping) sequence/quality slice.

    Args:
        pathToFastaFile: Path to the input FASTQ file, plain or gzipped.
            Compression is guessed from the file extension only.
        output: Path of the output FASTQ file; it is always written
            gzip-compressed, whatever its extension.
    """
    umi_5p_len = 2  # 5' NN
    umi_3p_len = 5  # 3' NNNNN

    # Find whether the input is plain or gzipped FASTQ (uses file extension).
    encoding = guess_type(pathToFastaFile)[1]
    _open = partial(gzip.open, mode='rt') if encoding == 'gzip' else open

    # Both handles live in one `with` so the gzip stream is always finalised,
    # even if parsing fails half-way. The input is opened first so a bad
    # input path does not leave an empty output file behind.
    with _open(pathToFastaFile) as reads, gzip.open(output, "wt") as trimmed:
        for record in SeqIO.parse(reads, 'fastq'):
            # Four text lines per record: header, sequence, '+', quality.
            # Splitting on '\n' already strips the line endings.
            header, seq, sep, qual = record.format('fastq').split('\n')[:4]

            UMI = seq[:umi_5p_len] + seq[-umi_3p_len:]

            # Trim exactly the UMI positions. The previous [2:-6] slice was
            # written for lines that still carried a trailing newline and
            # silently dropped one real base from every read.
            trimmed_seq = seq[umi_5p_len:-umi_3p_len]
            new_qual = qual[umi_5p_len:-umi_3p_len]

            # A header may have no description (no space at all) or one that
            # itself contains spaces; keep whatever follows the read id.
            read_id, _, description = header.partition(" ")
            new_header = read_id + "_" + UMI
            if description:
                new_header += " " + description

            trimmed.write(
                "\n".join((new_header, trimmed_seq, sep, new_qual)) + "\n"
            )
0
ef98c6fad2a2 Uploaded
triasteran
parents:
diff changeset
35
ef98c6fad2a2 Uploaded
triasteran
parents:
diff changeset
def main():
    """Command-line entry point.

    Expects exactly two arguments after the script name: the input file path
    and the output file name. With any other argument count the script exits
    with a usage message; otherwise both paths are handed to
    ``copy_UMI_to_header_and_output_trimmed_read``.
    """
    # argv[0] is the script itself, so two user arguments give a length of 3.
    if len(argv) != 3:
        exit("Usage: 2 arguments required\n1: Path to fasta file \n2: name of output file")

    # Get paths
    source_path, destination_path = argv[1:]
    copy_UMI_to_header_and_output_trimmed_read(source_path, destination_path)


if __name__ == "__main__":
    main()