annotate UMI_riboseq_processing/UMI.py @ 9:31438c26afec draft

Uploaded
author triasteran
date Tue, 21 Jun 2022 14:41:24 +0000
parents 701804f5ad4b
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
6
1ce4b52212c4 Uploaded
triasteran
parents:
diff changeset
1 import gzip
1ce4b52212c4 Uploaded
triasteran
parents:
diff changeset
2 from mimetypes import guess_type
1ce4b52212c4 Uploaded
triasteran
parents:
diff changeset
3 from functools import partial
1ce4b52212c4 Uploaded
triasteran
parents:
diff changeset
4 from sys import argv, exit
1ce4b52212c4 Uploaded
triasteran
parents:
diff changeset
5 import itertools
1ce4b52212c4 Uploaded
triasteran
parents:
diff changeset
6 from itertools import zip_longest
8
701804f5ad4b Uploaded
triasteran
parents: 7
diff changeset
7 import subprocess
701804f5ad4b Uploaded
triasteran
parents: 7
diff changeset
8 from subprocess import call
9
31438c26afec Uploaded
triasteran
parents: 8
diff changeset
9 import Bio
31438c26afec Uploaded
triasteran
parents: 8
diff changeset
10 from Bio import SeqIO
6
1ce4b52212c4 Uploaded
triasteran
parents:
diff changeset
11
1ce4b52212c4 Uploaded
triasteran
parents:
diff changeset
12 def grouper(iterable, n, fillvalue=None):
1ce4b52212c4 Uploaded
triasteran
parents:
diff changeset
13 args = [iter(iterable)] * n
1ce4b52212c4 Uploaded
triasteran
parents:
diff changeset
14 return zip_longest(*args, fillvalue=fillvalue)
1ce4b52212c4 Uploaded
triasteran
parents:
diff changeset
15
1ce4b52212c4 Uploaded
triasteran
parents:
diff changeset
16
8
701804f5ad4b Uploaded
triasteran
parents: 7
diff changeset
17 def is_gz_file(filepath):
701804f5ad4b Uploaded
triasteran
parents: 7
diff changeset
18 with open(filepath, 'rb') as test_f:
701804f5ad4b Uploaded
triasteran
parents: 7
diff changeset
19 return test_f.read(2) == b'\x1f\x8b'
701804f5ad4b Uploaded
triasteran
parents: 7
diff changeset
20
6
1ce4b52212c4 Uploaded
triasteran
parents:
diff changeset
21
9
31438c26afec Uploaded
triasteran
parents: 8
diff changeset
22 def UMI_processing(pathToFastaFile, output_path):
31438c26afec Uploaded
triasteran
parents: 8
diff changeset
23
8
701804f5ad4b Uploaded
triasteran
parents: 7
diff changeset
24 output = open(output_path,"w")
9
31438c26afec Uploaded
triasteran
parents: 8
diff changeset
25
31438c26afec Uploaded
triasteran
parents: 8
diff changeset
26 if is_gz_file(pathToFastaFile) == True:
31438c26afec Uploaded
triasteran
parents: 8
diff changeset
27 print ('file is gzipped fastq')
31438c26afec Uploaded
triasteran
parents: 8
diff changeset
28 with gzip.open(pathToFastaFile, "rt") as handle:
31438c26afec Uploaded
triasteran
parents: 8
diff changeset
29 for i, record in enumerate(SeqIO.parse(handle, "fastq")):
31438c26afec Uploaded
triasteran
parents: 8
diff changeset
30 lines = record.format('fastq').split('\n') # all 4 lines
31438c26afec Uploaded
triasteran
parents: 8
diff changeset
31 header = lines[0]
31438c26afec Uploaded
triasteran
parents: 8
diff changeset
32 if i % 100000 == 0:
31438c26afec Uploaded
triasteran
parents: 8
diff changeset
33 print ('read number %s' % i)
31438c26afec Uploaded
triasteran
parents: 8
diff changeset
34 seq = lines[1]
31438c26afec Uploaded
triasteran
parents: 8
diff changeset
35 sep = lines[2]
31438c26afec Uploaded
triasteran
parents: 8
diff changeset
36 qual = lines[3]
31438c26afec Uploaded
triasteran
parents: 8
diff changeset
37 if (header.startswith('@')):
31438c26afec Uploaded
triasteran
parents: 8
diff changeset
38 trimmed_seq = seq[2:-6] # fooprint + barcode
31438c26afec Uploaded
triasteran
parents: 8
diff changeset
39 UMI = seq[0:2]+seq.rstrip()[-5:len(seq)] #7nt in total; 5'NN and last 3'NNNNN
31438c26afec Uploaded
triasteran
parents: 8
diff changeset
40 split_header = header.split(" ")
31438c26afec Uploaded
triasteran
parents: 8
diff changeset
41 new_header = split_header[0]+"_"+UMI+" "+split_header[1]
31438c26afec Uploaded
triasteran
parents: 8
diff changeset
42 new_qual = qual[2:-6]
31438c26afec Uploaded
triasteran
parents: 8
diff changeset
43 output.write(new_header+'\n')
31438c26afec Uploaded
triasteran
parents: 8
diff changeset
44 output.write(trimmed_seq+'\n')
31438c26afec Uploaded
triasteran
parents: 8
diff changeset
45 output.write(sep+'\n')
31438c26afec Uploaded
triasteran
parents: 8
diff changeset
46 output.write(new_qual+'\n')
31438c26afec Uploaded
triasteran
parents: 8
diff changeset
47
31438c26afec Uploaded
triasteran
parents: 8
diff changeset
48 else:
31438c26afec Uploaded
triasteran
parents: 8
diff changeset
49 for record in SeqIO.parse(pathToFastaFile, 'fastq'):
31438c26afec Uploaded
triasteran
parents: 8
diff changeset
50 lines = record.format('fastq').split('\n') # list of each record: id, seq, '+', quality
31438c26afec Uploaded
triasteran
parents: 8
diff changeset
51 header = lines[0]
31438c26afec Uploaded
triasteran
parents: 8
diff changeset
52 seq = lines[1]
31438c26afec Uploaded
triasteran
parents: 8
diff changeset
53 sep = lines[2]
31438c26afec Uploaded
triasteran
parents: 8
diff changeset
54 qual = lines[3]
31438c26afec Uploaded
triasteran
parents: 8
diff changeset
55 trimmed_seq = seq[2:-6] # fooprint + barcode
31438c26afec Uploaded
triasteran
parents: 8
diff changeset
56 UMI = seq[0:2]+seq.rstrip()[-5:len(seq)] #7nt in total; 5'NN and last 3'NNNNN
8
701804f5ad4b Uploaded
triasteran
parents: 7
diff changeset
57 split_header = header.split(" ")
701804f5ad4b Uploaded
triasteran
parents: 7
diff changeset
58 new_header = split_header[0]+"_"+UMI+" "+split_header[1]
9
31438c26afec Uploaded
triasteran
parents: 8
diff changeset
59 new_qual = qual[2:-6]
31438c26afec Uploaded
triasteran
parents: 8
diff changeset
60 output.write(new_header+'\n')
31438c26afec Uploaded
triasteran
parents: 8
diff changeset
61 output.write(trimmed_seq+'\n')
31438c26afec Uploaded
triasteran
parents: 8
diff changeset
62 output.write(sep+'\n')
31438c26afec Uploaded
triasteran
parents: 8
diff changeset
63 output.write(new_qual+'\n')
31438c26afec Uploaded
triasteran
parents: 8
diff changeset
64
6
1ce4b52212c4 Uploaded
triasteran
parents:
diff changeset
65 output.close()
9
31438c26afec Uploaded
triasteran
parents: 8
diff changeset
66
8
701804f5ad4b Uploaded
triasteran
parents: 7
diff changeset
67
6
1ce4b52212c4 Uploaded
triasteran
parents:
diff changeset
68
8
701804f5ad4b Uploaded
triasteran
parents: 7
diff changeset
69
6
1ce4b52212c4 Uploaded
triasteran
parents:
diff changeset
70 def main():
1ce4b52212c4 Uploaded
triasteran
parents:
diff changeset
71 if len(argv) != 3:
1ce4b52212c4 Uploaded
triasteran
parents:
diff changeset
72 exit("Usage: 2 arguments required\n1: Path to fasta file \n2: name of output file")
1ce4b52212c4 Uploaded
triasteran
parents:
diff changeset
73
1ce4b52212c4 Uploaded
triasteran
parents:
diff changeset
74 # Get paths
1ce4b52212c4 Uploaded
triasteran
parents:
diff changeset
75 pathToFastaFile = argv[1]
1ce4b52212c4 Uploaded
triasteran
parents:
diff changeset
76 output = argv[2]
8
701804f5ad4b Uploaded
triasteran
parents: 7
diff changeset
77 UMI_processing(pathToFastaFile, output)
6
1ce4b52212c4 Uploaded
triasteran
parents:
diff changeset
78
1ce4b52212c4 Uploaded
triasteran
parents:
diff changeset
79 if __name__ == "__main__":
1ce4b52212c4 Uploaded
triasteran
parents:
diff changeset
80 main()