annotate UMI_riboseq_tool/UMI.py @ 1:4605a1af1f3d draft default tip

Uploaded
author triasteran
date Fri, 30 Sep 2022 09:07:04 +0000
parents 393b73b2eb5f
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
393b73b2eb5f Uploaded
jackcurragh
parents:
diff changeset
1 import gzip
393b73b2eb5f Uploaded
jackcurragh
parents:
diff changeset
2 from sys import argv, exit
393b73b2eb5f Uploaded
jackcurragh
parents:
diff changeset
3 from itertools import zip_longest
393b73b2eb5f Uploaded
jackcurragh
parents:
diff changeset
4 from Bio import SeqIO
393b73b2eb5f Uploaded
jackcurragh
parents:
diff changeset
5
393b73b2eb5f Uploaded
jackcurragh
parents:
diff changeset
6 def grouper(iterable, n, fillvalue=None):
393b73b2eb5f Uploaded
jackcurragh
parents:
diff changeset
7 args = [iter(iterable)] * n
393b73b2eb5f Uploaded
jackcurragh
parents:
diff changeset
8 return zip_longest(*args, fillvalue=fillvalue)
393b73b2eb5f Uploaded
jackcurragh
parents:
diff changeset
9
393b73b2eb5f Uploaded
jackcurragh
parents:
diff changeset
10
393b73b2eb5f Uploaded
jackcurragh
parents:
diff changeset
11 def is_gz_file(filepath):
393b73b2eb5f Uploaded
jackcurragh
parents:
diff changeset
12 with open(filepath, 'rb') as test_f:
393b73b2eb5f Uploaded
jackcurragh
parents:
diff changeset
13 return test_f.read(2) == b'\x1f\x8b'
393b73b2eb5f Uploaded
jackcurragh
parents:
diff changeset
14
393b73b2eb5f Uploaded
jackcurragh
parents:
diff changeset
15
393b73b2eb5f Uploaded
jackcurragh
parents:
diff changeset
16 def process_fastq_record(record, output, UMI_5_prime_length=2, UMI_3_prime_length=5):
393b73b2eb5f Uploaded
jackcurragh
parents:
diff changeset
17 '''
393b73b2eb5f Uploaded
jackcurragh
parents:
diff changeset
18 Write UMI to FASTQ header for given biopython record to given open output
393b73b2eb5f Uploaded
jackcurragh
parents:
diff changeset
19 UMI_5_prime_length number of bases to trim from 5' and write to header
393b73b2eb5f Uploaded
jackcurragh
parents:
diff changeset
20 UMI_3_prime_length number of bases to trim from 3' and write to header
393b73b2eb5f Uploaded
jackcurragh
parents:
diff changeset
21 defaults are for McGlincy Ingolia Protocol
393b73b2eb5f Uploaded
jackcurragh
parents:
diff changeset
22 '''
393b73b2eb5f Uploaded
jackcurragh
parents:
diff changeset
23 lines = record.format('fastq').split('\n') # all 4 lines
393b73b2eb5f Uploaded
jackcurragh
parents:
diff changeset
24 header = lines[0]
393b73b2eb5f Uploaded
jackcurragh
parents:
diff changeset
25 seq = lines[1]
393b73b2eb5f Uploaded
jackcurragh
parents:
diff changeset
26 sep = lines[2]
393b73b2eb5f Uploaded
jackcurragh
parents:
diff changeset
27 qual = lines[3]
393b73b2eb5f Uploaded
jackcurragh
parents:
diff changeset
28
393b73b2eb5f Uploaded
jackcurragh
parents:
diff changeset
29 if (header.startswith('@')):
393b73b2eb5f Uploaded
jackcurragh
parents:
diff changeset
30 trimmed_seq = seq[UMI_5_prime_length:-(UMI_3_prime_length + 1)] # fooprint + barcode
393b73b2eb5f Uploaded
jackcurragh
parents:
diff changeset
31 UMI = seq[0:UMI_5_prime_length]+seq.rstrip()[-UMI_3_prime_length:len(seq)] #7nt in total; 5'NN and last 3'NNNNN
393b73b2eb5f Uploaded
jackcurragh
parents:
diff changeset
32 split_header = header.split(" ")
393b73b2eb5f Uploaded
jackcurragh
parents:
diff changeset
33 new_header = split_header[0]+"_"+UMI+" "+split_header[1]
393b73b2eb5f Uploaded
jackcurragh
parents:
diff changeset
34 new_qual = qual[UMI_5_prime_length:-(UMI_3_prime_length + 1)]
393b73b2eb5f Uploaded
jackcurragh
parents:
diff changeset
35 output.write(new_header+'\n')
393b73b2eb5f Uploaded
jackcurragh
parents:
diff changeset
36 output.write(trimmed_seq+'\n')
393b73b2eb5f Uploaded
jackcurragh
parents:
diff changeset
37 output.write(sep+'\n')
393b73b2eb5f Uploaded
jackcurragh
parents:
diff changeset
38 output.write(new_qual+'\n')
393b73b2eb5f Uploaded
jackcurragh
parents:
diff changeset
39
393b73b2eb5f Uploaded
jackcurragh
parents:
diff changeset
40
393b73b2eb5f Uploaded
jackcurragh
parents:
diff changeset
41
393b73b2eb5f Uploaded
jackcurragh
parents:
diff changeset
42 def UMI_processing(pathToFastaFile, output_path, bool_gzip, UMI_5_prime_length, UMI_3_prime_length):
393b73b2eb5f Uploaded
jackcurragh
parents:
diff changeset
43
393b73b2eb5f Uploaded
jackcurragh
parents:
diff changeset
44 if bool_gzip:
393b73b2eb5f Uploaded
jackcurragh
parents:
diff changeset
45 output = gzip.open(output_path,"wt")
393b73b2eb5f Uploaded
jackcurragh
parents:
diff changeset
46 else:
393b73b2eb5f Uploaded
jackcurragh
parents:
diff changeset
47 output = open(output_path, 'w')
393b73b2eb5f Uploaded
jackcurragh
parents:
diff changeset
48
393b73b2eb5f Uploaded
jackcurragh
parents:
diff changeset
49 if is_gz_file(pathToFastaFile) == True:
393b73b2eb5f Uploaded
jackcurragh
parents:
diff changeset
50 print ('file is gzipped fastq')
393b73b2eb5f Uploaded
jackcurragh
parents:
diff changeset
51 with gzip.open(pathToFastaFile, "rt") as handle:
393b73b2eb5f Uploaded
jackcurragh
parents:
diff changeset
52 for i, record in enumerate(SeqIO.parse(handle, "fastq")):
1
4605a1af1f3d Uploaded
triasteran
parents: 0
diff changeset
53 process_fastq_record(record, output, UMI_5_prime_length, UMI_3_prime_length)
0
393b73b2eb5f Uploaded
jackcurragh
parents:
diff changeset
54
393b73b2eb5f Uploaded
jackcurragh
parents:
diff changeset
55 else:
393b73b2eb5f Uploaded
jackcurragh
parents:
diff changeset
56 for record in SeqIO.parse(pathToFastaFile, 'fastq'):
393b73b2eb5f Uploaded
jackcurragh
parents:
diff changeset
57 process_fastq_record(record, output, UMI_5_prime_length, UMI_3_prime_length)
393b73b2eb5f Uploaded
jackcurragh
parents:
diff changeset
58
393b73b2eb5f Uploaded
jackcurragh
parents:
diff changeset
59 output.close()
393b73b2eb5f Uploaded
jackcurragh
parents:
diff changeset
60
393b73b2eb5f Uploaded
jackcurragh
parents:
diff changeset
61
393b73b2eb5f Uploaded
jackcurragh
parents:
diff changeset
62
393b73b2eb5f Uploaded
jackcurragh
parents:
diff changeset
63
393b73b2eb5f Uploaded
jackcurragh
parents:
diff changeset
64 def main():
393b73b2eb5f Uploaded
jackcurragh
parents:
diff changeset
65 if len(argv) != 6:
393b73b2eb5f Uploaded
jackcurragh
parents:
diff changeset
66 exit("Usage: 3 arguments required\n1: Path to fasta file \n2: name of output file\n3: string 'True' or 'False' whether to gzip output")
393b73b2eb5f Uploaded
jackcurragh
parents:
diff changeset
67
393b73b2eb5f Uploaded
jackcurragh
parents:
diff changeset
68 # Get paths
393b73b2eb5f Uploaded
jackcurragh
parents:
diff changeset
69 pathToFastaFile = argv[1]
393b73b2eb5f Uploaded
jackcurragh
parents:
diff changeset
70 output = argv[2]
393b73b2eb5f Uploaded
jackcurragh
parents:
diff changeset
71 bool_gzip = True if argv[3].lower().capitalize() == 'True' else False
393b73b2eb5f Uploaded
jackcurragh
parents:
diff changeset
72 UMI_5_prime_length = int(argv[4])
393b73b2eb5f Uploaded
jackcurragh
parents:
diff changeset
73 UMI_3_prime_length = int(argv[5])
393b73b2eb5f Uploaded
jackcurragh
parents:
diff changeset
74 UMI_processing(pathToFastaFile, output, bool_gzip, UMI_5_prime_length, UMI_3_prime_length)
393b73b2eb5f Uploaded
jackcurragh
parents:
diff changeset
75
393b73b2eb5f Uploaded
jackcurragh
parents:
diff changeset
76 if __name__ == "__main__":
393b73b2eb5f Uploaded
jackcurragh
parents:
diff changeset
77 main()