ribogalaxy_umi_processing: UMI_riboseq_processing/UMI.py comparison

comparison UMI_riboseq_processing/UMI.py @ 4:a580e700aac3 draft

Uploaded

author	triasteran
date	Tue, 21 Jun 2022 08:32:44 +0000
parents	d27375bc4a1c
children

comparison

equal deleted inserted replaced

-:d27375bc4a1c
+:a580e700aac3
-import itertools
+import gzip
+from mimetypes import guess_type
+from functools import partial
+from Bio import SeqIO
 from sys import argv, exit
-from itertools import zip_longest
-def grouper(iterable, n, fillvalue=None):
+def copy_UMI_to_header_and_output_trimmed_read(pathToFastaFile, output):
-args = [iter(iterable)] * n
+# find whether it's plain or gzipped fastq
-return zip_longest(*args, fillvalue=fillvalue)
+encoding = guess_type(pathToFastaFile)[1]  # uses file extension
+_open = partial(gzip.open, mode='rt') if encoding == 'gzip' else open
+# output file will be in gz format
-chunk_size=4
+output = gzip.open(output,"wt")
+# open and parse
+with _open(pathToFastaFile) as f:
-def trimandpaste(pathToFastaFile, output):
+for record in SeqIO.parse(f, 'fastq'):
-#filename = pathToFastaFile.split('/')[-1]
+lines = record.format('fastq').split('\n') # list of each record: id, seq, '+', quality
-output = open(output,"w")
-with open(pathToFastaFile) as f:
-for lines in grouper(f, chunk_size, ""): #for every chunk_sized chunk
 header = lines[0]
 seq = lines[1]
 sep = lines[2]
 qual = lines[3]
 trimmed_seq = seq[2:-6]+"\n" # footprint + barcode
 UMI = seq[0:2]+seq.rstrip()[-5:] #7nt in total; 5'NN and last 3'NNNNN
 split_header = header.split(" ")
 new_header = split_header[0]+"_"+UMI+" "+split_header[1]
 if qual[-1:] == "\n":
 new_qual = qual[2:-6]+"\n"
 else:
 new_qual = qual[2:-6]
-output.write(new_header)
+output.write(new_header+'\n')
 output.write(trimmed_seq)
-output.write(sep)
+output.write(sep+'\n')
-output.write(new_qual)
+output.write(new_qual+'\n')
 output.close()
 def main():
 if len(argv) != 3:
 exit("Usage: 2 arguments required\n1: Path to fasta file \n2: name of output file")
 # Get paths
 pathToFastaFile = argv[1]
 output = argv[2]
+copy_UMI_to_header_and_output_trimmed_read(pathToFastaFile, output)
-trimandpaste(pathToFastaFile, output)
 if __name__ == "__main__":
 main()

Mercurial > repos > triasteran > ribogalaxy_umi_processing

comparison UMI_riboseq_processing/UMI.py @ 4:a580e700aac3 draft