ribogalaxy_umi_processing: UMI_riboseq_processing/UMI.py comparison

comparison UMI_riboseq_processing/UMI.py @ 9:31438c26afec draft

Uploaded

author	triasteran
date	Tue, 21 Jun 2022 14:41:24 +0000
parents	701804f5ad4b
children

comparison

Legend: equal | deleted | inserted | replaced

-:701804f5ad4b
+:31438c26afec
 from sys import argv, exit
 import itertools
 from itertools import zip_longest
 import subprocess
 from subprocess import call
+import Bio
+from Bio import SeqIO
 def grouper(iterable, n, fillvalue=None):
 args = [iter(iterable)] * n
 return zip_longest(*args, fillvalue=fillvalue)
 def is_gz_file(filepath):
 with open(filepath, 'rb') as test_f:
 return test_f.read(2) == b'\x1f\x8b'
-def lines_parse(f, output_path):
+def UMI_processing(pathToFastaFile, output_path):
 output = open(output_path,"w")
-for lines in grouper(f, 4, "\n"):
-header = lines[0]
+if is_gz_file(pathToFastaFile) == True:
-#print (header)
+print ('file is gzipped fastq')
-seq = lines[1]
+with gzip.open(pathToFastaFile, "rt") as handle:
-sep = lines[2]
+for i, record in enumerate(SeqIO.parse(handle, "fastq")):
-qual = lines[3]
+lines = record.format('fastq').split('\n') # all 4 lines
-# check if  header is OK
+header = lines[0]
-if (header.startswith('@')):
+if i % 100000 == 0:
-trimmed_seq = seq[2:-6]+"\n" # fooprint + barcode
+print ('read number %s' % i)
-UMI = seq[0:2]+seq.rstrip()[-5:] #7nt in total; 5'NN and last 3'NNNNN
+seq = lines[1]
+sep = lines[2]
+qual = lines[3]
+if (header.startswith('@')):
+trimmed_seq = seq[2:-6] # fooprint + barcode
+UMI = seq[0:2]+seq.rstrip()[-5:len(seq)] #7nt in total; 5'NN and last 3'NNNNN
+split_header = header.split(" ")
+new_header = split_header[0]+"_"+UMI+" "+split_header[1]
+new_qual = qual[2:-6]
+output.write(new_header+'\n')
+output.write(trimmed_seq+'\n')
+output.write(sep+'\n')
+output.write(new_qual+'\n')
+else:
+for record in SeqIO.parse(pathToFastaFile, 'fastq'):
+lines = record.format('fastq').split('\n') # list of each record: id, seq, '+', quality
+header = lines[0]
+seq = lines[1]
+sep = lines[2]
+qual = lines[3]
+trimmed_seq = seq[2:-6] # fooprint + barcode
+UMI = seq[0:2]+seq.rstrip()[-5:len(seq)] #7nt in total; 5'NN and last 3'NNNNN
 split_header = header.split(" ")
 new_header = split_header[0]+"_"+UMI+" "+split_header[1]
-if qual[-1:] == "\n":
+new_qual = qual[2:-6]
-new_qual = qual[2:-6]+"\n"
+output.write(new_header+'\n')
-else:
+output.write(trimmed_seq+'\n')
-new_qual = qual[2:-6]
+output.write(sep+'\n')
-output.write(new_header)
+output.write(new_qual+'\n')
-output.write(trimmed_seq)
-output.write(sep)
-output.write(new_qual)
 output.close()
-def UMI_processing(pathToFastaFile, output_path):
-if is_gz_file(pathToFastaFile) == True:
-with gzip.open(pathToFastaFile, 'rb') as file:
-f = [x.decode("utf-8") for x in file.readlines()]
-else:
-with open(pathToFastaFile, 'r') as file:
-f = file.readlines()
-lines_parse(f, output_path)
 def main():
 if len(argv) != 3:
 exit("Usage: 2 arguments required\n1: Path to fasta file \n2: name of output file")

Mercurial > repos > triasteran > ribogalaxy_umi_processing

comparison UMI_riboseq_processing/UMI.py @ 9:31438c26afec draft