Mercurial > repos > triasteran > ribogalaxy_umi_processing
diff UMI_riboseq_processing/UMI.py @ 9:31438c26afec draft
Uploaded
author | triasteran |
---|---|
date | Tue, 21 Jun 2022 14:41:24 +0000 |
parents | 701804f5ad4b |
children |
line wrap: on
line diff
--- a/UMI_riboseq_processing/UMI.py Tue Jun 21 13:22:08 2022 +0000 +++ b/UMI_riboseq_processing/UMI.py Tue Jun 21 14:41:24 2022 +0000 @@ -6,6 +6,8 @@ from itertools import zip_longest import subprocess from subprocess import call +import Bio +from Bio import SeqIO def grouper(iterable, n, fillvalue=None): args = [iter(iterable)] * n @@ -17,42 +19,53 @@ return test_f.read(2) == b'\x1f\x8b' -def lines_parse(f, output_path): +def UMI_processing(pathToFastaFile, output_path): + output = open(output_path,"w") - for lines in grouper(f, 4, "\n"): - header = lines[0] - #print (header) - seq = lines[1] - sep = lines[2] - qual = lines[3] - # check if header is OK - if (header.startswith('@')): - trimmed_seq = seq[2:-6]+"\n" # fooprint + barcode - UMI = seq[0:2]+seq.rstrip()[-5:] #7nt in total; 5'NN and last 3'NNNNN + + if is_gz_file(pathToFastaFile) == True: + print ('file is gzipped fastq') + with gzip.open(pathToFastaFile, "rt") as handle: + for i, record in enumerate(SeqIO.parse(handle, "fastq")): + lines = record.format('fastq').split('\n') # all 4 lines + header = lines[0] + if i % 100000 == 0: + print ('read number %s' % i) + seq = lines[1] + sep = lines[2] + qual = lines[3] + if (header.startswith('@')): + trimmed_seq = seq[2:-6] # fooprint + barcode + UMI = seq[0:2]+seq.rstrip()[-5:len(seq)] #7nt in total; 5'NN and last 3'NNNNN + split_header = header.split(" ") + new_header = split_header[0]+"_"+UMI+" "+split_header[1] + new_qual = qual[2:-6] + output.write(new_header+'\n') + output.write(trimmed_seq+'\n') + output.write(sep+'\n') + output.write(new_qual+'\n') + + else: + for record in SeqIO.parse(pathToFastaFile, 'fastq'): + lines = record.format('fastq').split('\n') # list of each record: id, seq, '+', quality + header = lines[0] + seq = lines[1] + sep = lines[2] + qual = lines[3] + trimmed_seq = seq[2:-6] # fooprint + barcode + UMI = seq[0:2]+seq.rstrip()[-5:len(seq)] #7nt in total; 5'NN and last 3'NNNNN split_header = header.split(" ") new_header = split_header[0]+"_"+UMI+" "+split_header[1] - if qual[-1:] == "\n": - new_qual = qual[2:-6]+"\n" - else: - new_qual = qual[2:-6] - output.write(new_header) - output.write(trimmed_seq) - output.write(sep) - output.write(new_qual) + new_qual = qual[2:-6] + output.write(new_header+'\n') + output.write(trimmed_seq+'\n') + output.write(sep+'\n') + output.write(new_qual+'\n') + output.close() - + -def UMI_processing(pathToFastaFile, output_path): - if is_gz_file(pathToFastaFile) == True: - with gzip.open(pathToFastaFile, 'rb') as file: - f = [x.decode("utf-8") for x in file.readlines()] - - else: - with open(pathToFastaFile, 'r') as file: - f = file.readlines() - - lines_parse(f, output_path) def main(): if len(argv) != 3: