diff UMI_riboseq_processing/UMI.py @ 9:31438c26afec draft

Uploaded
author triasteran
date Tue, 21 Jun 2022 14:41:24 +0000
parents 701804f5ad4b
children
line wrap: on
line diff
--- a/UMI_riboseq_processing/UMI.py	Tue Jun 21 13:22:08 2022 +0000
+++ b/UMI_riboseq_processing/UMI.py	Tue Jun 21 14:41:24 2022 +0000
@@ -6,6 +6,8 @@
 from itertools import zip_longest
 import subprocess
 from subprocess import call
+import Bio
+from Bio import SeqIO
 
 def grouper(iterable, n, fillvalue=None):
     args = [iter(iterable)] * n
@@ -17,42 +19,53 @@
         return test_f.read(2) == b'\x1f\x8b'
 
 
-def lines_parse(f, output_path): 
+def UMI_processing(pathToFastaFile, output_path):
+    
     output = open(output_path,"w")
-    for lines in grouper(f, 4, "\n"):
-        header = lines[0]
-        #print (header)
-        seq = lines[1]
-        sep = lines[2]
-        qual = lines[3]
-        # check if  header is OK
-        if (header.startswith('@')):
-            trimmed_seq = seq[2:-6]+"\n" # fooprint + barcode
-            UMI = seq[0:2]+seq.rstrip()[-5:] #7nt in total; 5'NN and last 3'NNNNN
+    
+    if is_gz_file(pathToFastaFile) == True: 
+        print ('file is gzipped fastq')
+        with gzip.open(pathToFastaFile, "rt") as handle:
+            for i, record in enumerate(SeqIO.parse(handle, "fastq")):
+                lines = record.format('fastq').split('\n') # all 4 lines 
+                header = lines[0]
+                if i % 100000 == 0:
+                    print ('read number %s' % i)
+                seq = lines[1]
+                sep = lines[2]
+                qual = lines[3]
+                if (header.startswith('@')):
+                    trimmed_seq = seq[2:-6] # fooprint + barcode
+                    UMI = seq[0:2]+seq.rstrip()[-5:len(seq)] #7nt in total; 5'NN and last 3'NNNNN
+                    split_header = header.split(" ")
+                    new_header = split_header[0]+"_"+UMI+" "+split_header[1]
+                    new_qual = qual[2:-6]
+                    output.write(new_header+'\n')
+                    output.write(trimmed_seq+'\n')
+                    output.write(sep+'\n')
+                    output.write(new_qual+'\n')   
+                    
+    else: 
+        for record in SeqIO.parse(pathToFastaFile, 'fastq'):
+            lines = record.format('fastq').split('\n') # list of each record: id, seq, '+', quality 
+            header = lines[0]
+            seq = lines[1]
+            sep = lines[2]
+            qual = lines[3]
+            trimmed_seq = seq[2:-6] # fooprint + barcode
+            UMI = seq[0:2]+seq.rstrip()[-5:len(seq)] #7nt in total; 5'NN and last 3'NNNNN  
             split_header = header.split(" ")
             new_header = split_header[0]+"_"+UMI+" "+split_header[1]
-            if qual[-1:] == "\n":
-                new_qual = qual[2:-6]+"\n"
-            else:
-                new_qual = qual[2:-6]
-            output.write(new_header)
-            output.write(trimmed_seq)
-            output.write(sep)
-            output.write(new_qual)
+            new_qual = qual[2:-6]
+            output.write(new_header+'\n')
+            output.write(trimmed_seq+'\n') 
+            output.write(sep+'\n') 
+            output.write(new_qual+'\n')
+            
     output.close()
-
+            
 
-def UMI_processing(pathToFastaFile, output_path):
     
-    if is_gz_file(pathToFastaFile) == True: 
-        with gzip.open(pathToFastaFile, 'rb') as file:
-            f = [x.decode("utf-8") for x in file.readlines()]
-            
-    else: 
-        with open(pathToFastaFile, 'r') as file:
-            f = file.readlines()
-
-    lines_parse(f, output_path)
 
 def main():
     if len(argv) != 3: