diff UMI_riboseq_processing/UMI.py @ 8:701804f5ad4b draft

Uploaded
author triasteran
date Tue, 21 Jun 2022 13:22:08 +0000
parents be394fb47250
children 31438c26afec
line wrap: on
line diff
--- a/UMI_riboseq_processing/UMI.py	Tue Jun 21 09:50:49 2022 +0000
+++ b/UMI_riboseq_processing/UMI.py	Tue Jun 21 13:22:08 2022 +0000
@@ -4,48 +4,56 @@
 from sys import argv, exit
 import itertools
 from itertools import zip_longest
-
+import subprocess
+from subprocess import call
 
 def grouper(iterable, n, fillvalue=None):
     args = [iter(iterable)] * n
     return zip_longest(*args, fillvalue=fillvalue)
 
 
-chunk_size=4
+def is_gz_file(filepath):
+    with open(filepath, 'rb') as test_f:
+        return test_f.read(2) == b'\x1f\x8b'
+
 
-def copy_UMI_to_header_and_output_trimmed_read(pathToFastaFile, output):
-    # find wheather its plain or gzipped fastq
-    encoding = guess_type(pathToFastaFile)[1]  # uses file extension
-    _open = partial(gzip.open, mode='rt') if encoding == 'gzip' else open
-    # output file will be in gz format
-    output = gzip.open(output,"wt")
-    # open and parse
-    with _open(pathToFastaFile) as f:
-        for lines in grouper(f, chunk_size, ""):
-            #lines = record.format('fastq').split('\n') # list of each record: id, seq, '+', quality
-            header = lines[0]
-            seq = lines[1]
-            sep = lines[2]
-            qual = lines[3]
-            # check if  header is OK 
-            if (header.startswith('@')):
-                trimmed_seq = seq[2:-6]+"\n" # fooprint + barcode
-                UMI = seq[0:2]+seq.rstrip()[-5:] #7nt in total; 5'NN and last 3'NNNNN
-                split_header = header.split(" ")
-                print (split_header)
-                new_header = split_header[0]+"_"+UMI+" "+split_header[1]
-                if qual[-1:] == "\n":
-                    new_qual = qual[2:-6]+"\n"
-                else:
-                    new_qual = qual[2:-6]
-                output.write(new_header)
-                output.write(trimmed_seq)
-                output.write(sep)
-                output.write(new_qual)
-
+def lines_parse(f, output_path): 
+    output = open(output_path,"w")
+    for lines in grouper(f, 4, "\n"):
+        header = lines[0]
+        #print (header)
+        seq = lines[1]
+        sep = lines[2]
+        qual = lines[3]
+        # check if  header is OK
+        if (header.startswith('@')):
+            trimmed_seq = seq[2:-6]+"\n" # fooprint + barcode
+            UMI = seq[0:2]+seq.rstrip()[-5:] #7nt in total; 5'NN and last 3'NNNNN
+            split_header = header.split(" ")
+            new_header = split_header[0]+"_"+UMI+" "+split_header[1]
+            if qual[-1:] == "\n":
+                new_qual = qual[2:-6]+"\n"
+            else:
+                new_qual = qual[2:-6]
+            output.write(new_header)
+            output.write(trimmed_seq)
+            output.write(sep)
+            output.write(new_qual)
     output.close()
 
+
+def UMI_processing(pathToFastaFile, output_path):
     
+    if is_gz_file(pathToFastaFile) == True: 
+        with gzip.open(pathToFastaFile, 'rb') as file:
+            f = [x.decode("utf-8") for x in file.readlines()]
+            
+    else: 
+        with open(pathToFastaFile, 'r') as file:
+            f = file.readlines()
+
+    lines_parse(f, output_path)
+
 def main():
     if len(argv) != 3:
         exit("Usage: 2 arguments required\n1: Path to fasta file \n2: name of output file")
@@ -53,7 +61,7 @@
     # Get paths
     pathToFastaFile = argv[1]
     output = argv[2]
-    copy_UMI_to_header_and_output_trimmed_read(pathToFastaFile, output)
+    UMI_processing(pathToFastaFile, output)
 
 if __name__ == "__main__":
     main()