Mercurial > repos > jackcurragh > ribogalaxy_umi_extraction

--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/UMI_riboseq_tool/UMI.py	Mon Jul 25 12:22:53 2022 +0000
@@ -0,0 +1,77 @@
+import gzip
+from sys import argv, exit
+from itertools import zip_longest
+from Bio import SeqIO
+
+def grouper(iterable, n, fillvalue=None):
+    args = [iter(iterable)] * n
+    return zip_longest(*args, fillvalue=fillvalue)
+
+
+def is_gz_file(filepath):
+    with open(filepath, 'rb') as test_f:
+        return test_f.read(2) == b'\x1f\x8b'
+
+
+def process_fastq_record(record, output, UMI_5_prime_length=2, UMI_3_prime_length=5):
+    '''
+    Write UMI to FASTQ header for given biopython record to given open output
+     UMI_5_prime_length number of bases to trim from 5' and write to header
+     UMI_3_prime_length number of bases to trim from 3' and write to header
+     defaults are for McGlincy Ingolia Protocol
+    '''
+    lines = record.format('fastq').split('\n') # all 4 lines
+    header = lines[0]
+    seq = lines[1]
+    sep = lines[2]
+    qual = lines[3]
+
+    if (header.startswith('@')):
+        trimmed_seq = seq[UMI_5_prime_length:-(UMI_3_prime_length + 1)] # fooprint + barcode
+        UMI = seq[0:UMI_5_prime_length]+seq.rstrip()[-UMI_3_prime_length:len(seq)] #7nt in total; 5'NN and last 3'NNNNN
+        split_header = header.split(" ")
+        new_header = split_header[0]+"_"+UMI+" "+split_header[1]
+        new_qual = qual[UMI_5_prime_length:-(UMI_3_prime_length + 1)]
+        output.write(new_header+'\n')
+        output.write(trimmed_seq+'\n')
+        output.write(sep+'\n')
+        output.write(new_qual+'\n')
+
+
+
+def UMI_processing(pathToFastaFile, output_path, bool_gzip, UMI_5_prime_length, UMI_3_prime_length):
+
+    if bool_gzip:
+        output = gzip.open(output_path,"wt")
+    else:
+        output = open(output_path, 'w')
+
+    if is_gz_file(pathToFastaFile) == True:
+        print ('file is gzipped fastq')
+        with gzip.open(pathToFastaFile, "rt") as handle:
+            for i, record in enumerate(SeqIO.parse(handle, "fastq")):
+                process_fastq_record(record, output, UMI_5_prime_length=2, UMI_3_prime_length=5)
+
+    else:
+        for record in SeqIO.parse(pathToFastaFile, 'fastq'):
+            process_fastq_record(record, output, UMI_5_prime_length, UMI_3_prime_length)
+
+    output.close()
+
+
+
+
+def main():
+    if len(argv) != 6:
+        exit("Usage: 3 arguments required\n1: Path to fasta file \n2: name of output file\n3: string 'True' or 'False' whether to gzip output")
+
+    # Get paths
+    pathToFastaFile = argv[1]
+    output = argv[2]
+    bool_gzip = True if argv[3].lower().capitalize() == 'True' else False
+    UMI_5_prime_length = int(argv[4])
+    UMI_3_prime_length = int(argv[5])
+    UMI_processing(pathToFastaFile, output, bool_gzip, UMI_5_prime_length, UMI_3_prime_length)
+
+if __name__ == "__main__":
+    main()
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/UMI_riboseq_tool/UMI_riboseq.xml	Mon Jul 25 12:22:53 2022 +0000
@@ -0,0 +1,36 @@
+<tool id="UMI_riboseq" name="UMIs to Header" version="0.1.8">
+<requirements>
+    <requirement type="package" version="1.75">biopython</requirement>
+</requirements>
+<command detect_errors="exit_code">
+<![CDATA[ python3 '$__tool_directory__/UMI.py' $reads $output $gzip $five_prime_UMI_length $three_prime_UMI_length]]>
+</command>
+<inputs>
+    <param format="fastqsanger,fastqsanger.gz" name="reads" type="data" label="fastqsanger,fastqsanger.gz"/>
+    <param name="gzip" type="boolean" truevalue="true" falsevalue="false" checked="True" label="Gzip the outputted FASTQ" />
+    <param name="five_prime_UMI_length" type="integer" value=2 label="Number of UMI bases at the 5' end" help="2 for McGlincy Ingolia Protocol" />
+    <param name="three_prime_UMI_length" type="integer" value=5 label="Number of UMI bases at the 3' end)" help="5 for McGlincy Ingolia Protocol" />
+
+</inputs>
+<outputs>
+    <data format="fastqsanger" name="output"/>
+</outputs>
+<tests>
+    <test>
+        <param name="reads" value="sub_10k_reads.fq.gz"/>
+        <output name="output" file="output"/>
+    </test>
+    <test>
+        <param name="reads" value="sub_10k_reads2.fq"/>
+        <output name="output" file="output2"/>
+    </test>
+</tests>
+<help>
+<![CDATA[ fastq/fastq.gz are input files with reads containing UMIs (already demultiplexed and adapters are removed).
+          The output of the script is fastq/fastq.gz file where UMIs (7nt in total, 5'NN and 3'NNNNN preceding barcode consisting of 5nt) are in header of the read. It is designed for protocol: McGlincy NJ, Ingolia NT. Transcriptome-wide measurement of translation by ribosome profiling. Methods. 2017;126:112-129. doi:10.1016/j.ymeth.2017.05.028 ]]>
+</help>
+<citations>
+<citation type="bibtex"> @misc{FedorovaAD2022, author = {Fedorova Alla}, year = {2022}, title = {UMI_for_riboseq}, publisher = {GitHub}, journal = {GitHub repository}, url = {https://github.com/triasteran/RiboGalaxy_with_ansible/blob/main/toolshed_tools/UMI_riboseq_tool/UMI.py}, }
+</citation>
+</citations>
+</tool>