Mercurial > repos > jackcurragh > ribogalaxy_umi_extraction
changeset 0:393b73b2eb5f draft
Uploaded
author | jackcurragh |
---|---|
date | Mon, 25 Jul 2022 12:22:53 +0000 |
parents | |
children | 4605a1af1f3d |
files | UMI_riboseq_tool/UMI.py UMI_riboseq_tool/UMI_riboseq.xml |
diffstat | 2 files changed, 113 insertions(+), 0 deletions(-) [+] |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/UMI_riboseq_tool/UMI.py Mon Jul 25 12:22:53 2022 +0000 @@ -0,0 +1,77 @@ +import gzip +from sys import argv, exit +from itertools import zip_longest +from Bio import SeqIO + +def grouper(iterable, n, fillvalue=None): + args = [iter(iterable)] * n + return zip_longest(*args, fillvalue=fillvalue) + + +def is_gz_file(filepath): + with open(filepath, 'rb') as test_f: + return test_f.read(2) == b'\x1f\x8b' + + +def process_fastq_record(record, output, UMI_5_prime_length=2, UMI_3_prime_length=5): + ''' + Write UMI to FASTQ header for given biopython record to given open output + UMI_5_prime_length number of bases to trim from 5' and write to header + UMI_3_prime_length number of bases to trim from 3' and write to header + defaults are for McGlincy Ingolia Protocol + ''' + lines = record.format('fastq').split('\n') # all 4 lines + header = lines[0] + seq = lines[1] + sep = lines[2] + qual = lines[3] + + if (header.startswith('@')): + trimmed_seq = seq[UMI_5_prime_length:-(UMI_3_prime_length + 1)] # fooprint + barcode + UMI = seq[0:UMI_5_prime_length]+seq.rstrip()[-UMI_3_prime_length:len(seq)] #7nt in total; 5'NN and last 3'NNNNN + split_header = header.split(" ") + new_header = split_header[0]+"_"+UMI+" "+split_header[1] + new_qual = qual[UMI_5_prime_length:-(UMI_3_prime_length + 1)] + output.write(new_header+'\n') + output.write(trimmed_seq+'\n') + output.write(sep+'\n') + output.write(new_qual+'\n') + + + +def UMI_processing(pathToFastaFile, output_path, bool_gzip, UMI_5_prime_length, UMI_3_prime_length): + + if bool_gzip: + output = gzip.open(output_path,"wt") + else: + output = open(output_path, 'w') + + if is_gz_file(pathToFastaFile) == True: + print ('file is gzipped fastq') + with gzip.open(pathToFastaFile, "rt") as handle: + for i, record in enumerate(SeqIO.parse(handle, "fastq")): + process_fastq_record(record, output, UMI_5_prime_length=2, UMI_3_prime_length=5) + + else: + for record in SeqIO.parse(pathToFastaFile, 'fastq'): + process_fastq_record(record, output, UMI_5_prime_length, UMI_3_prime_length) + + output.close() + + + + +def main(): + if len(argv) != 6: + exit("Usage: 3 arguments required\n1: Path to fasta file \n2: name of output file\n3: string 'True' or 'False' whether to gzip output") + + # Get paths + pathToFastaFile = argv[1] + output = argv[2] + bool_gzip = True if argv[3].lower().capitalize() == 'True' else False + UMI_5_prime_length = int(argv[4]) + UMI_3_prime_length = int(argv[5]) + UMI_processing(pathToFastaFile, output, bool_gzip, UMI_5_prime_length, UMI_3_prime_length) + +if __name__ == "__main__": + main()
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/UMI_riboseq_tool/UMI_riboseq.xml Mon Jul 25 12:22:53 2022 +0000 @@ -0,0 +1,36 @@ +<tool id="UMI_riboseq" name="UMIs to Header" version="0.1.8"> +<requirements> + <requirement type="package" version="1.75">biopython</requirement> +</requirements> +<command detect_errors="exit_code"> +<![CDATA[ python3 '$__tool_directory__/UMI.py' $reads $output $gzip $five_prime_UMI_length $three_prime_UMI_length]]> +</command> +<inputs> + <param format="fastqsanger,fastqsanger.gz" name="reads" type="data" label="fastqsanger,fastqsanger.gz"/> + <param name="gzip" type="boolean" truevalue="true" falsevalue="false" checked="True" label="Gzip the outputted FASTQ" /> + <param name="five_prime_UMI_length" type="integer" value=2 label="Number of UMI bases at the 5' end" help="2 for McGlincy Ingolia Protocol" /> + <param name="three_prime_UMI_length" type="integer" value=5 label="Number of UMI bases at the 3' end)" help="5 for McGlincy Ingolia Protocol" /> + +</inputs> +<outputs> + <data format="fastqsanger" name="output"/> +</outputs> +<tests> + <test> + <param name="reads" value="sub_10k_reads.fq.gz"/> + <output name="output" file="output"/> + </test> + <test> + <param name="reads" value="sub_10k_reads2.fq"/> + <output name="output" file="output2"/> + </test> +</tests> +<help> +<![CDATA[ fastq/fastq.gz are input files with reads containing UMIs (already demultiplexed and adapters are removed). + The output of the script is fastq/fastq.gz file where UMIs (7nt in total, 5'NN and 3'NNNNN preceding barcode consisting of 5nt) are in header of the read. It is designed for protocol: McGlincy NJ, Ingolia NT. Transcriptome-wide measurement of translation by ribosome profiling. Methods. 2017;126:112-129. doi:10.1016/j.ymeth.2017.05.028 ]]> +</help> +<citations> +<citation type="bibtex"> @misc{FedorovaAD2022, author = {Fedorova Alla}, year = {2022}, title = {UMI_for_riboseq}, publisher = {GitHub}, journal = {GitHub repository}, url = {https://github.com/triasteran/RiboGalaxy_with_ansible/blob/main/toolshed_tools/UMI_riboseq_tool/UMI.py}, } +</citation> +</citations> +</tool>