# HG changeset patch # User jackcurragh # Date 1658751773 0 # Node ID 393b73b2eb5fe28bfbc33c4a375774cb709e4d01 Uploaded diff -r 000000000000 -r 393b73b2eb5f UMI_riboseq_tool/UMI.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/UMI_riboseq_tool/UMI.py Mon Jul 25 12:22:53 2022 +0000 @@ -0,0 +1,77 @@ +import gzip +from sys import argv, exit +from itertools import zip_longest +from Bio import SeqIO + +def grouper(iterable, n, fillvalue=None): + args = [iter(iterable)] * n + return zip_longest(*args, fillvalue=fillvalue) + + +def is_gz_file(filepath): + with open(filepath, 'rb') as test_f: + return test_f.read(2) == b'\x1f\x8b' + + +def process_fastq_record(record, output, UMI_5_prime_length=2, UMI_3_prime_length=5): + ''' + Write UMI to FASTQ header for given biopython record to given open output + UMI_5_prime_length number of bases to trim from 5' and write to header + UMI_3_prime_length number of bases to trim from 3' and write to header + defaults are for McGlincy Ingolia Protocol + ''' + lines = record.format('fastq').split('\n') # all 4 lines + header = lines[0] + seq = lines[1] + sep = lines[2] + qual = lines[3] + + if (header.startswith('@')): + trimmed_seq = seq[UMI_5_prime_length:-(UMI_3_prime_length + 1)] # fooprint + barcode + UMI = seq[0:UMI_5_prime_length]+seq.rstrip()[-UMI_3_prime_length:len(seq)] #7nt in total; 5'NN and last 3'NNNNN + split_header = header.split(" ") + new_header = split_header[0]+"_"+UMI+" "+split_header[1] + new_qual = qual[UMI_5_prime_length:-(UMI_3_prime_length + 1)] + output.write(new_header+'\n') + output.write(trimmed_seq+'\n') + output.write(sep+'\n') + output.write(new_qual+'\n') + + + +def UMI_processing(pathToFastaFile, output_path, bool_gzip, UMI_5_prime_length, UMI_3_prime_length): + + if bool_gzip: + output = gzip.open(output_path,"wt") + else: + output = open(output_path, 'w') + + if is_gz_file(pathToFastaFile) == True: + print ('file is gzipped fastq') + with gzip.open(pathToFastaFile, "rt") as handle: + for i, record in enumerate(SeqIO.parse(handle, "fastq")): + process_fastq_record(record, output, UMI_5_prime_length=2, UMI_3_prime_length=5) + + else: + for record in SeqIO.parse(pathToFastaFile, 'fastq'): + process_fastq_record(record, output, UMI_5_prime_length, UMI_3_prime_length) + + output.close() + + + + +def main(): + if len(argv) != 6: + exit("Usage: 3 arguments required\n1: Path to fasta file \n2: name of output file\n3: string 'True' or 'False' whether to gzip output") + + # Get paths + pathToFastaFile = argv[1] + output = argv[2] + bool_gzip = True if argv[3].lower().capitalize() == 'True' else False + UMI_5_prime_length = int(argv[4]) + UMI_3_prime_length = int(argv[5]) + UMI_processing(pathToFastaFile, output, bool_gzip, UMI_5_prime_length, UMI_3_prime_length) + +if __name__ == "__main__": + main() diff -r 000000000000 -r 393b73b2eb5f UMI_riboseq_tool/UMI_riboseq.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/UMI_riboseq_tool/UMI_riboseq.xml Mon Jul 25 12:22:53 2022 +0000 @@ -0,0 +1,36 @@ + + + biopython + + + + + + + + + + + + + + + + + + + + + + + + + + + + + @misc{FedorovaAD2022, author = {Fedorova Alla}, year = {2022}, title = {UMI_for_riboseq}, publisher = {GitHub}, journal = {GitHub repository}, url = {https://github.com/triasteran/RiboGalaxy_with_ansible/blob/main/toolshed_tools/UMI_riboseq_tool/UMI.py}, } + + +