# HG changeset patch # User artbio # Date 1501110908 14400 # Node ID 6b8adacd475094ef4dcdc90cdabcceb654284631 # Parent f59c643b00fc46cfe497d0ea62f17e8e99acc0ad planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/mircounts commit fa65a844f9041a83767f5305ab360abfdf68f59f diff -r f59c643b00fc -r 6b8adacd4750 mature_mir_gff_translation.py --- a/mature_mir_gff_translation.py Tue Jul 25 11:04:28 2017 -0400 +++ b/mature_mir_gff_translation.py Wed Jul 26 19:15:08 2017 -0400 @@ -1,6 +1,5 @@ #!/usr/bin/env python -import sys import argparse @@ -9,24 +8,27 @@ the_parser.add_argument( '--input', action="store", type=str, help="input miRBase GFF3 file") the_parser.add_argument( - '--output', action="store", type=str, help="output GFF3 file with converted mature mir coordinates") + '--output', action="store", type=str, + help="output GFF3 file with converted mature mir coordinates") args = the_parser.parse_args() return args -GFF3_header= '''##gff-version 3 + +GFF3_header = '''##gff-version 3 ##generated by mature_mir_gff_translation.py # # Chromosomal coordinates of microRNAs ** relative to the hairpin precursors ** # microRNAs: miRBase current_version # genome-build-id: check http://mirbase.org/ # -# Hairpin precursor sequences have type "miRNA_primary_transcript". -# Note, these sequences do not represent the full primary transcript, -# rather a predicted stem-loop portion that includes the precursor +# Hairpin precursor sequences have type "miRNA_primary_transcript". +# Note, these sequences do not represent the full primary transcript, +# rather a predicted stem-loop portion that includes the precursor # miRNA. Mature sequences have type "miRNA". # ''' + def load_gff_in_dict(gff_input_file): ''' Reads the gff3 file and return a dictionary of dictionaries @@ -35,26 +37,29 @@ ''' gff_dict = {} for line in open(gff_input_file, "r"): - if line[0]=="#": + if line[0] == "#": continue - gff_fields=line[:-1].split("\t") - ID=gff_fields[8].split("ID=")[1].split(";")[0] + gff_fields = line[:-1].split("\t") + ID = gff_fields[8].split("ID=")[1].split(";")[0] gff_dict[ID] = {} - gff_dict[ID]["seqid"]=gff_fields[0] - gff_dict[ID]["source"]=gff_fields[1] - gff_dict[ID]["type"]=gff_fields[2] - gff_dict[ID]["start"]=gff_fields[3] - gff_dict[ID]["end"]=gff_fields[4] - gff_dict[ID]["score"]=gff_fields[5] - gff_dict[ID]["strand"]=gff_fields[6] - gff_dict[ID]["phase"]=gff_fields[7] - gff_dict[ID]["attributes"]=gff_fields[8] + gff_dict[ID]["seqid"] = gff_fields[0] + gff_dict[ID]["source"] = gff_fields[1] + gff_dict[ID]["type"] = gff_fields[2] + gff_dict[ID]["start"] = gff_fields[3] + gff_dict[ID]["end"] = gff_fields[4] + gff_dict[ID]["score"] = gff_fields[5] + gff_dict[ID]["strand"] = gff_fields[6] + gff_dict[ID]["phase"] = gff_fields[7] + gff_dict[ID]["attributes"] = gff_fields[8] if "Derives_from" in gff_dict[ID]["attributes"]: - parent_primary_transcript=gff_dict[ID]["attributes"].split("Derives_from=")[1] - parent_primary_transcript=gff_dict[parent_primary_transcript]["attributes"].split("Name=")[1] - gff_dict[ID]["attributes"]="%s;Parent_mir_Name=%s" % (gff_dict[ID]["attributes"], parent_primary_transcript) + parent_primary_transcript = gff_dict[ID]["attributes"].split( + "Derives_from=")[1] + parent_primary_transcript = gff_dict[parent_primary_transcript][ + "attributes"].split("Name=")[1] + gff_dict[ID]["attributes"] = "%s;Parent_mir_Name=%s" % ( + gff_dict[ID]["attributes"], parent_primary_transcript) return gff_dict - + def genome_to_mir_gff(gff_dict, output): ''' @@ -63,36 +68,50 @@ Note that GFF files are 1-based coordinates ''' for key in gff_dict: - name=gff_dict[key]["attributes"].split("Name=")[1].split(";")[0] - gff_dict[key]["seqid"]=name + name = gff_dict[key]["attributes"].split("Name=")[1].split(";")[0] + gff_dict[key]["seqid"] = name if "Derives_from=" in gff_dict[key]["attributes"]: - parent_ID=gff_dict[key]["attributes"].split("Derives_from=")[1].split(";")[0] - gff_dict[key]["start"]=str(int(gff_dict[key]["start"]) - int(gff_dict[parent_ID]["start"]) + 1) - gff_dict[key]["end"]=str(int(gff_dict[key]["end"]) - int(gff_dict[parent_ID]["start"]) + 1) - hairpins={} - matures={} - for key in gff_dict: ## treats miRNA_primary_transcript coordinates in a second loop to avoid errors in conversion - if gff_dict[key]["type"]=="miRNA_primary_transcript": - gff_dict[key]["end"]=str(int(gff_dict[key]["end"]) - int(gff_dict[key]["start"]) + 1) - gff_dict[key]["start"]="1" + parent_ID = gff_dict[key]["attributes"].split( + "Derives_from=")[1].split(";")[0] + gff_dict[key]["start"] = str(int(gff_dict[key]["start"])-int( + gff_dict[parent_ID]["start"])+1) + gff_dict[key]["end"] = str(int(gff_dict[key]["end"])-int( + gff_dict[parent_ID]["start"])+1) + hairpins = {} + matures = {} + # treats miRNA_primary_transcript coordinates + # in a second loop to avoid errors in conversion + for key in gff_dict: + if gff_dict[key]["type"] == "miRNA_primary_transcript": + gff_dict[key]["end"] = str(int(gff_dict[key]["end"])-int( + gff_dict[key]["start"]) + 1) + gff_dict[key]["start"] = '1' # now, do a dict[ID]=Name but only for miRNA_primary_transcript - hairpins[key]=gff_dict[key]["attributes"].split("Name=")[1].split(";")[0] + hairpins[key] = gff_dict[key]["attributes"].split( + "Name=")[1].split( + ";")[0] else: - matures[key]=gff_dict[key]["attributes"].split("Name=")[1].split(";")[0] + matures[key] = gff_dict[key]["attributes"].split( + "Name=")[1].split( + ";")[0] with open(output, "w") as output: output.write(GFF3_header) for ID in sorted(hairpins, key=hairpins.get): - output.write("\t".join([gff_dict[ID]["seqid"], gff_dict[ID]["source"], - gff_dict[ID]["type"], gff_dict[ID]["start"], gff_dict[ID]["end"], - gff_dict[ID]["score"], gff_dict[ID]["strand"], gff_dict[ID]["phase"], - gff_dict[ID]["attributes"]])) + output.write("\t".join([gff_dict[ID]["seqid"], + gff_dict[ID]["source"], gff_dict[ID]["type"], + gff_dict[ID]["start"], gff_dict[ID]["end"], + gff_dict[ID]["score"], gff_dict[ID]["strand"], + gff_dict[ID]["phase"], gff_dict[ID]["attributes"]])) output.write("\n") for id in sorted(matures, key=matures.get, reverse=True): if ID in gff_dict[id]["attributes"]: - output.write("\t".join([gff_dict[id]["seqid"], gff_dict[id]["source"], - gff_dict[id]["type"], gff_dict[id]["start"], gff_dict[id]["end"], - gff_dict[id]["score"], gff_dict[id]["strand"], - gff_dict[id]["phase"], gff_dict[id]["attributes"]])) + output.write("\t".join([gff_dict[id]["seqid"], + gff_dict[id]["source"], gff_dict[id]["type"], + gff_dict[id]["start"], gff_dict[id]["end"], + gff_dict[id]["score"], + gff_dict[id]["strand"], + gff_dict[id]["phase"], + gff_dict[id]["attributes"]])) output.write("\n") @@ -103,4 +122,4 @@ if __name__ == "__main__": args = Parser() - main(args.input, args.output) \ No newline at end of file + main(args.input, args.output) diff -r f59c643b00fc -r 6b8adacd4750 mircounts.xml --- a/mircounts.xml Tue Jul 25 11:04:28 2017 -0400 +++ b/mircounts.xml Wed Jul 26 19:15:08 2017 -0400 @@ -1,4 +1,4 @@ - + Counts miRNA alignments from small RNA sequence data gnu-wget diff -r f59c643b00fc -r 6b8adacd4750 test-data/translated_dme.gff3 --- a/test-data/translated_dme.gff3 Tue Jul 25 11:04:28 2017 -0400 +++ b/test-data/translated_dme.gff3 Wed Jul 26 19:15:08 2017 -0400 @@ -5,9 +5,9 @@ # microRNAs: miRBase current_version # genome-build-id: check http://mirbase.org/ # -# Hairpin precursor sequences have type "miRNA_primary_transcript". -# Note, these sequences do not represent the full primary transcript, -# rather a predicted stem-loop portion that includes the precursor +# Hairpin precursor sequences have type "miRNA_primary_transcript". +# Note, these sequences do not represent the full primary transcript, +# rather a predicted stem-loop portion that includes the precursor # miRNA. Mature sequences have type "miRNA". # dme-bantam . miRNA_primary_transcript 1 81 . + . ID=MI0000387;Alias=MI0000387;Name=dme-bantam diff -r f59c643b00fc -r 6b8adacd4750 yac.py --- a/yac.py Tue Jul 25 11:04:28 2017 -0400 +++ b/yac.py Wed Jul 26 19:15:08 2017 -0400 @@ -2,11 +2,8 @@ # yac = yet another clipper # v 1.2.1 - 23-08-2014 - Support FastQ output # v 1.1.0 - 23-08-2014 - argparse implementation -# Usage yac.py $input $output $adapter_to_clip $min $max $Nmode # Christophe Antoniewski -import sys -import string import argparse from itertools import islice @@ -16,17 +13,23 @@ the_parser.add_argument( '--input', action="store", nargs='+', help="input fastq files") the_parser.add_argument( - '--output', action="store", type=str, help="output, clipped fasta file") + '--output', action="store", type=str, + help="output, clipped fasta file") the_parser.add_argument( - '--output_format', action="store", type=str, help="output format, fasta or fastq") + '--output_format', action="store", type=str, + help="output format, fasta or fastq") the_parser.add_argument( - '--adapter_to_clip', action="store", type=str, help="adapter sequence to clip") + '--adapter_to_clip', action="store", type=str, + help="adapter sequence to clip") the_parser.add_argument( - '--min', action="store", type=int, help="minimal size of clipped sequence to keep") + '--min', action="store", type=int, + help="minimal size of clipped sequence to keep") the_parser.add_argument( - '--max', action="store", type=int, help="maximal size of clipped sequence to keep") + '--max', action="store", type=int, + help="maximal size of clipped sequence to keep") the_parser.add_argument('--Nmode', action="store", type=str, choices=[ - "accept", "reject"], help="accept or reject sequences with N for clipping") + "accept", "reject"], + help="accept or reject Ns in clipped sequences") args = the_parser.parse_args() args.adapter_to_clip = args.adapter_to_clip.upper() return args @@ -34,7 +37,8 @@ class Clip: - def __init__(self, inputfile, outputfile, output_format, adapter, minsize, maxsize, Nmode): + def __init__(self, inputfile, outputfile, output_format, + adapter, minsize, maxsize, Nmode): self.inputfile = inputfile self.outputfile = outputfile self.output_format = output_format @@ -44,9 +48,12 @@ self.Nmode = Nmode def motives(sequence): - '''return a list of motives for perfect (6nt) or imperfect (7nt with one mismatch) search on import string module''' + ''' + return a list of motives for perfect (6nt) or + imperfect (7nt with one mismatch) search on import string module + ''' sequencevariants = [ - sequence[0:6]] # initializes the list with the 6mer perfect match + sequence[0:6]] # initializes list with 6mer perfect match dicsubst = {"A": "TGCN", "T": "AGCN", "G": "TACN", "C": "GATN"} for pos in enumerate(sequence[:6]): for subst in dicsubst[pos[1]]: @@ -100,6 +107,7 @@ instanceClip = Clip(*argv) instanceClip.handle_io() + if __name__ == "__main__": args = Parser() id = 0