small_read_size_histograms: size_histogram.py comparison

comparison size_histogram.py @ 0:234b83159ea8 draft

planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/small_read_size_histograms commit ab983b2e57321e8913bd4d5f8fc89c3223c69869

author	artbio
date	Tue, 11 Jul 2017 11:44:36 -0400
parents
children

comparison

equal deleted inserted replaced

--1:000000000000
+:234b83159ea8
+#!/usr/bin/python
+# python parser module for size distributions, guided by GFF3
+import argparse
+import subprocess
+from collections import OrderedDict
+from smRtools import extractsubinstance
+from smRtools import HandleSmRNAwindows
+def Parser():
+the_parser = argparse.ArgumentParser()
+the_parser.add_argument('--output_size_distribution', action="store", type=str, help="size distribution dataframe")
+the_parser.add_argument('--reference_fasta', action="store", type=str, help="output file")
+the_parser.add_argument('--reference_bowtie_index',action='store', help="paths to indexed or fasta references")
+the_parser.add_argument('--input',nargs='+', help="paths to multiple input files")
+the_parser.add_argument('--ext',nargs='+', help="input file type")
+the_parser.add_argument('--label',nargs='+', help="labels of multiple input files")
+the_parser.add_argument('--normalization_factor',nargs='+', type=float, help="Normalization factor for input file")
+the_parser.add_argument('--gff', type=str, help="GFF containing regions of interest")
+the_parser.add_argument('--minquery', type=int, help="Minimum readsize")
+the_parser.add_argument('--maxquery', type=int, help="Maximum readsize")
+the_parser.add_argument('--global_size', action="store_true", help="if specified, size distribution is calculated for the sum of all items")
+the_parser.add_argument('--collapse', action="store_true", help="if specified, forward and reverse reads are collapsed")
+args = the_parser.parse_args()
+return args
+args=Parser()
+if args.reference_fasta:
+genomeRefFormat = "fastaSource"
+genomeRefFile = args.reference_fasta
+if args.reference_bowtie_index:
+genomeRefFormat = "bowtieIndex"
+genomeRefFile = args.reference_bowtie_index
+size_distribution_file=args.output_size_distribution
+minquery=args.minquery
+maxquery=args.maxquery
+filePath=args.input
+fileExt=args.ext
+fileLabel=args.label
+normalization_factor=args.normalization_factor
+global_size=args.global_size
+collapse=args.collapse
+if collapse:
+pol=["both"]
+else:
+pol=["F", "R"]
+MasterListOfGenomes = OrderedDict()
+def process_samples(filePath):
+for i, filePath in enumerate(filePath):
+norm=normalization_factor[i]
+print fileLabel[i]
+MasterListOfGenomes[fileLabel[i]] = HandleSmRNAwindows (alignmentFile=filePath, alignmentFileFormat=fileExt[i], genomeRefFile=genomeRefFile, genomeRefFormat=genomeRefFormat,\
+biosample=fileLabel[i], size_inf=minquery, size_sup=maxquery, norm=norm)
+return MasterListOfGenomes
+def write_size_distribution_dataframe(readDict, size_distribution_file, pol=["both"] ):
+'''refactored on 7-9-2014'''
+with open(size_distribution_file, 'w') as size_distrib:
+print >>size_distrib, "gene\tpolarity\tsize\tcount\tsample"
+for sample in readDict.keys():
+if args.gff:
+dict=readDict[sample]
+else:
+dict=readDict[sample].instanceDict
+for gene in dict.keys():
+histogram = dict[gene].size_histogram()
+for polarity in pol:
+for size, count in histogram[polarity].iteritems():
+print >>size_distrib, "%s\t%s\t%s\t%s\t%s" % (gene, polarity, size, count, sample)
+def write_size_distribution_dataframe_global(readDict, size_distribution_file, pol=["both"]):
+with open(size_distribution_file, 'w') as size_distrib:
+print >>size_distrib, "gene\tpolarity\tsize\tcount\tsample"
+for sample in readDict.keys():
+histogram = readDict[sample].size_histogram()
+gene="sample"
+for polarity in pol:
+for size, count in histogram[polarity].iteritems():
+print >>size_distrib, "%s\t%s\t%s\t%s\t%s" % (gene, polarity, size, count, sample)
+def gff_item_subinstances(readDict, gff3):
+GFFinstanceDict=OrderedDict()
+with open(gff3) as gff:
+for line in gff:
+if line[0] == "#": continue
+gff_fields = line[:-1].split("\t")
+chrom = gff_fields[0]
+gff_name = gff_fields[-1].split("Name=")[-1].split(";")[0] # to isolate the GFF Name
+item_upstream_coordinate = int(gff_fields[3])
+item_downstream_coordinate = int(gff_fields[4])
+item_polarity = gff_fields[6]
+for sample in readDict.keys():
+if sample not in GFFinstanceDict:
+GFFinstanceDict[sample]={}
+subinstance=extractsubinstance(item_upstream_coordinate, item_downstream_coordinate, readDict[sample].instanceDict[chrom])
+if item_polarity == '-':
+subinstance.readDict={key*-1:value for key, value in subinstance.readDict.iteritems()}
+#          subinstance.readDict.setdefault(key, [])
+subinstance.gene=gff_name
+GFFinstanceDict[sample][gff_name]=subinstance
+return GFFinstanceDict
+MasterListOfGenomes=process_samples(filePath)
+if args.gff:
+MasterListOfGenomes=gff_item_subinstances(MasterListOfGenomes, args.gff)
+if global_size:
+write_size_distribution_dataframe_global(MasterListOfGenomes, size_distribution_file, pol)
+else:
+write_size_distribution_dataframe(MasterListOfGenomes, size_distribution_file, pol)

Mercurial > repos > artbio > small_read_size_histograms

comparison size_histogram.py @ 0:234b83159ea8 draft