Mercurial > repos > drosofff > msp_sr_readmap_and_size_histograms
diff readmap.py @ 8:be0c6b6466cc draft
planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/msp_sr_readmap_and_size_histograms commit 97b40d7a593cef6c3303f7baba781a84d242e454
author | mvdbeek |
---|---|
date | Mon, 19 Sep 2016 06:16:21 -0400 |
parents | bcc0c7093e7a |
children |
line wrap: on
line diff
--- a/readmap.py Sun Sep 18 12:55:27 2016 -0400 +++ b/readmap.py Mon Sep 19 06:16:21 2016 -0400 @@ -23,7 +23,6 @@ the_parser.add_argument('--gff', type=str, help="GFF containing regions of interest") the_parser.add_argument('--minquery', type=int, help="Minimum readsize") the_parser.add_argument('--maxquery', type=int, help="Maximum readsize") - the_parser.add_argument('--rcode', type=str, help="R script") args = the_parser.parse_args() return args @@ -38,7 +37,6 @@ size_distribution_file=args.output_size_distribution minquery=args.minquery maxquery=args.maxquery -Rcode = args.rcode filePath=args.input fileExt=args.ext fileLabel=args.label @@ -54,16 +52,19 @@ biosample=fileLabel[i], size_inf=minquery, size_sup=maxquery, norm=norm) return MasterListOfGenomes -def dataframe_sanityzer (listofdatalines): - Dict = defaultdict(float) +def remove_null_entries(listofdatalines): + """ + This function removes genes that have no reads aligned. + """ + Dict = defaultdict(float) for line in listofdatalines: fields= line.split("\t") - Dict[fields[0]] += float (fields[2]) + Dict[fields[0]] += abs(float(fields[2])) filtered_list = [] for line in listofdatalines: fields= line.split("\t") if Dict[fields[0]] != 0: - filtered_list.append(line) + filtered_list.append(line) return filtered_list @@ -110,9 +111,8 @@ plottable = dict[gene].readplot() plottable = handle_start_stop_coordinates(plottable, readDict) for line in plottable: - #print >>readmap, "%s\t%s" % (line, sample) listoflines.append ("%s\t%s" % (line, sample)) - listoflines = dataframe_sanityzer(listoflines) + listoflines = remove_null_entries(listoflines) for line in listoflines: print >>readmap, line @@ -126,19 +126,15 @@ else: dict=readDict[sample].instanceDict for gene in dict.keys(): - histogram = dict[gene].size_histogram(minquery=args.minquery, maxquery=args.maxquery) + histogram = dict[gene].size_histogram(minquery=minquery, maxquery=maxquery) for polarity in histogram.keys(): if polarity=='both': continue - #for size in xrange(args.minquery, args.maxquery): - # if not size in histogram[polarity].keys(): - # histogram[size]=0 for size, count in histogram[polarity].iteritems(): - #print >>size_distrib, "%s\t%s\t%s\t%s\t%s" % (gene, size, count, polarity, sample) # test, changed the order accordingly listoflines.append ("%s\t%s\t%s\t%s\t%s" % (gene, size, count, polarity, sample) ) - listoflines = dataframe_sanityzer(listoflines) + listoflines = remove_null_entries(listoflines) for line in listoflines: - print >>size_distrib, line + print >>size_distrib, line def gff_item_subinstances(readDict, gff3): GFFinstanceDict=OrderedDict() @@ -154,10 +150,6 @@ item_downstream_coordinate = int(gff_fields[4]) item_polarity = gff_fields[6] for sample in readDict.keys(): -## this is not required anymore but test -# if not GFFinstanceDict.has_key(sample): -# GFFinstanceDict[sample]={} -#### subinstance=extractsubinstance(item_upstream_coordinate, item_downstream_coordinate, readDict[sample].instanceDict[chrom]) if item_polarity == '-': subinstance.readDict={key*-1:value for key, value in subinstance.readDict.iteritems()} @@ -172,8 +164,4 @@ write_readplot_dataframe(MasterListOfGenomes, readmap_file) write_size_distribution_dataframe(MasterListOfGenomes, size_distribution_file) - -R_command="Rscript "+ Rcode -process = subprocess.Popen(R_command.split()) -process.wait()