diff readmap.py @ 8:be0c6b6466cc draft

planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/msp_sr_readmap_and_size_histograms commit 97b40d7a593cef6c3303f7baba781a84d242e454
author mvdbeek
date Mon, 19 Sep 2016 06:16:21 -0400
parents bcc0c7093e7a
children
line wrap: on
line diff
--- a/readmap.py	Sun Sep 18 12:55:27 2016 -0400
+++ b/readmap.py	Mon Sep 19 06:16:21 2016 -0400
@@ -23,7 +23,6 @@
   the_parser.add_argument('--gff', type=str, help="GFF containing regions of interest")
   the_parser.add_argument('--minquery', type=int, help="Minimum readsize")
   the_parser.add_argument('--maxquery', type=int, help="Maximum readsize")
-  the_parser.add_argument('--rcode', type=str, help="R script")
   args = the_parser.parse_args()
   return args
 
@@ -38,7 +37,6 @@
 size_distribution_file=args.output_size_distribution
 minquery=args.minquery
 maxquery=args.maxquery
-Rcode = args.rcode
 filePath=args.input
 fileExt=args.ext
 fileLabel=args.label
@@ -54,16 +52,19 @@
                         biosample=fileLabel[i], size_inf=minquery, size_sup=maxquery, norm=norm)
   return MasterListOfGenomes
 
-def dataframe_sanityzer (listofdatalines):
-  Dict = defaultdict(float) 
+def remove_null_entries(listofdatalines):
+  """
+  Drop every tab-separated line whose gene (column 1) has no reads aligned, i.e. whose absolute column-3 values sum to zero.
+  """
+  Dict = defaultdict(float)
   for line in listofdatalines:
     fields= line.split("\t")
-    Dict[fields[0]] += float (fields[2])
+    Dict[fields[0]] += abs(float(fields[2]))
   filtered_list = []
   for line in listofdatalines:
     fields= line.split("\t")
     if Dict[fields[0]] != 0:
-      filtered_list.append(line) 
+      filtered_list.append(line)
   return filtered_list
 
 
@@ -110,9 +111,8 @@
         plottable = dict[gene].readplot()
         plottable = handle_start_stop_coordinates(plottable, readDict)
         for line in plottable:
-          #print >>readmap, "%s\t%s" % (line, sample)
           listoflines.append ("%s\t%s" % (line, sample))
-    listoflines = dataframe_sanityzer(listoflines)
+    listoflines = remove_null_entries(listoflines)
     for line in listoflines:
       print >>readmap, line
 
@@ -126,19 +126,15 @@
       else:
         dict=readDict[sample].instanceDict
       for gene in dict.keys():
-        histogram = dict[gene].size_histogram(minquery=args.minquery, maxquery=args.maxquery)
+        histogram = dict[gene].size_histogram(minquery=minquery, maxquery=maxquery)
         for polarity in histogram.keys():
           if polarity=='both':
             continue
-          #for size in xrange(args.minquery, args.maxquery):
-          #  if not size in histogram[polarity].keys():
-          #    histogram[size]=0
           for size, count in histogram[polarity].iteritems():
-            #print >>size_distrib, "%s\t%s\t%s\t%s\t%s" % (gene, size, count, polarity, sample) # test, changed the order accordingly
             listoflines.append ("%s\t%s\t%s\t%s\t%s" % (gene, size, count, polarity, sample) )
-    listoflines = dataframe_sanityzer(listoflines)
+    listoflines = remove_null_entries(listoflines)
     for line in listoflines:
-      print >>size_distrib, line  
+      print >>size_distrib, line
 
 def gff_item_subinstances(readDict, gff3):
   GFFinstanceDict=OrderedDict()
@@ -154,10 +150,6 @@
       item_downstream_coordinate = int(gff_fields[4])
       item_polarity = gff_fields[6]
       for sample in readDict.keys():
-## this is not required anymore but test
-#        if not GFFinstanceDict.has_key(sample):
-#          GFFinstanceDict[sample]={}
-####
         subinstance=extractsubinstance(item_upstream_coordinate, item_downstream_coordinate, readDict[sample].instanceDict[chrom])
         if item_polarity == '-':
           subinstance.readDict={key*-1:value for key, value in subinstance.readDict.iteritems()}
@@ -172,8 +164,4 @@
 
 write_readplot_dataframe(MasterListOfGenomes, readmap_file)
 write_size_distribution_dataframe(MasterListOfGenomes, size_distribution_file)
-
-R_command="Rscript "+ Rcode
-process = subprocess.Popen(R_command.split())
-process.wait()