comparison readmap.py @ 8:be0c6b6466cc draft

planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/msp_sr_readmap_and_size_histograms commit 97b40d7a593cef6c3303f7baba781a84d242e454
author mvdbeek
date Mon, 19 Sep 2016 06:16:21 -0400
parents bcc0c7093e7a
children
comparison
equal deleted inserted replaced
7:c9e267cb84c0 8:be0c6b6466cc
21 the_parser.add_argument('--label',nargs='+', help="labels of multiple input files") 21 the_parser.add_argument('--label',nargs='+', help="labels of multiple input files")
22 the_parser.add_argument('--normalization_factor',nargs='+', type=float, help="Normalization factor for input file") 22 the_parser.add_argument('--normalization_factor',nargs='+', type=float, help="Normalization factor for input file")
23 the_parser.add_argument('--gff', type=str, help="GFF containing regions of interest") 23 the_parser.add_argument('--gff', type=str, help="GFF containing regions of interest")
24 the_parser.add_argument('--minquery', type=int, help="Minimum readsize") 24 the_parser.add_argument('--minquery', type=int, help="Minimum readsize")
25 the_parser.add_argument('--maxquery', type=int, help="Maximum readsize") 25 the_parser.add_argument('--maxquery', type=int, help="Maximum readsize")
26 the_parser.add_argument('--rcode', type=str, help="R script")
27 args = the_parser.parse_args() 26 args = the_parser.parse_args()
28 return args 27 return args
29 28
30 args=Parser() 29 args=Parser()
31 if args.reference_fasta: 30 if args.reference_fasta:
36 genomeRefFile = args.reference_bowtie_index 35 genomeRefFile = args.reference_bowtie_index
37 readmap_file=args.output_readmap 36 readmap_file=args.output_readmap
38 size_distribution_file=args.output_size_distribution 37 size_distribution_file=args.output_size_distribution
39 minquery=args.minquery 38 minquery=args.minquery
40 maxquery=args.maxquery 39 maxquery=args.maxquery
41 Rcode = args.rcode
42 filePath=args.input 40 filePath=args.input
43 fileExt=args.ext 41 fileExt=args.ext
44 fileLabel=args.label 42 fileLabel=args.label
45 normalization_factor=args.normalization_factor 43 normalization_factor=args.normalization_factor
46 44
52 print fileLabel[i] 50 print fileLabel[i]
53 MasterListOfGenomes[fileLabel[i]] = HandleSmRNAwindows (alignmentFile=filePath, alignmentFileFormat=fileExt[i], genomeRefFile=genomeRefFile, genomeRefFormat=genomeRefFormat,\ 51 MasterListOfGenomes[fileLabel[i]] = HandleSmRNAwindows (alignmentFile=filePath, alignmentFileFormat=fileExt[i], genomeRefFile=genomeRefFile, genomeRefFormat=genomeRefFormat,\
54 biosample=fileLabel[i], size_inf=minquery, size_sup=maxquery, norm=norm) 52 biosample=fileLabel[i], size_inf=minquery, size_sup=maxquery, norm=norm)
55 return MasterListOfGenomes 53 return MasterListOfGenomes
56 54
57 def dataframe_sanityzer (listofdatalines): 55 def remove_null_entries(listofdatalines):
58 Dict = defaultdict(float) 56 """
57 This function removes genes that have no reads aligned.
58 """
59 Dict = defaultdict(float)
59 for line in listofdatalines: 60 for line in listofdatalines:
60 fields= line.split("\t") 61 fields= line.split("\t")
61 Dict[fields[0]] += float (fields[2]) 62 Dict[fields[0]] += abs(float(fields[2]))
62 filtered_list = [] 63 filtered_list = []
63 for line in listofdatalines: 64 for line in listofdatalines:
64 fields= line.split("\t") 65 fields= line.split("\t")
65 if Dict[fields[0]] != 0: 66 if Dict[fields[0]] != 0:
66 filtered_list.append(line) 67 filtered_list.append(line)
67 return filtered_list 68 return filtered_list
68 69
69 70
70 def listify_plottable_item(item): 71 def listify_plottable_item(item):
71 """ 72 """
108 dict=readDict[sample].instanceDict 109 dict=readDict[sample].instanceDict
109 for gene in dict.keys(): 110 for gene in dict.keys():
110 plottable = dict[gene].readplot() 111 plottable = dict[gene].readplot()
111 plottable = handle_start_stop_coordinates(plottable, readDict) 112 plottable = handle_start_stop_coordinates(plottable, readDict)
112 for line in plottable: 113 for line in plottable:
113 #print >>readmap, "%s\t%s" % (line, sample)
114 listoflines.append ("%s\t%s" % (line, sample)) 114 listoflines.append ("%s\t%s" % (line, sample))
115 listoflines = dataframe_sanityzer(listoflines) 115 listoflines = remove_null_entries(listoflines)
116 for line in listoflines: 116 for line in listoflines:
117 print >>readmap, line 117 print >>readmap, line
118 118
119 def write_size_distribution_dataframe(readDict, size_distribution_file): 119 def write_size_distribution_dataframe(readDict, size_distribution_file):
120 listoflines = [] 120 listoflines = []
124 if args.gff: 124 if args.gff:
125 dict=readDict[sample] 125 dict=readDict[sample]
126 else: 126 else:
127 dict=readDict[sample].instanceDict 127 dict=readDict[sample].instanceDict
128 for gene in dict.keys(): 128 for gene in dict.keys():
129 histogram = dict[gene].size_histogram(minquery=args.minquery, maxquery=args.maxquery) 129 histogram = dict[gene].size_histogram(minquery=minquery, maxquery=maxquery)
130 for polarity in histogram.keys(): 130 for polarity in histogram.keys():
131 if polarity=='both': 131 if polarity=='both':
132 continue 132 continue
133 #for size in xrange(args.minquery, args.maxquery):
134 # if not size in histogram[polarity].keys():
135 # histogram[size]=0
136 for size, count in histogram[polarity].iteritems(): 133 for size, count in histogram[polarity].iteritems():
137 #print >>size_distrib, "%s\t%s\t%s\t%s\t%s" % (gene, size, count, polarity, sample) # test, changed the order accordingly
138 listoflines.append ("%s\t%s\t%s\t%s\t%s" % (gene, size, count, polarity, sample) ) 134 listoflines.append ("%s\t%s\t%s\t%s\t%s" % (gene, size, count, polarity, sample) )
139 listoflines = dataframe_sanityzer(listoflines) 135 listoflines = remove_null_entries(listoflines)
140 for line in listoflines: 136 for line in listoflines:
141 print >>size_distrib, line 137 print >>size_distrib, line
142 138
143 def gff_item_subinstances(readDict, gff3): 139 def gff_item_subinstances(readDict, gff3):
144 GFFinstanceDict=OrderedDict() 140 GFFinstanceDict=OrderedDict()
145 for sample in readDict.keys(): 141 for sample in readDict.keys():
146 GFFinstanceDict[sample]={} # to implement the 2nd level of directionary in an OrderedDict Class object (would not be required with defaultdict Class) 142 GFFinstanceDict[sample]={} # to implement the 2nd level of directionary in an OrderedDict Class object (would not be required with defaultdict Class)
152 gff_name = gff_fields[-1].split("Name=")[-1].split(";")[0] # to isolate the GFF Name 148 gff_name = gff_fields[-1].split("Name=")[-1].split(";")[0] # to isolate the GFF Name
153 item_upstream_coordinate = int(gff_fields[3]) 149 item_upstream_coordinate = int(gff_fields[3])
154 item_downstream_coordinate = int(gff_fields[4]) 150 item_downstream_coordinate = int(gff_fields[4])
155 item_polarity = gff_fields[6] 151 item_polarity = gff_fields[6]
156 for sample in readDict.keys(): 152 for sample in readDict.keys():
157 ## this is not required anymore but test
158 # if not GFFinstanceDict.has_key(sample):
159 # GFFinstanceDict[sample]={}
160 ####
161 subinstance=extractsubinstance(item_upstream_coordinate, item_downstream_coordinate, readDict[sample].instanceDict[chrom]) 153 subinstance=extractsubinstance(item_upstream_coordinate, item_downstream_coordinate, readDict[sample].instanceDict[chrom])
162 if item_polarity == '-': 154 if item_polarity == '-':
163 subinstance.readDict={key*-1:value for key, value in subinstance.readDict.iteritems()} 155 subinstance.readDict={key*-1:value for key, value in subinstance.readDict.iteritems()}
164 subinstance.gene=gff_name 156 subinstance.gene=gff_name
165 GFFinstanceDict[sample][gff_name]=subinstance 157 GFFinstanceDict[sample][gff_name]=subinstance
170 if args.gff: 162 if args.gff:
171 MasterListOfGenomes=gff_item_subinstances(MasterListOfGenomes, args.gff) 163 MasterListOfGenomes=gff_item_subinstances(MasterListOfGenomes, args.gff)
172 164
173 write_readplot_dataframe(MasterListOfGenomes, readmap_file) 165 write_readplot_dataframe(MasterListOfGenomes, readmap_file)
174 write_size_distribution_dataframe(MasterListOfGenomes, size_distribution_file) 166 write_size_distribution_dataframe(MasterListOfGenomes, size_distribution_file)
175
176 R_command="Rscript "+ Rcode
177 process = subprocess.Popen(R_command.split())
178 process.wait()
179 167