Mercurial > repos > drosofff > msp_sr_readmap_and_size_histograms
comparison readmap.py @ 8:be0c6b6466cc draft
planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/msp_sr_readmap_and_size_histograms commit 97b40d7a593cef6c3303f7baba781a84d242e454
author | mvdbeek |
---|---|
date | Mon, 19 Sep 2016 06:16:21 -0400 |
parents | bcc0c7093e7a |
children |
comparison
equal
deleted
inserted
replaced
7:c9e267cb84c0 | 8:be0c6b6466cc |
---|---|
21 the_parser.add_argument('--label',nargs='+', help="labels of multiple input files") | 21 the_parser.add_argument('--label',nargs='+', help="labels of multiple input files") |
22 the_parser.add_argument('--normalization_factor',nargs='+', type=float, help="Normalization factor for input file") | 22 the_parser.add_argument('--normalization_factor',nargs='+', type=float, help="Normalization factor for input file") |
23 the_parser.add_argument('--gff', type=str, help="GFF containing regions of interest") | 23 the_parser.add_argument('--gff', type=str, help="GFF containing regions of interest") |
24 the_parser.add_argument('--minquery', type=int, help="Minimum readsize") | 24 the_parser.add_argument('--minquery', type=int, help="Minimum readsize") |
25 the_parser.add_argument('--maxquery', type=int, help="Maximum readsize") | 25 the_parser.add_argument('--maxquery', type=int, help="Maximum readsize") |
26 the_parser.add_argument('--rcode', type=str, help="R script") | |
27 args = the_parser.parse_args() | 26 args = the_parser.parse_args() |
28 return args | 27 return args |
29 | 28 |
30 args=Parser() | 29 args=Parser() |
31 if args.reference_fasta: | 30 if args.reference_fasta: |
36 genomeRefFile = args.reference_bowtie_index | 35 genomeRefFile = args.reference_bowtie_index |
37 readmap_file=args.output_readmap | 36 readmap_file=args.output_readmap |
38 size_distribution_file=args.output_size_distribution | 37 size_distribution_file=args.output_size_distribution |
39 minquery=args.minquery | 38 minquery=args.minquery |
40 maxquery=args.maxquery | 39 maxquery=args.maxquery |
41 Rcode = args.rcode | |
42 filePath=args.input | 40 filePath=args.input |
43 fileExt=args.ext | 41 fileExt=args.ext |
44 fileLabel=args.label | 42 fileLabel=args.label |
45 normalization_factor=args.normalization_factor | 43 normalization_factor=args.normalization_factor |
46 | 44 |
52 print fileLabel[i] | 50 print fileLabel[i] |
53 MasterListOfGenomes[fileLabel[i]] = HandleSmRNAwindows (alignmentFile=filePath, alignmentFileFormat=fileExt[i], genomeRefFile=genomeRefFile, genomeRefFormat=genomeRefFormat,\ | 51 MasterListOfGenomes[fileLabel[i]] = HandleSmRNAwindows (alignmentFile=filePath, alignmentFileFormat=fileExt[i], genomeRefFile=genomeRefFile, genomeRefFormat=genomeRefFormat,\ |
54 biosample=fileLabel[i], size_inf=minquery, size_sup=maxquery, norm=norm) | 52 biosample=fileLabel[i], size_inf=minquery, size_sup=maxquery, norm=norm) |
55 return MasterListOfGenomes | 53 return MasterListOfGenomes |
56 | 54 |
57 def dataframe_sanityzer (listofdatalines): | 55 def remove_null_entries(listofdatalines): |
58 Dict = defaultdict(float) | 56 """ |
57 This function removes genes that have no reads aligned. | |
58 """ | |
59 Dict = defaultdict(float) | |
59 for line in listofdatalines: | 60 for line in listofdatalines: |
60 fields= line.split("\t") | 61 fields= line.split("\t") |
61 Dict[fields[0]] += float (fields[2]) | 62 Dict[fields[0]] += abs(float(fields[2])) |
62 filtered_list = [] | 63 filtered_list = [] |
63 for line in listofdatalines: | 64 for line in listofdatalines: |
64 fields= line.split("\t") | 65 fields= line.split("\t") |
65 if Dict[fields[0]] != 0: | 66 if Dict[fields[0]] != 0: |
66 filtered_list.append(line) | 67 filtered_list.append(line) |
67 return filtered_list | 68 return filtered_list |
68 | 69 |
69 | 70 |
70 def listify_plottable_item(item): | 71 def listify_plottable_item(item): |
71 """ | 72 """ |
108 dict=readDict[sample].instanceDict | 109 dict=readDict[sample].instanceDict |
109 for gene in dict.keys(): | 110 for gene in dict.keys(): |
110 plottable = dict[gene].readplot() | 111 plottable = dict[gene].readplot() |
111 plottable = handle_start_stop_coordinates(plottable, readDict) | 112 plottable = handle_start_stop_coordinates(plottable, readDict) |
112 for line in plottable: | 113 for line in plottable: |
113 #print >>readmap, "%s\t%s" % (line, sample) | |
114 listoflines.append ("%s\t%s" % (line, sample)) | 114 listoflines.append ("%s\t%s" % (line, sample)) |
115 listoflines = dataframe_sanityzer(listoflines) | 115 listoflines = remove_null_entries(listoflines) |
116 for line in listoflines: | 116 for line in listoflines: |
117 print >>readmap, line | 117 print >>readmap, line |
118 | 118 |
119 def write_size_distribution_dataframe(readDict, size_distribution_file): | 119 def write_size_distribution_dataframe(readDict, size_distribution_file): |
120 listoflines = [] | 120 listoflines = [] |
124 if args.gff: | 124 if args.gff: |
125 dict=readDict[sample] | 125 dict=readDict[sample] |
126 else: | 126 else: |
127 dict=readDict[sample].instanceDict | 127 dict=readDict[sample].instanceDict |
128 for gene in dict.keys(): | 128 for gene in dict.keys(): |
129 histogram = dict[gene].size_histogram(minquery=args.minquery, maxquery=args.maxquery) | 129 histogram = dict[gene].size_histogram(minquery=minquery, maxquery=maxquery) |
130 for polarity in histogram.keys(): | 130 for polarity in histogram.keys(): |
131 if polarity=='both': | 131 if polarity=='both': |
132 continue | 132 continue |
133 #for size in xrange(args.minquery, args.maxquery): | |
134 # if not size in histogram[polarity].keys(): | |
135 # histogram[size]=0 | |
136 for size, count in histogram[polarity].iteritems(): | 133 for size, count in histogram[polarity].iteritems(): |
137 #print >>size_distrib, "%s\t%s\t%s\t%s\t%s" % (gene, size, count, polarity, sample) # test, changed the order accordingly | |
138 listoflines.append ("%s\t%s\t%s\t%s\t%s" % (gene, size, count, polarity, sample) ) | 134 listoflines.append ("%s\t%s\t%s\t%s\t%s" % (gene, size, count, polarity, sample) ) |
139 listoflines = dataframe_sanityzer(listoflines) | 135 listoflines = remove_null_entries(listoflines) |
140 for line in listoflines: | 136 for line in listoflines: |
141 print >>size_distrib, line | 137 print >>size_distrib, line |
142 | 138 |
143 def gff_item_subinstances(readDict, gff3): | 139 def gff_item_subinstances(readDict, gff3): |
144 GFFinstanceDict=OrderedDict() | 140 GFFinstanceDict=OrderedDict() |
145 for sample in readDict.keys(): | 141 for sample in readDict.keys(): |
146 GFFinstanceDict[sample]={} # to implement the 2nd level of directionary in an OrderedDict Class object (would not be required with defaultdict Class) | 142 GFFinstanceDict[sample]={} # to implement the 2nd level of directionary in an OrderedDict Class object (would not be required with defaultdict Class) |
152 gff_name = gff_fields[-1].split("Name=")[-1].split(";")[0] # to isolate the GFF Name | 148 gff_name = gff_fields[-1].split("Name=")[-1].split(";")[0] # to isolate the GFF Name |
153 item_upstream_coordinate = int(gff_fields[3]) | 149 item_upstream_coordinate = int(gff_fields[3]) |
154 item_downstream_coordinate = int(gff_fields[4]) | 150 item_downstream_coordinate = int(gff_fields[4]) |
155 item_polarity = gff_fields[6] | 151 item_polarity = gff_fields[6] |
156 for sample in readDict.keys(): | 152 for sample in readDict.keys(): |
157 ## this is not required anymore but test | |
158 # if not GFFinstanceDict.has_key(sample): | |
159 # GFFinstanceDict[sample]={} | |
160 #### | |
161 subinstance=extractsubinstance(item_upstream_coordinate, item_downstream_coordinate, readDict[sample].instanceDict[chrom]) | 153 subinstance=extractsubinstance(item_upstream_coordinate, item_downstream_coordinate, readDict[sample].instanceDict[chrom]) |
162 if item_polarity == '-': | 154 if item_polarity == '-': |
163 subinstance.readDict={key*-1:value for key, value in subinstance.readDict.iteritems()} | 155 subinstance.readDict={key*-1:value for key, value in subinstance.readDict.iteritems()} |
164 subinstance.gene=gff_name | 156 subinstance.gene=gff_name |
165 GFFinstanceDict[sample][gff_name]=subinstance | 157 GFFinstanceDict[sample][gff_name]=subinstance |
170 if args.gff: | 162 if args.gff: |
171 MasterListOfGenomes=gff_item_subinstances(MasterListOfGenomes, args.gff) | 163 MasterListOfGenomes=gff_item_subinstances(MasterListOfGenomes, args.gff) |
172 | 164 |
173 write_readplot_dataframe(MasterListOfGenomes, readmap_file) | 165 write_readplot_dataframe(MasterListOfGenomes, readmap_file) |
174 write_size_distribution_dataframe(MasterListOfGenomes, size_distribution_file) | 166 write_size_distribution_dataframe(MasterListOfGenomes, size_distribution_file) |
175 | |
176 R_command="Rscript "+ Rcode | |
177 process = subprocess.Popen(R_command.split()) | |
178 process.wait() | |
179 | 167 |