Mercurial > repos > artbio > small_read_size_histograms
comparison size_histogram.py @ 0:234b83159ea8 draft
planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/small_read_size_histograms commit ab983b2e57321e8913bd4d5f8fc89c3223c69869
author | artbio |
---|---|
date | Tue, 11 Jul 2017 11:44:36 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:234b83159ea8 |
---|---|
1 #!/usr/bin/python | |
2 # python parser module for size distributions, guided by GFF3 | |
3 | |
4 import argparse | |
5 import subprocess | |
6 from collections import OrderedDict | |
7 from smRtools import extractsubinstance | |
8 from smRtools import HandleSmRNAwindows | |
9 | |
10 | |
11 def Parser(): | |
12 the_parser = argparse.ArgumentParser() | |
13 the_parser.add_argument('--output_size_distribution', action="store", type=str, help="size distribution dataframe") | |
14 the_parser.add_argument('--reference_fasta', action="store", type=str, help="output file") | |
15 the_parser.add_argument('--reference_bowtie_index',action='store', help="paths to indexed or fasta references") | |
16 the_parser.add_argument('--input',nargs='+', help="paths to multiple input files") | |
17 the_parser.add_argument('--ext',nargs='+', help="input file type") | |
18 the_parser.add_argument('--label',nargs='+', help="labels of multiple input files") | |
19 the_parser.add_argument('--normalization_factor',nargs='+', type=float, help="Normalization factor for input file") | |
20 the_parser.add_argument('--gff', type=str, help="GFF containing regions of interest") | |
21 the_parser.add_argument('--minquery', type=int, help="Minimum readsize") | |
22 the_parser.add_argument('--maxquery', type=int, help="Maximum readsize") | |
23 the_parser.add_argument('--global_size', action="store_true", help="if specified, size distribution is calculated for the sum of all items") | |
24 the_parser.add_argument('--collapse', action="store_true", help="if specified, forward and reverse reads are collapsed") | |
25 args = the_parser.parse_args() | |
26 return args | |
27 | |
28 | |
29 args=Parser() | |
30 if args.reference_fasta: | |
31 genomeRefFormat = "fastaSource" | |
32 genomeRefFile = args.reference_fasta | |
33 if args.reference_bowtie_index: | |
34 genomeRefFormat = "bowtieIndex" | |
35 genomeRefFile = args.reference_bowtie_index | |
36 size_distribution_file=args.output_size_distribution | |
37 minquery=args.minquery | |
38 maxquery=args.maxquery | |
39 filePath=args.input | |
40 fileExt=args.ext | |
41 fileLabel=args.label | |
42 normalization_factor=args.normalization_factor | |
43 global_size=args.global_size | |
44 collapse=args.collapse | |
45 | |
46 if collapse: | |
47 pol=["both"] | |
48 else: | |
49 pol=["F", "R"] | |
50 | |
51 MasterListOfGenomes = OrderedDict() | |
52 | |
53 def process_samples(filePath): | |
54 for i, filePath in enumerate(filePath): | |
55 norm=normalization_factor[i] | |
56 print fileLabel[i] | |
57 MasterListOfGenomes[fileLabel[i]] = HandleSmRNAwindows (alignmentFile=filePath, alignmentFileFormat=fileExt[i], genomeRefFile=genomeRefFile, genomeRefFormat=genomeRefFormat,\ | |
58 biosample=fileLabel[i], size_inf=minquery, size_sup=maxquery, norm=norm) | |
59 return MasterListOfGenomes | |
60 | |
61 | |
62 def write_size_distribution_dataframe(readDict, size_distribution_file, pol=["both"] ): | |
63 '''refactored on 7-9-2014''' | |
64 with open(size_distribution_file, 'w') as size_distrib: | |
65 print >>size_distrib, "gene\tpolarity\tsize\tcount\tsample" | |
66 for sample in readDict.keys(): | |
67 if args.gff: | |
68 dict=readDict[sample] | |
69 else: | |
70 dict=readDict[sample].instanceDict | |
71 for gene in dict.keys(): | |
72 histogram = dict[gene].size_histogram() | |
73 for polarity in pol: | |
74 for size, count in histogram[polarity].iteritems(): | |
75 print >>size_distrib, "%s\t%s\t%s\t%s\t%s" % (gene, polarity, size, count, sample) | |
76 | |
77 | |
78 def write_size_distribution_dataframe_global(readDict, size_distribution_file, pol=["both"]): | |
79 with open(size_distribution_file, 'w') as size_distrib: | |
80 print >>size_distrib, "gene\tpolarity\tsize\tcount\tsample" | |
81 for sample in readDict.keys(): | |
82 histogram = readDict[sample].size_histogram() | |
83 gene="sample" | |
84 for polarity in pol: | |
85 for size, count in histogram[polarity].iteritems(): | |
86 print >>size_distrib, "%s\t%s\t%s\t%s\t%s" % (gene, polarity, size, count, sample) | |
87 | |
88 | |
89 def gff_item_subinstances(readDict, gff3): | |
90 GFFinstanceDict=OrderedDict() | |
91 with open(gff3) as gff: | |
92 for line in gff: | |
93 if line[0] == "#": continue | |
94 gff_fields = line[:-1].split("\t") | |
95 chrom = gff_fields[0] | |
96 gff_name = gff_fields[-1].split("Name=")[-1].split(";")[0] # to isolate the GFF Name | |
97 item_upstream_coordinate = int(gff_fields[3]) | |
98 item_downstream_coordinate = int(gff_fields[4]) | |
99 item_polarity = gff_fields[6] | |
100 for sample in readDict.keys(): | |
101 if sample not in GFFinstanceDict: | |
102 GFFinstanceDict[sample]={} | |
103 subinstance=extractsubinstance(item_upstream_coordinate, item_downstream_coordinate, readDict[sample].instanceDict[chrom]) | |
104 if item_polarity == '-': | |
105 subinstance.readDict={key*-1:value for key, value in subinstance.readDict.iteritems()} | |
106 # subinstance.readDict.setdefault(key, []) | |
107 subinstance.gene=gff_name | |
108 GFFinstanceDict[sample][gff_name]=subinstance | |
109 return GFFinstanceDict | |
110 | |
111 MasterListOfGenomes=process_samples(filePath) | |
112 | |
113 if args.gff: | |
114 MasterListOfGenomes=gff_item_subinstances(MasterListOfGenomes, args.gff) | |
115 | |
116 if global_size: | |
117 write_size_distribution_dataframe_global(MasterListOfGenomes, size_distribution_file, pol) | |
118 else: | |
119 write_size_distribution_dataframe(MasterListOfGenomes, size_distribution_file, pol) |