comparison pi_database_splitter.py @ 0:34c5c95740a1 draft

planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tools/pi_db_tools commit a58e2a324724f344a07d4499c860a5b2da06927d
author galaxyp
date Mon, 22 May 2017 05:08:23 -0400
parents
children 8a30d6e5b97d
comparison
equal deleted inserted replaced
-1:000000000000 0:34c5c95740a1
1 #!/usr/bin/env python
2 import sys
3 import argparse
4 from numpy import median
5 from contextlib import ExitStack
6
7
8 def main():
9 if sys.argv[1:] == []:
10 sys.argv.append('-h')
11 args = parse_commandline()
12 locfun = {False: locatefraction,
13 True: reverse_locatefraction}[args.reverse]
14 # Column nrs should start from 0
15 # If negative, -1 is last item in list, etc
16 if args.fdrcol > 0:
17 args.fdrcol -= 1
18 if args.deltapicol > 0:
19 args.deltapicol -= 1
20 pishift = get_pishift(args.train_peptable, args.fdrcol, args.deltapicol,
21 args.fdrcutoff, args.picutoff)
22 binarray = get_bin_array(args.fr_amount, args.fr_width, args.intercept,
23 args.tolerance, pishift)
24 write_fractions(args.pipeps, args.fr_amount, args.prefix,
25 binarray, locfun, args.minlen, args.maxlen)
26
27
28 def locatefraction(pep_pi, bins):
29 index = []
30 for pibin in bins:
31 if pep_pi > pibin[2]:
32 continue
33 elif pep_pi >= pibin[1]:
34 index.append(pibin[0])
35 else:
36 return index
37 return index
38
39
40 def reverse_locatefraction(pep_pi, bins):
41 index = []
42 for pibin in bins:
43 if pep_pi < pibin[1]:
44 continue
45 elif pep_pi < pibin[2]:
46 index.append(pibin[0])
47 else:
48 return index
49 return index
50
51
52 def parse_commandline():
53 parser = argparse.ArgumentParser(
54 formatter_class=argparse.RawTextHelpFormatter)
55 parser.add_argument('-p', dest='train_peptable', help='Peptide table with '
56 'peptides, FDR, and fraction numbers. Used to '
57 'calculate pI shift. Leave emtpy for no shift. '
58 'Tab separated file.')
59 parser.add_argument('--deltacol', dest='deltapicol', help='Delta pI column'
60 ' number in peptide table. First column is nr. 1. '
61 'Negative number for counting from last col '
62 '(-1 is last).', default=False, type=int)
63 parser.add_argument('--picutoff', dest='picutoff',
64 help='delta pI value to filter experimental peptides'
65 ' when calculating pi shift.', default=0.2, type=float)
66 parser.add_argument('--fdrcol', dest='fdrcol', help='FDR column number in '
67 'peptide table. First column is nr. 1. Empty includes '
68 'all peptides', default=False, type=int)
69 parser.add_argument('--fdrcutoff', dest='fdrcutoff',
70 help='FDR cutoff value to filter experimental peptides'
71 ' when calculating pi shift.', default=0, type=float)
72 parser.add_argument('-i', dest='pipeps', help='A tab-separated txt file '
73 'with accession, peptide seq, pI value')
74 parser.add_argument('--prefix', dest='prefix', default='pisep',
75 help='Prefix for target/decoy output files')
76 parser.add_argument('--tolerance', dest='tolerance',
77 help='Strip fraction tolerance pi tolerance represents'
78 ' 2.5/97.5 percentile', type=float)
79 parser.add_argument('--amount', dest='fr_amount',
80 help='Strip fraction amount', type=int)
81 parser.add_argument('--reverse', dest='reverse', help='Strip is reversed',
82 action='store_const', const=True, default=False)
83 parser.add_argument('--intercept', dest='intercept',
84 help='pI Intercept of strip', type=float)
85 parser.add_argument('--width', dest='fr_width',
86 help='Strip fraction width in pI', type=float)
87 parser.add_argument('--minlen', dest='minlen', help='Minimal peptide length',
88 type=int)
89 parser.add_argument('--maxlen', dest='maxlen', help='Maximal peptide length',
90 type=int, default=False)
91 return parser.parse_args(sys.argv[1:])
92
93
94 def get_pishift(peptable, fdrcol, deltapicol, fdrcutoff, delta_pi_cutoff):
95 delta_pis = []
96 with open(peptable) as fp:
97 next(fp) # skip header
98 for line in fp:
99 line = line.strip('\n').split('\t')
100 if fdrcol:
101 try:
102 fdr = float(line[fdrcol])
103 except ValueError:
104 continue
105 if fdr > fdrcutoff:
106 continue
107 try:
108 delta_pi = float(line[deltapicol])
109 except ValueError:
110 continue
111 if delta_pi < delta_pi_cutoff:
112 delta_pis.append(delta_pi)
113 shift = median(delta_pis)
114 print('pI shift (median of delta pIs): {}'.format(shift))
115 return shift
116
117
118 def get_bin_array(amount_fractions, fr_width, intercept, tolerance, pi_shift):
119 frnr = 1
120 bin_array = []
121 while frnr <= amount_fractions:
122 pi_center = fr_width * frnr + intercept
123 bin_left = pi_center - fr_width / 2 - tolerance - pi_shift
124 bin_right = pi_center + fr_width / 2 + tolerance - pi_shift
125 print('Bins in fraction', frnr, bin_left, bin_right)
126 bin_array.append((frnr, bin_left, bin_right))
127 frnr += 1
128 return bin_array
129
130
131 def write_fractions(pi_peptides_fn, amount_fractions, out_prefix,
132 bin_array, locate_function, minlen, maxlen):
133 amountpad = len(str(amount_fractions))
134 with ExitStack() as stack:
135 target_out_fp = {frnr: ([], stack.enter_context(
136 open('{p}_fr{i:0{pad}}.fasta'.format(p=out_prefix, i=frnr,
137 pad=amountpad), 'w')))
138 for frnr in range(1, amount_fractions + 1)}
139 decoy_out_fp = {frnr: ([], stack.enter_context(
140 open('decoy_{p}_fr{i:0{pad}}.fasta'.format(p=out_prefix, i=frnr,
141 pad=amountpad), 'w')))
142 for frnr in range(1, amount_fractions + 1)}
143 input_fp = stack.enter_context(open(pi_peptides_fn))
144 pepcount = 0
145 for line in input_fp:
146 accs, pep, pi = line.strip().split("\t")
147 pi = float(pi)
148 if maxlen and len(pep) > maxlen:
149 continue
150 elif len(pep) >= minlen:
151 pepcount += 1
152 if pep[-1] in {'K', 'R'}:
153 rev_pep = pep[::-1][1:] + pep[-1]
154 else:
155 rev_pep = pep[::-1]
156 for i in locate_function(pi, bin_array):
157 target_out_fp[i][0].append('>{}\n{}\n'.format(accs, pep))
158 # write pseudoReversed decoy peptide at the same time
159 decoy_out_fp[i][0].append('>decoy_{}\n{}\n'.format(
160 accs, rev_pep))
161 if pepcount > 1000000:
162 # write in chunks to make it go faster
163 pepcount = 0
164 [fp.write(''.join(peps)) for peps, fp in
165 target_out_fp.values()]
166 [fp.write(''.join(peps)) for peps, fp in decoy_out_fp.values()]
167 target_out_fp = {fr: ([], pep_fp[1])
168 for fr, pep_fp in target_out_fp.items()}
169 decoy_out_fp = {fr: ([], pep_fp[1])
170 for fr, pep_fp in decoy_out_fp.items()}
171 [fp.write(''.join(peps)) for peps, fp in target_out_fp.values()]
172 [fp.write(''.join(peps)) for peps, fp in decoy_out_fp.values()]
173
174
175 if __name__ == '__main__':
176 main()