annotate damid_deseq2_to_peaks.py @ 0:3fd7995da4fd draft

planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/damid_deseq2_to_peaks commit f37f4b741fd81f663d10523e1636039578c5bb55
author mvdbeek
date Mon, 07 Jan 2019 12:58:55 -0500
parents
children edca422b6cd6
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
3fd7995da4fd planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/damid_deseq2_to_peaks commit f37f4b741fd81f663d10523e1636039578c5bb55
mvdbeek
parents:
diff changeset
1 import click
3fd7995da4fd planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/damid_deseq2_to_peaks commit f37f4b741fd81f663d10523e1636039578c5bb55
mvdbeek
parents:
diff changeset
2 import pandas as pd
3fd7995da4fd planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/damid_deseq2_to_peaks commit f37f4b741fd81f663d10523e1636039578c5bb55
mvdbeek
parents:
diff changeset
3 import numpy as np
3fd7995da4fd planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/damid_deseq2_to_peaks commit f37f4b741fd81f663d10523e1636039578c5bb55
mvdbeek
parents:
diff changeset
4
3fd7995da4fd planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/damid_deseq2_to_peaks commit f37f4b741fd81f663d10523e1636039578c5bb55
mvdbeek
parents:
diff changeset
5
3fd7995da4fd planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/damid_deseq2_to_peaks commit f37f4b741fd81f663d10523e1636039578c5bb55
mvdbeek
parents:
diff changeset
def order_index(df):
    """
    Split chr_start_stop in df index and order by chrom and start.

    The index of ``df`` is expected to hold strings of the form
    ``<chr>_<start>_<stop>``. The three parts become the leading ``chr``,
    ``start`` and ``stop`` columns of the result, followed by the original
    columns of ``df``. Rows are sorted by ``chr`` and then ``start``.

    The input frame is not modified; a new DataFrame is returned whose
    index is ``start + 2``.
    """
    # Split from the right: only the last two fields are coordinates, so
    # chromosome/contig names that contain underscores themselves
    # (e.g. ``chr1_gl000191_random``) are kept in one piece.
    fields = df.index.str.rsplit('_', n=2)
    coordinates = pd.DataFrame.from_records(
        list(fields), columns=['chr', 'start', 'stop'])
    coordinates = coordinates.astype(dtype={"chr": "object",
                                            "start": "int32",
                                            "stop": "int32"})
    coordinates = coordinates.sort_values(['chr', 'start'])
    # Key the data by row position on a copy, so it can be joined to the
    # sorted coordinates (which still carry their original row positions as
    # index) without touching the caller's frame.
    values = df.reset_index(drop=True)
    ordered = coordinates.join(values)
    # index is center of GATC site
    ordered.index = ordered['start'] + 2
    return ordered
3fd7995da4fd planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/damid_deseq2_to_peaks commit f37f4b741fd81f663d10523e1636039578c5bb55
mvdbeek
parents:
diff changeset
24
3fd7995da4fd planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/damid_deseq2_to_peaks commit f37f4b741fd81f663d10523e1636039578c5bb55
mvdbeek
parents:
diff changeset
25
3fd7995da4fd planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/damid_deseq2_to_peaks commit f37f4b741fd81f663d10523e1636039578c5bb55
mvdbeek
parents:
diff changeset
def significant_gatcs_to_peaks(df, p_value_cutoff):
    """
    Merge runs of consecutive significant GATC sites into peaks.

    ``df`` must provide ``chr``, ``start`` and ``stop`` columns plus a
    column labelled ``6`` whose values are compared against
    ``p_value_cutoff``; a site is significant when that value is strictly
    below the cutoff. Runs are detected between adjacent rows of the same
    ``chr``, so rows are expected to be ordered by chromosome and start.

    Returns a DataFrame with columns ``chr``, ``start``, ``stop``: one row
    per run of two or more consecutive significant sites, spanning the
    smallest start to the largest stop of the run. Two summary lines
    (summed lengths of non-significant and significant runs of two or more
    sites) are printed to stdout. ``df`` itself is left unmodified.
    """
    # Work on a small private frame so no helper columns leak into the
    # caller's DataFrame.
    sites = df[['chr', 'start', 'stop']].copy()
    # 1 for significant GATCs, 0 otherwise (NaN compares as not significant)
    sites['pass'] = (df[6] < p_value_cutoff).to_numpy().astype(int)
    # Run id for consecutive pass or no-pass GATCs: the per-chromosome diff
    # is non-zero on every value change and NaN on the first row of each
    # chromosome, so both start a new run.
    sites['pass_id'] = (
        sites.groupby('chr')['pass'].diff().ne(0).cumsum().to_numpy())
    runs = sites.groupby('pass_id')
    # String aggregation names select the native groupby reductions;
    # passing numpy callables here is deprecated in recent pandas.
    consecutive_gatcs = runs.aggregate(
        {'chr': 'min', 'start': 'min', 'stop': 'max', 'pass': 'max'})
    # keep only groups with 2 or more GATCs
    consecutive_only = consecutive_gatcs[runs.size() > 1]
    coordinate_columns = ['chr', 'start', 'stop']
    # drop GATC groups that are not significant
    peaks = consecutive_only.loc[
        consecutive_only['pass'] == 1, coordinate_columns]
    # calculate region that is not covered.
    no_peaks = consecutive_only.loc[
        consecutive_only['pass'] == 0, coordinate_columns]
    uncovered = no_peaks['stop'] - no_peaks['start']
    print("%s nt not covered by peaks" % uncovered.sum())
    covered = peaks['stop'] - peaks['start']
    print("%s nt covered by peaks" % covered.sum())
    return peaks
3fd7995da4fd planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/damid_deseq2_to_peaks commit f37f4b741fd81f663d10523e1636039578c5bb55
mvdbeek
parents:
diff changeset
48
3fd7995da4fd planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/damid_deseq2_to_peaks commit f37f4b741fd81f663d10523e1636039578c5bb55
mvdbeek
parents:
diff changeset
49
3fd7995da4fd planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/damid_deseq2_to_peaks commit f37f4b741fd81f663d10523e1636039578c5bb55
mvdbeek
parents:
diff changeset
@click.command()
@click.argument('input_path', type=click.Path(exists=True))
@click.argument('output_path', type=click.Path())
@click.option('--p_value_cutoff', type=float, default=0.01, help="Maximum adjusted p-value for a significant GATC site")
def deseq2_gatc_to_peak(input_path, output_path, p_value_cutoff):
    """Convert a per-GATC DESeq2 result table into peak intervals.

    INPUT_PATH is a headerless tab-separated table whose first column holds
    chr_start_stop identifiers. OUTPUT_PATH receives headerless
    tab-separated chr, start, stop rows, one per peak.
    """
    # No header row; the first column (chr_start_stop ids) becomes the
    # index, so the remaining columns keep their integer labels 1..n.
    df = pd.read_csv(input_path, sep='\t', header=None, index_col=0)
    df = order_index(df)
    peaks = significant_gatcs_to_peaks(df, p_value_cutoff)
    # Write bare coordinate rows: no header line and no index column.
    peaks.to_csv(output_path, sep='\t', header=False, index=False)
3fd7995da4fd planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/damid_deseq2_to_peaks commit f37f4b741fd81f663d10523e1636039578c5bb55
mvdbeek
parents:
diff changeset
59
3fd7995da4fd planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/damid_deseq2_to_peaks commit f37f4b741fd81f663d10523e1636039578c5bb55
mvdbeek
parents:
diff changeset
60
3fd7995da4fd planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/damid_deseq2_to_peaks commit f37f4b741fd81f663d10523e1636039578c5bb55
mvdbeek
parents:
diff changeset
if __name__ == "__main__":
    # Run the command-line entry point only when executed as a script.
    deseq2_gatc_to_peak()