Mercurial > repos > mvdbeek > damid_deseq2_to_peaks
annotate damid_deseq2_to_peaks.py @ 0:3fd7995da4fd draft
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/damid_deseq2_to_peaks commit f37f4b741fd81f663d10523e1636039578c5bb55
author | mvdbeek |
---|---|
date | Mon, 07 Jan 2019 12:58:55 -0500 |
parents | |
children | edca422b6cd6 |
rev | line source |
---|---|
0
3fd7995da4fd
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/damid_deseq2_to_peaks commit f37f4b741fd81f663d10523e1636039578c5bb55
mvdbeek
parents:
diff
changeset
|
1 import click |
3fd7995da4fd
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/damid_deseq2_to_peaks commit f37f4b741fd81f663d10523e1636039578c5bb55
mvdbeek
parents:
diff
changeset
|
2 import pandas as pd |
3fd7995da4fd
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/damid_deseq2_to_peaks commit f37f4b741fd81f663d10523e1636039578c5bb55
mvdbeek
parents:
diff
changeset
|
3 import numpy as np |
3fd7995da4fd
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/damid_deseq2_to_peaks commit f37f4b741fd81f663d10523e1636039578c5bb55
mvdbeek
parents:
diff
changeset
|
4 |
3fd7995da4fd
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/damid_deseq2_to_peaks commit f37f4b741fd81f663d10523e1636039578c5bb55
mvdbeek
parents:
diff
changeset
|
5 |
3fd7995da4fd
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/damid_deseq2_to_peaks commit f37f4b741fd81f663d10523e1636039578c5bb55
mvdbeek
parents:
diff
changeset
|
def order_index(df):
    """
    Split chr_start_stop in df index and order by chrom and start.

    The index labels of ``df`` are expected to look like
    ``<chr>_<start>_<stop>``. The chromosome name itself may contain
    underscores (e.g. ``chr1_random``): only the last two ``_``-separated
    fields are interpreted as coordinates.

    :param df: DataFrame whose index holds ``chr_start_stop`` strings.
        It is not modified.
    :returns: a new DataFrame with the columns ``chr``, ``start`` and
        ``stop`` followed by the original columns of ``df``, sorted by
        ``chr`` (as string) and ``start``, and indexed by ``start + 2``.
    :raises ValueError: if ``start``/``stop`` cannot be converted to integers.
    """
    # Split from the right so that underscores inside the chromosome name
    # do not produce additional fields.
    fields = df.index.str.rsplit('_', n=2)
    coordinates = pd.DataFrame.from_records(list(fields), columns=['chr', 'start', 'stop'])
    coordinates = coordinates.astype(dtype={"chr": "object",
                                            "start": "int32",
                                            "stop": "int32"})
    coordinates = coordinates.sort_values(['chr', 'start'])
    # Give the data the same positional index as `coordinates` without
    # touching the caller's frame; join() then aligns row by row and keeps
    # the sorted order of `coordinates`.
    values = df.reset_index(drop=True)
    ordered = coordinates.join(values)
    # index is center of GATC site
    ordered.index = ordered['start'] + 2
    return ordered
3fd7995da4fd
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/damid_deseq2_to_peaks commit f37f4b741fd81f663d10523e1636039578c5bb55
mvdbeek
parents:
diff
changeset
|
24 |
3fd7995da4fd
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/damid_deseq2_to_peaks commit f37f4b741fd81f663d10523e1636039578c5bb55
mvdbeek
parents:
diff
changeset
|
25 |
3fd7995da4fd
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/damid_deseq2_to_peaks commit f37f4b741fd81f663d10523e1636039578c5bb55
mvdbeek
parents:
diff
changeset
|
def significant_gatcs_to_peaks(df, p_value_cutoff, p_value_column=6):
    """
    Merge runs of consecutive significant GATC sites into peaks.

    A site is significant when its value in ``p_value_column`` is strictly
    below ``p_value_cutoff`` (missing values never pass). Neighbouring rows
    on the same chromosome with the same significance state form a run;
    runs of at least two significant sites are reported as peaks.
    Two summary lines (nt covered / not covered by runs of two or more
    sites) are printed to stdout.

    :param df: DataFrame with the columns ``chr``, ``start``, ``stop`` and
        ``p_value_column``, ordered by chromosome and start. It is not
        modified.
    :param p_value_cutoff: upper bound (exclusive) for a significant site.
    :param p_value_column: label of the column holding the p-value;
        defaults to ``6``, the label used by the original implementation.
    :returns: DataFrame with the columns ``chr``, ``start`` and ``stop``,
        one row per peak, indexed by the internal run id.
    """
    # Work on a private copy of the needed columns so that the helper
    # columns below do not leak into the caller's frame.
    sites = df[['chr', 'start', 'stop']].copy()
    # Add `pass` column for sig. GATCs
    sites['pass'] = (df[p_value_column] < p_value_cutoff).astype(int).values
    # Create pass_id column for consecutive pass or no-pass GATCs.
    # diff() is NaN for the first site of each chromosome and non-zero at
    # every change of state, so ne(0) marks the start of each run and
    # cumsum() turns those marks into run ids.
    sites['pass_id'] = sites.groupby('chr')['pass'].diff().ne(0).cumsum()
    runs = sites.groupby('pass_id')
    # String aggregation names avoid the pandas FutureWarning raised for
    # numpy callables; `chr` is constant within a run, so 'first' is enough.
    consecutive_gatcs = runs.aggregate({'chr': 'first', 'start': 'min', 'stop': 'max', 'pass': 'max'})
    # keep only groups with 2 or more GATCS
    consecutive_only = consecutive_gatcs[runs.size() > 1]
    is_peak = consecutive_only['pass'] == 1
    # drop GATC groups that are not significant
    peaks = consecutive_only.loc[is_peak, ['chr', 'start', 'stop']]
    # calculate region that is not covered.
    no_peaks = consecutive_only.loc[~is_peak, ['chr', 'start', 'stop']]
    print("%s nt not covered by peaks" % (no_peaks['stop'] - no_peaks['start']).sum())
    print("%s nt covered by peaks" % (peaks['stop'] - peaks['start']).sum())
    return peaks
3fd7995da4fd
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/damid_deseq2_to_peaks commit f37f4b741fd81f663d10523e1636039578c5bb55
mvdbeek
parents:
diff
changeset
|
48 |
3fd7995da4fd
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/damid_deseq2_to_peaks commit f37f4b741fd81f663d10523e1636039578c5bb55
mvdbeek
parents:
diff
changeset
|
49 |
3fd7995da4fd
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/damid_deseq2_to_peaks commit f37f4b741fd81f663d10523e1636039578c5bb55
mvdbeek
parents:
diff
changeset
|
@click.command()
@click.argument('input_path', type=click.Path(exists=True))
@click.argument('output_path', type=click.Path())
@click.option('--p_value_cutoff', type=float, default=0.01,
              help="Maximum adjusted p-value for a significant GATC site")
def deseq2_gatc_to_peak(input_path, output_path, p_value_cutoff):
    """
    Turn a per-GATC-site result table into a table of peaks.

    INPUT_PATH is read as a tab-separated file without header line whose
    first column holds chr_start_stop identifiers. OUTPUT_PATH receives
    one tab-separated line (chr, start, stop) per peak, without header.
    """
    df = pd.read_csv(input_path, sep='\t', header=None, index_col=0)
    df = order_index(df)
    peaks = significant_gatcs_to_peaks(df, p_value_cutoff)
    # Explicit booleans: write neither column names nor the index.
    peaks.to_csv(output_path, sep='\t', header=False, index=False)


if __name__ == '__main__':
    # click parses sys.argv and supplies the arguments.
    deseq2_gatc_to_peak()