Mercurial > repos > cpt > cpt_disruptin_finder
comparison disruptin_finder.py @ 1:b973bc75693d draft
planemo upload commit 94b0cd1fff0826c6db3e7dc0c91c0c5a8be8bb0c
author | cpt |
---|---|
date | Mon, 05 Jun 2023 02:40:49 +0000 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
0:3f0d07a10405 | 1:b973bc75693d |
---|---|
1 """ | |
2 This program is intended to find gene products that would be acceptable disruptin candidates. | |
3 | |
4 The criteria can be toggled between selecting for proteins with: | |
5 - net charge above a give threshold (default = +4) and length less than given threshold (default = 100 aa) | |
6 OR | |
7 - ratio of number of charged residues to length of the sequence above a given threshold (default = 0.25 residue/aa) | |
8 and length less than given threshold (default = 100 aa) | |
9 OR | |
10 - net charge above a give threshold (default = +4), ratio of number of charged residues to length of the sequence | |
11 above a given threshold (default = 0.25 residue/aa), and length less than given threshold (default = 100 aa) | |
12 | |
13 Net charge of a sequence is calculated so that for every R or K residue the net charge increases by one, and for every | |
14 D or E residue the net charge decreases by one. The ratio of charged residues to length is calculated in a similar manner. | |
15 The residues R, K, D, and E each increase the number of charged residues by one, and total for the sequence is then | |
16 divided by the length to get the ratio. | |
17 | |
18 Input a multi fasta file with all of the predicted protein sequences from the genome as well as a threshold | |
19 sequence length, net charge, and charge residue to length ratio. The program outputs another fasta file. | |
20 The output fasta file includes records for all the sequences meeting the size and charge criteria. | |
21 | |
22 """ | |
23 | |
24 from Bio import SeqIO | |
25 import argparse | |
26 import sys | |
27 | |
28 | |
29 def disruptin_finder( | |
30 fasta_file, thresh_size, thresh_net_charge, thresh_charge_ratio, selection_criteria | |
31 ): | |
32 # Iterable variables | |
33 net_charge = 0 | |
34 charge_res = 0 | |
35 | |
36 # Create record variable to store record information | |
37 total_record = [] | |
38 | |
39 # Parse the .fasta file and get the sequence | |
40 for rec in SeqIO.parse(fasta_file, "fasta"): | |
41 sequence = str(rec.seq) | |
42 | |
43 if len(sequence) <= thresh_size: | |
44 for aa in sequence: | |
45 # For R and K residues a positive charge is given | |
46 if aa in "RK": | |
47 net_charge += 1 | |
48 charge_res += 1 | |
49 # For D and E residues a negative charge is given | |
50 elif aa in "DE": | |
51 net_charge -= 1 | |
52 charge_res += 1 | |
53 | |
54 # Charge (total charged residues) to size ratio is calculated | |
55 Length = len(sequence) | |
56 charge_ratio = float(charge_res) / float(Length) | |
57 | |
58 # Based on the user-specified selection criteria a list of records is compiled | |
59 if selection_criteria == "net": | |
60 if net_charge >= thresh_net_charge: | |
61 total_record = total_record + [rec] | |
62 elif selection_criteria == "ratio": | |
63 if charge_ratio >= thresh_charge_ratio: | |
64 total_record = total_record + [rec] | |
65 elif selection_criteria == "both": | |
66 if ( | |
67 charge_ratio >= thresh_charge_ratio | |
68 and net_charge >= thresh_net_charge | |
69 ): | |
70 total_record = total_record + [rec] | |
71 | |
72 # Reset the iterable variables | |
73 net_charge = 0 | |
74 charge_res = 0 | |
75 | |
76 # The total list of records is returned by the function | |
77 yield total_record | |
78 | |
79 | |
80 if __name__ == "__main__": | |
81 # Grab all of the filters from our plugin loader | |
82 parser = argparse.ArgumentParser(description="Disruptin Finder") | |
83 parser.add_argument( | |
84 "fasta_file", type=argparse.FileType("r"), help="Multi-FASTA Input" | |
85 ) | |
86 parser.add_argument("--thresh_net_charge", type=int, default=4) | |
87 parser.add_argument("--thresh_size", type=int, default=100) | |
88 parser.add_argument("--thresh_charge_ratio", type=float, default=0.25) | |
89 parser.add_argument("--selection_criteria", action="store") | |
90 args = parser.parse_args() | |
91 | |
92 for seq in disruptin_finder(**vars(args)): | |
93 SeqIO.write(seq, sys.stdout, "fasta") |