Mercurial > repos > cpt > cpt_disruptin_table
view Disruptin_hydrophobicity_helicity_table_package.py @ 1:a99be535e99d draft
planemo upload commit 94b0cd1fff0826c6db3e7dc0c91c0c5a8be8bb0c
author | cpt |
---|---|
date | Mon, 05 Jun 2023 02:41:05 +0000 |
parents | |
children |
line wrap: on
line source
""" This program is intended to create the output table for the disruptin finder workflow """ from Bio import SeqIO from Bio.SeqUtils.ProtParam import ProteinAnalysis from Bio.SeqUtils import ProtParamData import csv import argparse import sys def disruptin_table(garnier_file, fasta_file): # Iterable variables position = 1 net_charge = 0 charge_res = 0 record_number = 0 # loop structures names = [] sec_struct = [] # reading the lines from the garnier csv file # with open(garnier_file,'r') as csvfile: # garnierreader = csv.reader(csvfile) for row in garnier_file: if row[0] == "Sequence: ": names += [row[1]] elif row[0] in "HETC": row = row.split("\t") sec_struct += ["".join(row)] record = [] p = [] r = [] c = [] h = [] s = [] # Parse the .fasta file and get the sequence for rec in SeqIO.parse(fasta_file, "fasta"): sequence = str(rec.seq) # Set up the information vectors: for position #, residue, hydrophobic/charge/polar/nonpolar, and secondary # structure record += [rec.id] position_vec = [] residue_vec = [] charge_sym_vec = [] sec_struct_vec = [] for aa in sequence: position_vec += [str(position)] residue_vec += [str(aa)] sec_struct_vec += [str(sec_struct[record_number][position - 1])] # For R and K residues a positive charge is given if aa in "RK": symbol = "+" # For D and E residues a negative charge is given elif aa in "DE": symbol = "-" elif aa in "AVMILPWFG": symbol = "N" elif aa in "HSYTCQN": symbol = "P" charge_sym_vec += symbol position += 1 # Calculating hyrophobicity based on Kyte and Doolittle scale. Using binning value of 9. Since the binning # is 9, the first 4 residues and last 4 residues as set blank so as to center the values to their # approximate position on the sequence. prot_ana_seq = ProteinAnalysis(sequence) hydro = [0] * 4 + prot_ana_seq.protein_scale(ProtParamData.kd, 9) + [0] * 4 record_number += 1 position = 1 p += [position_vec] r += [residue_vec] c += [charge_sym_vec] h += [hydro] s += [sec_struct_vec] # returns values for name of the sequence return record, p, r, c, h, s if __name__ == "__main__": # Grab all of the filters from our plugin loader parser = argparse.ArgumentParser(description="Disruptin Table Output") parser.add_argument( "garnier_file", type=argparse.FileType("r"), help="csv file from garnier reader" ) parser.add_argument( "fasta_file", type=argparse.FileType("r"), help="fasta file of disruptin candidates", ) args = parser.parse_args() # Set up output location # f = open(sys.stdout, 'w', newline='') # writer1 = csv.writer(f) iden, position, residue, charge, hydro, struct = disruptin_table(**vars(args)) for i in range(len(iden)): # writer1.writerow(['Protein ID']+[iden[i]]) # writer1.writerow(['Position'] + [format(x, 's') for x in position[i]]) # writer1.writerow(['Residue'] + [format(x, 's') for x in residue[i]]) # writer1.writerow(['Charge'] + [format(x, 's') for x in charge[i]]) # writer1.writerow(['Hydrophobicity'] + [format(x, '.3f') for x in hydro[i]]) # writer1.writerow(['Secondary Structure'] + [format(x, 's') for x in struct[i]]) # writer1.writerow(['']) print(str(iden[i])) print("Position \t " + "\t".join(position[i])) print("Residue \t" + "\t".join(residue[i])) print("Charge \t" + "\t".join(charge[i])) print("Hydrophobicity \t" + "\t".join(format(x, ".3f") for x in hydro[i])) print("Secondary Structure \t" + "\t".join(struct[i]))