0
|
1 """
|
|
2 Python-code: Merge Scaffold Samples Report files
|
|
3 @author = Brent Kuenzi
|
|
4 @email = Brent.Kuenzi@moffitt.org
|
|
5 """
|
|
6 #######################################################################################
|
|
7 import sys
|
|
8 import urllib2
|
|
9 import os.path
|
6
|
10 import pandas
|
0
|
11 #######################################################################################
|
|
## Description: ##
# Merges either 2 or 3 scaffold sample report files into one table.
## Required input: ##
# All five values are positional command-line arguments; a missing one
# raises IndexError when the module is loaded.
infile1, infile2, infile3, baitfile, outfile = (
    sys.argv[1],  # scaffold report #1 -- filename
    sys.argv[2],  # scaffold report #2 -- filename
    sys.argv[3],  # scaffold report #3 -- filename or "False"
    sys.argv[4],  # Bait file -- filename
    sys.argv[5],  # output filename
)
|
|
class ScaffoldReturn(object):
    """Bundle of the three pieces extracted from one scaffold report.

    Attributes:
        data: value passed as ``getdata`` (the rows below the header).
        proteins: value passed as ``getproteins`` (accession per row).
        header: value passed as ``getheader`` (the header row).
    """

    def __init__(self, getdata, getproteins, getheader):
        # Store the arguments unchanged; no copying or validation is done.
        self.data, self.proteins, self.header = getdata, getproteins, getheader
|
|
class ProteinInfo(object):
    """Pair of per-protein annotation tables.

    Attributes:
        mw: value passed as ``getMW`` (molecular-weight table).
        descr: value passed as ``getDescr`` (protein-description table).
    """

    def __init__(self, getMW, getDescr):
        # Store the arguments unchanged; no copying or validation is done.
        self.mw, self.descr = getMW, getDescr
|
|
def readtab(infile):
    """Read a tab-delimited text file into a list of rows.

    Each line has its leading and trailing whitespace stripped and is then
    split on tab characters, so a blank line yields ``['']``.

    Args:
        infile: path of the text file to read.

    Returns:
        A list with one list of string fields per line of the file.
    """
    with open(infile, 'r') as handle:
        return [raw.strip().split('\t') for raw in handle]
|
|
def read_scaffold(scaffold_input):
    """Get data, proteins and header from a scaffold sample report.

    The header is the first row containing a cell equal to
    "Accession Number". Data rows are everything after the header except
    the final two lines of the file (per the original author's note, a
    blank line and the END OF FILE marker).

    Args:
        scaffold_input: path of the tab-delimited scaffold report.

    Returns:
        ScaffoldReturn whose ``data`` holds the data rows (accession cell
        reduced to its first whitespace-separated token), ``proteins`` the
        accession of each data row in order, and ``header`` the header row.

    Raises:
        ValueError: if no row contains an "Accession Number" cell.
    """
    rows = readtab(scaffold_input)

    # Locate the header row; everything above it is report preamble.
    header_start = None
    for position, row in enumerate(rows):
        if "Accession Number" in row:
            header_start = position
            break
    if header_start is None:
        # Previously this fell through to an UnboundLocalError.
        raise ValueError(
            'No "Accession Number" header row found in %s' % scaffold_input
        )

    header = rows[header_start]
    prot_start = header.index("Accession Number")
    data = rows[header_start + 1:len(rows) - 2]  # cut off blank line and END OF FILE

    proteins = []
    for row in data:
        # Keep only the first token: removes the (+##) that sometimes is attached.
        row[prot_start] = row[prot_start].split()[0]
        proteins.append(row[prot_start])
    return ScaffoldReturn(data, proteins, header)
|
|
def MakeDF(scaffold_input, bait_input):
    """Build a bait-by-protein DataFrame from one scaffold report.

    Only header columns whose name appears as the first field of a row in
    the bait file are kept. Cell values are taken from the report as-is
    (strings); no numeric conversion is done here.

    Args:
        scaffold_input: path of the scaffold sample report.
        bait_input: path of the tab-delimited bait file; the first field of
            each row is matched against the report's header cells.

    Returns:
        pandas.DataFrame with one column per protein accession and one row
        per matched bait column, indexed by bait name.
    """
    # The file's top-level import is a bare `import pandas`, so the `pd`
    # alias used below is bound locally to keep this function self-contained.
    import pandas as pd

    bait = readtab(bait_input)
    # Parse the report once instead of once per attribute.
    scaffold = read_scaffold(scaffold_input)
    data = scaffold.data
    header = scaffold.header
    proteins = scaffold.proteins
    prot_start = header.index("Accession Number")

    bait_index = []
    ind = []
    for entry in bait:
        if entry[0] in header:
            bait_index.append(header.index(entry[0]))  # Find just the baits defined in bait file
            ind.append(entry[0])

    # One (initially empty) value list per protein accession.
    frames = {}
    for accession in proteins:
        frames.setdefault(accession.split()[0], [])

    # Fill each protein's list with its value in every selected bait column.
    for row in data:
        protein = row[prot_start].split()[0]
        for column in bait_index:
            frames[protein].append(row[column])

    return pd.DataFrame(frames, index=ind)
|
|
def get_info(input1, input2, input3):
    """Collect protein description and molecular weight across reports.

    For every report, the description column is the last header cell
    containing "Identified" and the weight column is the last header cell
    containing "Molecular Weight"; if no cell matches, column 0 is used.
    The first occurrence of each accession (in file order) wins.

    Args:
        input1: path of scaffold report #1.
        input2: path of scaffold report #2.
        input3: path of scaffold report #3, or the string "False" to skip it.

    Returns:
        ProteinInfo whose ``mw`` is a one-row DataFrame indexed
        "Molecular Weight" and whose ``descr`` is a one-row DataFrame
        indexed "Identified Proteins", both with one column per accession.
    """
    # The file's top-level import is a bare `import pandas`, so the `pd`
    # alias used below is bound locally to keep this function self-contained.
    import pandas as pd

    files = [input1, input2]
    if input3 != "False":
        files.append(input3)

    molwt = {}
    protdesc = {}
    for path in files:
        # Parse each report once instead of once per attribute.
        scaffold = read_scaffold(path)
        data = scaffold.data
        header = scaffold.header
        prot_start = header.index("Accession Number")  # find header

        # Locate the info columns; a later matching header cell overrides
        # an earlier one, and 0 is the fallback when nothing matches.
        prot = 0
        mw = 0
        for position, title in enumerate(header):
            if "Identified" in title:
                prot = position
            if "Molecular Weight" in title:
                mw = position

        for row in data:  # append info
            protein = row[prot_start].split()[0]
            if protein not in protdesc:
                protdesc[protein] = row[prot]
                molwt[protein] = row[mw]

    mw_frame = pd.DataFrame(molwt, index=["Molecular Weight"])
    descr_frame = pd.DataFrame(protdesc, index=["Identified Proteins"])
    return ProteinInfo(mw_frame, descr_frame)  # return info as dataframe
|
|
def MakeFile(input1, input2, input3, bait_input, outfile=None):
    """Merge 2 or 3 scaffold reports and write them as a tab-delimited file.

    The written table has one row per protein accession, a 1-based "#"
    index, the description and molecular weight columns, one column per
    bait row produced by MakeDF for each report, and a final
    "Accession Number" column. Missing values are written as 0.

    Args:
        input1: path of scaffold report #1.
        input2: path of scaffold report #2.
        input3: path of scaffold report #3, or the string "False" to skip it.
        bait_input: path of the bait file passed on to MakeDF.
        outfile: path to write; when omitted, the module-level ``outfile``
            is used (the behaviour before this parameter existed).
    """
    # The file's top-level import is a bare `import pandas`, so the `pd`
    # alias used below is bound locally to keep this function self-contained.
    import pandas as pd

    if outfile is None:
        # The parameter shadows the module-level name, so read it explicitly.
        outfile = globals()["outfile"]

    files = [input1, input2]
    if input3 != "False":
        files.append(input3)

    # Gather the annotation tables once, then one bait table per report.
    info = get_info(input1, input2, input3)
    DFs = [info.descr, info.mw]
    for path in files:
        DFs.append(MakeDF(path, bait_input))

    final_df = pd.concat(DFs)
    temp = final_df.T  # proteins become rows
    temp.index.name = "Accession Number"
    output = temp.fillna(0)
    # `Index.get_values()` was removed in pandas 1.0; `.values` works on
    # both old and new releases.
    output["Accession Number"] = output.index.values
    output.index = range(1, len(output.index) + 1)
    output.index.name = "#"
    output.to_csv(outfile, sep="\t")
|
|
124
|
|
125 #######################################################################################
|
|
# Run the merge. The result is written to the module-level `outfile`
# (taken from sys.argv[5]); the previous `outfile="merged.txt"` keyword was
# not part of MakeFile's four-argument signature and raised TypeError.
MakeFile(input1=infile1, input2=infile2, input3=infile3, bait_input=baitfile)
|
|
127 #######################################################################################
|