Mercurial > repos > galaxyp > msms_extractor
comparison msms_extractor.py @ 0:4bef80b09854 draft default tip
"planemo upload commit 498440b547e1feaa6a81764b55ac8208626d70a8"
author | galaxyp |
---|---|
date | Tue, 29 Oct 2019 11:09:46 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:4bef80b09854 |
---|---|
1 # | |
2 # Developed by Praveen Kumar | |
3 # Galaxy-P Team (Griffin's Lab) | |
4 # University of Minnesota | |
5 # | |
6 # | |
7 # | |
8 | |
9 | |
10 from pyteomics import mzml | |
11 import os | |
12 import sys | |
13 import shutil | |
14 import subprocess | |
15 import re | |
16 import pandas as pd | |
17 from operator import itemgetter | |
18 from itertools import groupby | |
19 import random | |
20 import argparse | |
21 | |
22 def main(): | |
23 if len(sys.argv) >= 7: | |
24 parser = argparse.ArgumentParser() | |
25 parser.add_argument("msms", help="mzML File") | |
26 parser.add_argument("psm", help="PSM Report File") | |
27 parser.add_argument("out", help="Output filename") | |
28 parser.add_argument("filestring", help="MSMS File string as identifier") | |
29 parser.add_argument("remove_retain", help="Remove scans reported in the PSM report") | |
30 parser.add_argument("random", help="Random MSMS scans used with to use with --retain (default=0)", default=0) | |
31 args = parser.parse_args() | |
32 # Start of Reading Scans from PSM file | |
33 # Creating dictionary of PSM file: key = filename key = list of scan numbers | |
34 | |
35 # removeORretain = sys.argv[5].strip() | |
36 # randomScans = int(sys.argv[6].strip()) | |
37 | |
38 removeORretain = args.remove_retain | |
39 randomScans = int(args.random) | |
40 | |
41 | |
42 # ScanFile = sys.argv[2] | |
43 ScanFile = args.psm | |
44 spectrumTitleList = list(pd.read_csv(ScanFile, "\t")['Spectrum Title']) | |
45 scanFileNumber = [[".".join(each.split(".")[:-3]), int(each.split(".")[-2:-1][0])] for each in spectrumTitleList] | |
46 scanDict = {} | |
47 for each in scanFileNumber: | |
48 if each[0] in scanDict.keys(): | |
49 scanDict[each[0]].append(int(each[1])) | |
50 else: | |
51 scanDict[each[0]] = [int(each[1])] | |
52 # End of Reading Scans from PSM file | |
53 | |
54 # inputPath = sys.argv[1] | |
55 inputPath = args.msms | |
56 ##outPath = "/".join(sys.argv[3].split("/")[:-1]) | |
57 # outPath = sys.argv[3] | |
58 outPath = args.out | |
59 ##outFile = sys.argv[3].split("/")[-1] | |
60 allScanList = [] | |
61 # Read all scan numbers using indexedmzML/indexList/index/offset tags | |
62 for k in mzml.read(inputPath).iterfind('indexedmzML/indexList/index/offset'): | |
63 if re.search("scan=(\d+)", k['idRef']): | |
64 a = re.search("scan=(\d+)", k['idRef']) | |
65 allScanList.append(int(a.group(1))) | |
66 allScanList = list(set(allScanList)) | |
67 # End of Reading mzML file | |
68 # fraction_name = sys.argv[4] | |
69 fraction_name = args.filestring | |
70 if fraction_name in scanDict.keys(): | |
71 scansInList = scanDict[fraction_name] | |
72 else: | |
73 scansInList = [] | |
74 scansNotInList = list(set(allScanList) - set(scansInList)) | |
75 flag = 0 | |
76 if removeORretain == "remove": | |
77 scan2retain = scansNotInList | |
78 scan2retain = list(set(scan2retain)) | |
79 scan2retain.sort() | |
80 scansRemoved = scansInList | |
81 # scan2retain contains scans that is to be retained | |
82 elif removeORretain == "retain" and randomScans < len(scansNotInList): | |
83 # Randomly select spectra | |
84 random_scans = random.sample(scansNotInList, randomScans) | |
85 | |
86 scan2retain = random_scans + scansInList | |
87 scan2retain = list(set(scan2retain)) | |
88 scan2retain.sort() | |
89 scansRemoved = list(set(allScanList) - set(scan2retain)) | |
90 # scan2retain contains scans that is to be retained | |
91 else: | |
92 flag = 1 | |
93 | |
94 if flag == 1: | |
95 scan2retain = scansInList | |
96 scan2retain = list(set(scan2retain)) | |
97 scan2retain.sort() | |
98 scansRemoved = list(set(allScanList) - set(scan2retain)) | |
99 | |
100 # scan2retain contains scans that is to be retained | |
101 print("ERROR: Number of Random Scans queried is more than available. The result has provided zero random scans.", file=sys.stdout) | |
102 print("Number of available scans for random selection: " + str(len(scansNotInList)), file=sys.stdout) | |
103 print("Try a number less than the available number. Thanks!!", file=sys.stdout) | |
104 print("Number of Scans retained: " + str(len(scan2retain)), file=sys.stdout) | |
105 else: | |
106 # Print Stats | |
107 print("Total number of Scan Numbers: " + str(len(list(set(allScanList)))), file=sys.stdout) | |
108 print("Number of Scans retained: " + str(len(scan2retain)), file=sys.stdout) | |
109 print("Number of Scans removed: " + str(len(scansRemoved)), file=sys.stdout) | |
110 | |
111 | |
112 # Identifying groups of continuous numbers in the scan2retain and creating scanString | |
113 scanString = "" | |
114 for a, b in groupby(enumerate(scan2retain), lambda x:x[1]-x[0]): | |
115 x = list(map(itemgetter(1), b)) | |
116 scanString = scanString + "["+str(x[0])+","+str(x[-1])+"] " | |
117 # end identifying | |
118 # start create filter file | |
119 filter_file = open("filter.txt", "w") | |
120 filter_file.write("filter=scanNumber %s\n" % scanString) | |
121 filter_file.close() | |
122 # end create filter file | |
123 | |
124 # Prepare command for msconvert | |
125 inputFile = fraction_name+".mzML" | |
126 os.symlink(inputPath,inputFile) | |
127 outFile = "filtered_"+fraction_name+".mzML" | |
128 # msconvert_command = "msconvert " + inputFile + " --filter " + "\"scanNumber " + scanString + " \" " + " --outfile " + outFile + " --mzML --zlib" | |
129 msconvert_command = "msconvert " + inputFile + " -c filter.txt " + " --outfile " + outFile + " --mzML --zlib" | |
130 | |
131 | |
132 # Run msconvert | |
133 try: | |
134 subprocess.check_output(msconvert_command, stderr=subprocess.STDOUT, shell=True) | |
135 except subprocess.CalledProcessError as e: | |
136 sys.stderr.write( "msconvert resulted in error: %s: %s" % ( e.returncode, e.output )) | |
137 sys.exit(e.returncode) | |
138 # Copy output to | |
139 shutil.copyfile(outFile, outPath) | |
140 else: | |
141 print("Please contact the admin. Number of inputs are not sufficient to run the program.\n") | |
142 | |
143 if __name__ == "__main__": | |
144 main() | |
145 | |
146 | |
147 | |
148 |