Mercurial > repos > pravs > msms_extractor
comparison MSMS_Extractor.py @ 0:093015b1b904 draft
planemo upload
author | pravs |
---|---|
date | Mon, 13 Feb 2017 02:44:05 -0500 |
parents | |
children | aa944e3a353c |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:093015b1b904 |
---|---|
1 # | |
2 # Developed by Praveen Kumar | |
3 # Galaxy-P Team (Griffin's Lab) | |
4 # University of Minnesota | |
5 # | |
6 # | |
7 | |
8 def main(): | |
9 from pyteomics import mzml | |
10 import os | |
11 import sys | |
12 import shutil | |
13 import subprocess | |
14 import re | |
15 import pandas as pd | |
16 from operator import itemgetter | |
17 from itertools import groupby | |
18 if len(sys.argv) >= 5: | |
19 # Start of Reading Scans from PSM file | |
20 # Creating dictionary of PSM file: key = filename key = list of scan numbers | |
21 ScanFile = sys.argv[2] | |
22 spectrumTitleList = list(pd.read_csv(ScanFile, "\t")['Spectrum Title']) | |
23 scanFileNumber = [[".".join(each.split(".")[:-3]), int(each.split(".")[-2:-1][0])] for each in spectrumTitleList] | |
24 scanDict = {} | |
25 for each in scanFileNumber: | |
26 if scanDict.has_key(each[0]): | |
27 scanDict[each[0]].append(int(each[1])) | |
28 else: | |
29 scanDict[each[0]] = [int(each[1])] | |
30 # End of Reading Scans from PSM file | |
31 | |
32 inputPath = sys.argv[1] | |
33 ##outPath = "/".join(sys.argv[3].split("/")[:-1]) | |
34 outPath = sys.argv[3] | |
35 ##outFile = sys.argv[3].split("/")[-1] | |
36 allScanList = [] | |
37 | |
38 # Read all scan numbers using indexedmzML/indexList/index/offset tags | |
39 for k in mzml.read(inputPath).iterfind('indexedmzML/indexList/index/offset'): | |
40 if re.search("scan=(\d+)", k['idRef']): | |
41 a = re.search("scan=(\d+)", k['idRef']) | |
42 allScanList.append(int(a.group(1))) | |
43 # End of Reading mzML file | |
44 | |
45 fraction_name = sys.argv[4] | |
46 if scanDict.has_key(fraction_name): | |
47 scan2remove = scanDict[fraction_name] | |
48 else: | |
49 scan2remove = [] | |
50 scan2retain = list(set(allScanList) - set(scan2remove)) | |
51 scan2retain.sort() | |
52 scansRemoved = list(set(allScanList) - set(scan2retain)) | |
53 # scan2retain contains scans that is to be retained | |
54 | |
55 # Print Stats | |
56 print >> sys.stdout,"Total number of Scan Numbers: %d" % len(list(set(allScanList))) | |
57 print >> sys.stdout,"Number of Scans to remove: %d" % len(list(set(scan2remove))) | |
58 print >> sys.stdout,"Number of Scans retained: %d" % len(scan2retain) | |
59 print >> sys.stdout,"Number of Scans removed: %d" % len(scansRemoved) | |
60 | |
61 | |
62 # Identifying groups of continuous numbers in the scan2retain and creating scanString | |
63 scanString = "" | |
64 for a, b in groupby(enumerate(scan2retain), lambda(i,x):i-x): | |
65 x = map(itemgetter(1), b) | |
66 scanString = scanString + "["+str(x[0])+","+str(x[-1])+"] " | |
67 # end identifying | |
68 | |
69 # start create filter file | |
70 filter_file = open("filter.txt", "w") | |
71 filter_file.write("filter=scanNumber %s\n" % scanString) | |
72 filter_file.close() | |
73 # end create filter file | |
74 | |
75 # Prepare command for msconvert | |
76 inputFile = fraction_name+".mzML" | |
77 os.symlink(inputPath,inputFile) | |
78 outFile = "filtered_"+fraction_name+".mzML" | |
79 # msconvert_command = "msconvert " + inputFile + " --filter " + "\"scanNumber " + scanString + " \" " + " --outfile " + outFile + " --mzML --zlib" | |
80 msconvert_command = "msconvert " + inputFile + " -c filter.txt " + " --outfile " + outFile + " --mzML --zlib" | |
81 | |
82 | |
83 # Run msconvert | |
84 try: | |
85 subprocess.check_output(msconvert_command, stderr=subprocess.STDOUT, shell=True) | |
86 except subprocess.CalledProcessError as e: | |
87 sys.stderr.write( "msconvert resulted in error: %s: %s" % ( e.returncode, e.output )) | |
88 sys.exit(e.returncode) | |
89 # Copy output to | |
90 shutil.copyfile(outFile, outPath) | |
91 | |
92 else: | |
93 print "Please contact the admin. Number of inputs are not sufficient to run the program.\n" | |
94 | |
95 if __name__ == "__main__": | |
96 main() | |
97 | |
98 | |
99 | |
100 |