Mercurial > repos > pravs > msms_extractor
comparison MSMS_Extractor.py @ 2:aa944e3a353c draft
planemo upload
author | pravs |
---|---|
date | Thu, 03 Aug 2017 13:53:09 -0400 |
parents | 093015b1b904 |
children | c2f8e3164537 |
comparison
equal
deleted
inserted
replaced
1:f444e529363d | 2:aa944e3a353c |
---|---|
1 # | 1 # |
2 # Developed by Praveen Kumar | 2 # Developed by Praveen Kumar |
3 # Galaxy-P Team (Griffin's Lab) | 3 # Galaxy-P Team (Griffin's Lab) |
4 # University of Minnesota | 4 # University of Minnesota |
5 # | |
5 # | 6 # |
6 # | 7 # |
7 | 8 |
8 def main(): | 9 def main(): |
9 from pyteomics import mzml | 10 from pyteomics import mzml |
13 import subprocess | 14 import subprocess |
14 import re | 15 import re |
15 import pandas as pd | 16 import pandas as pd |
16 from operator import itemgetter | 17 from operator import itemgetter |
17 from itertools import groupby | 18 from itertools import groupby |
18 if len(sys.argv) >= 5: | 19 import random |
20 | |
21 if len(sys.argv) >= 7: | |
19 # Start of Reading Scans from PSM file | 22 # Start of Reading Scans from PSM file |
20 # Creating dictionary of PSM file: key = filename key = list of scan numbers | 23 # Creating dictionary of PSM file: key = filename key = list of scan numbers |
24 | |
25 removeORretain = sys.argv[5].strip() | |
26 randomScans = int(sys.argv[6].strip()) | |
27 | |
21 ScanFile = sys.argv[2] | 28 ScanFile = sys.argv[2] |
22 spectrumTitleList = list(pd.read_csv(ScanFile, "\t")['Spectrum Title']) | 29 spectrumTitleList = list(pd.read_csv(ScanFile, "\t")['Spectrum Title']) |
23 scanFileNumber = [[".".join(each.split(".")[:-3]), int(each.split(".")[-2:-1][0])] for each in spectrumTitleList] | 30 scanFileNumber = [[".".join(each.split(".")[:-3]), int(each.split(".")[-2:-1][0])] for each in spectrumTitleList] |
24 scanDict = {} | 31 scanDict = {} |
25 for each in scanFileNumber: | 32 for each in scanFileNumber: |
32 inputPath = sys.argv[1] | 39 inputPath = sys.argv[1] |
33 ##outPath = "/".join(sys.argv[3].split("/")[:-1]) | 40 ##outPath = "/".join(sys.argv[3].split("/")[:-1]) |
34 outPath = sys.argv[3] | 41 outPath = sys.argv[3] |
35 ##outFile = sys.argv[3].split("/")[-1] | 42 ##outFile = sys.argv[3].split("/")[-1] |
36 allScanList = [] | 43 allScanList = [] |
37 | |
38 # Read all scan numbers using indexedmzML/indexList/index/offset tags | 44 # Read all scan numbers using indexedmzML/indexList/index/offset tags |
39 for k in mzml.read(inputPath).iterfind('indexedmzML/indexList/index/offset'): | 45 for k in mzml.read(inputPath).iterfind('indexedmzML/indexList/index/offset'): |
40 if re.search("scan=(\d+)", k['idRef']): | 46 if re.search("scan=(\d+)", k['idRef']): |
41 a = re.search("scan=(\d+)", k['idRef']) | 47 a = re.search("scan=(\d+)", k['idRef']) |
42 allScanList.append(int(a.group(1))) | 48 allScanList.append(int(a.group(1))) |
43 # End of Reading mzML file | 49 # End of Reading mzML file |
44 | 50 |
45 fraction_name = sys.argv[4] | 51 fraction_name = sys.argv[4] |
46 if scanDict.has_key(fraction_name): | 52 if scanDict.has_key(fraction_name): |
47 scan2remove = scanDict[fraction_name] | 53 scansInList = scanDict[fraction_name] |
48 else: | 54 else: |
49 scan2remove = [] | 55 scansInList = [] |
50 scan2retain = list(set(allScanList) - set(scan2remove)) | 56 scansNotInList = list(set(allScanList) - set(scansInList)) |
51 scan2retain.sort() | |
52 scansRemoved = list(set(allScanList) - set(scan2retain)) | |
53 # scan2retain contains scans that is to be retained | |
54 | 57 |
58 if removeORretain == "remove": | |
59 scan2retain = scansNotInList | |
60 scan2retain.sort() | |
61 scansRemoved = scansInList | |
62 # scan2retain contains scans that is to be retained | |
63 | |
64 elif removeORretain == "retain": | |
65 # Randomly select spectra | |
66 random_scans = list(map(lambda _: random.choice(scansNotInList), range(randomScans))) | |
67 | |
68 scan2retain = random_scans + scansInList | |
69 scan2retain.sort() | |
70 scansRemoved = list(set(allScanList) - set(scan2retain)) | |
71 # scan2retain contains scans that is to be retained | |
72 | |
55 # Print Stats | 73 # Print Stats |
56 print >> sys.stdout,"Total number of Scan Numbers: %d" % len(list(set(allScanList))) | 74 print >> sys.stdout,"Total number of Scan Numbers: %d" % len(list(set(allScanList))) |
57 print >> sys.stdout,"Number of Scans to remove: %d" % len(list(set(scan2remove))) | |
58 print >> sys.stdout,"Number of Scans retained: %d" % len(scan2retain) | 75 print >> sys.stdout,"Number of Scans retained: %d" % len(scan2retain) |
59 print >> sys.stdout,"Number of Scans removed: %d" % len(scansRemoved) | 76 print >> sys.stdout,"Number of Scans removed: %d" % len(scansRemoved) |
60 | 77 |
61 | 78 |
62 # Identifying groups of continuous numbers in the scan2retain and creating scanString | 79 # Identifying groups of continuous numbers in the scan2retain and creating scanString |