Mercurial > repos > pravs > msms_extractor
changeset 2:aa944e3a353c draft
planemo upload
author | pravs |
---|---|
date | Thu, 03 Aug 2017 13:53:09 -0400 |
parents | f444e529363d |
children | e7c63cfef363 |
files | MSMS_Extractor.py MSMS_Extractor.xml |
diffstat | 2 files changed, 51 insertions(+), 13 deletions(-) [+] |
line wrap: on
line diff
--- a/MSMS_Extractor.py Thu Feb 16 11:56:45 2017 -0500 +++ b/MSMS_Extractor.py Thu Aug 03 13:53:09 2017 -0400 @@ -4,6 +4,7 @@ # University of Minnesota # # +# def main(): from pyteomics import mzml @@ -15,9 +16,15 @@ import pandas as pd from operator import itemgetter from itertools import groupby - if len(sys.argv) >= 5: + import random + + if len(sys.argv) >= 7: # Start of Reading Scans from PSM file # Creating dictionary of PSM file: key = filename key = list of scan numbers + + removeORretain = sys.argv[5].strip() + randomScans = int(sys.argv[6].strip()) + ScanFile = sys.argv[2] spectrumTitleList = list(pd.read_csv(ScanFile, "\t")['Spectrum Title']) scanFileNumber = [[".".join(each.split(".")[:-3]), int(each.split(".")[-2:-1][0])] for each in spectrumTitleList] @@ -34,7 +41,6 @@ outPath = sys.argv[3] ##outFile = sys.argv[3].split("/")[-1] allScanList = [] - # Read all scan numbers using indexedmzML/indexList/index/offset tags for k in mzml.read(inputPath).iterfind('indexedmzML/indexList/index/offset'): if re.search("scan=(\d+)", k['idRef']): @@ -44,17 +50,28 @@ fraction_name = sys.argv[4] if scanDict.has_key(fraction_name): - scan2remove = scanDict[fraction_name] + scansInList = scanDict[fraction_name] else: - scan2remove = [] - scan2retain = list(set(allScanList) - set(scan2remove)) - scan2retain.sort() - scansRemoved = list(set(allScanList) - set(scan2retain)) - # scan2retain contains scans that is to be retained + scansInList = [] + scansNotInList = list(set(allScanList) - set(scansInList)) + if removeORretain == "remove": + scan2retain = scansNotInList + scan2retain.sort() + scansRemoved = scansInList + # scan2retain contains scans that is to be retained + + elif removeORretain == "retain": + # Randomly select spectra + random_scans = list(map(lambda _: random.choice(scansNotInList), range(randomScans))) + + scan2retain = random_scans + scansInList + scan2retain.sort() + scansRemoved = list(set(allScanList) - set(scan2retain)) + # scan2retain contains scans 
that is to be retained + # Print Stats print >> sys.stdout,"Total number of Scan Numbers: %d" % len(list(set(allScanList))) - print >> sys.stdout,"Number of Scans to remove: %d" % len(list(set(scan2remove))) print >> sys.stdout,"Number of Scans retained: %d" % len(scan2retain) print >> sys.stdout,"Number of Scans removed: %d" % len(scansRemoved)
--- a/MSMS_Extractor.xml Thu Feb 16 11:56:45 2017 -0500 +++ b/MSMS_Extractor.xml Thu Aug 03 13:53:09 2017 -0400 @@ -1,11 +1,11 @@ -<tool id="MSMS_Extractor" name="MSMS_Extractor" version="1.0.0"> - <description>Removes scans with identified PSMs from the mzML file(s).</description> +<tool id="MSMS_Extractor" name="MSMS_Extractor" version="1.1.0"> + <description>Extract scans based on PSM report from the mzML file(s).</description> <requirements> <requirement type="package" version="3.0.9016">proteowizard</requirement> <requirement type="package" version="3.4">pyteomics</requirement> </requirements> - <command interpreter="python"><![CDATA[MSMS_Extractor.py $spectrumfile $psmreportfile $output ${spectrumfile.name.rsplit('.',1)[0]}]]></command> + <command interpreter="python"><![CDATA[MSMS_Extractor.py $spectrumfile $psmreportfile $output ${spectrumfile.name.rsplit('.',1)[0]} $removeretain.doremoveretain $removeretain.num_random_scans]]></command> <inputs> <param name="spectrumfile" type="data" format="mzml"> <label>Input mzML File</label> @@ -13,6 +13,17 @@ <param name="psmreportfile" type="data" format="tabular"> <label>Input PSM Report File</label> </param> + + <conditional name="removeretain"> + <param name="doremoveretain" type="boolean" truevalue="retain" falsevalue="remove" label="Remove or Retain the given Scans" help="Retain=Yes; Remove=No (default)" /> + <when value="remove"> + <param name="num_random_scans" type="hidden" value="0" /> + </when> + <when value="retain"> + <param name="num_random_scans" type="integer" label="Add N random scans in addition to those in the list" value="0" optional="false" /> + </when> + </conditional> + </inputs> <outputs> @@ -21,6 +32,16 @@ <help> -MSMS_Extractor reads scan numbers from the PSM report (scan numbers with identified PSM) and removes it from the mzML file. 
+MSMS_Extractor reads scan numbers from the PSM report (scan numbers with identified PSMs) and gives the option to create a new mzML file, either with those scans or without those scans. + +Remove option: +Creates a new mzML file with all the unidentified scans (removes those that are in the PSM report). + +Retain option: +Creates a new mzML file with only those scans that are present in the PSM report. In addition, it has an option to add N randomly selected scans to the output mzML file. + + +Please Note: This tool currently works only with PeptideShaker-generated PSM report files. + </help> </tool>