# HG changeset patch # User galaxyp # Date 1504250094 14400 # Node ID 77ddaee887a85f99ba72bc629f6656a5472a846c # Parent 8a30d6e5b97ddcf2a9c6ed02f787b233c921f954 planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tools/pi_db_tools commit 71a4265d11aef48342142b8cf2caa86f79f9a554 diff -r 8a30d6e5b97d -r 77ddaee887a8 __pycache__/peptide_pi_annotator.cpython-36.pyc Binary file __pycache__/peptide_pi_annotator.cpython-36.pyc has changed diff -r 8a30d6e5b97d -r 77ddaee887a8 align_dbspec.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/align_dbspec.py Fri Sep 01 03:14:54 2017 -0400 @@ -0,0 +1,127 @@ +#!/usr/bin/env python +import sys +import os +import argparse +import re +from Bio import SeqIO + + +def create_spectra_maps(specfiles, dbfiles, frregex, firstfr): + """Output something like + {'fr01', 'fr04'} # Normal filename set + and + {'fr03': ['fr02', 'fr03']} # pool definition + and + {'fr04': 'fr04', 'fr04b': 'fr04'} # rerun fraction, rerun may also be pool + """ + specrange = get_fn_fractionmap(specfiles, frregex) + to_pool = [] + poolmap, rerun_map, normal_fns = {}, [], set() + for i in range(0, len(dbfiles)): + num = i + firstfr + if num not in specrange: + to_pool.append(i) + elif to_pool and num in specrange: + to_pool.append(i) + poolmap[specrange[num][0]] = to_pool + to_pool = [] + if not to_pool and specrange[num][0] in poolmap: + if poolmap[specrange[num][0]][-1] != i: + normal_fns.add((dbfiles[num - 1], + specfiles[specrange[num][0]])) + elif not to_pool: + normal_fns.add((dbfiles[num - 1], specfiles[specrange[num][0]])) + for num in sorted(specrange.keys()): + if len(specrange[num]) > 1: + rerun_map.append(specrange[num]) + return normal_fns, rerun_map, poolmap + + +def get_fn_fractionmap(files, frregex): + fnfrmap = {} + for f_ix, fn in enumerate(files): + fnum = int(re.sub(frregex, '\\1', fn)) + try: + fnfrmap[fnum].append(f_ix) + except KeyError: + fnfrmap[fnum] = [f_ix] + return fnfrmap + + +def pool_fasta_files(poolfiles): + acc_seq = {} + for fr in poolfiles: + for seq in SeqIO.parse(fr, 'fasta'): + sequence = str(seq.seq.upper()) + try: + if sequence in acc_seq[seq.id]: + continue + except KeyError: + acc_seq[seq.id] = {sequence: 1} + yield seq + else: + acc_seq[seq.id][sequence] = 1 + yield seq + + +def write_pooled_fasta(poolmap, specnames, dbfiles): + """Runs through poolmap and pooles output files, filtering out + duplicates""" + for outfr, infrs in poolmap.items(): + outfn = os.path.join('aligned_out', os.path.basename(specnames[outfr])) + print('Pooling FASTA files {} - {} into: {}'.format( + dbfiles[infrs[0]], dbfiles[infrs[-1]], outfn)) + with open(outfn, 'w') as fp: + SeqIO.write(pool_fasta_files([dbfiles[x] for x in infrs]), fp, + 'fasta') + + +def write_nonpooled_fasta(fractions): + """Symlinks nonpooled db files""" + print('Symlinking non-pooled non-rerun files', + [(fr[0], os.path.join('aligned_out', os.path.basename(fr[1]))) + for fr in fractions]) + [os.symlink(fr[0], os.path.join('aligned_out', os.path.basename(fr[1]))) + for fr in fractions] + + +def copy_rerun_fasta(rerun_map, specnames): + for dst_indices in rerun_map: + src = os.path.join(specnames[dst_indices[0]]) + for outfn in [specnames[x] for x in dst_indices[1:]]: + print('Symlinking {} to {}'.format(src, outfn)) + os.symlink(src, os.path.join('aligned_out', outfn)) + + +def main(): + args = parse_commandline() + with open(args.spectranames) as fp: + spectranames = [x.strip() for x in fp.read().strip().split('\n')] + vanilla_fr, rerun_map, poolmap = create_spectra_maps(spectranames, + args.dbfiles, + args.frspecregex, + args.firstfr) + write_pooled_fasta(poolmap, spectranames, args.dbfiles) + write_nonpooled_fasta(vanilla_fr) + copy_rerun_fasta(rerun_map, spectranames) + + +def parse_commandline(): + parser = argparse.ArgumentParser( + formatter_class=argparse.RawTextHelpFormatter) + parser.add_argument('--specnames', dest='spectranames', help='File ' + 'containing spectra filenames with fractions. ' + 'Test data example illustrates reruns (fr03b, 09b) and' + ' pooled samples (fr05-09 are inside fr09 and fr09b).', + required=True) + parser.add_argument('--dbfiles', dest='dbfiles', help='FASTA db files', + nargs='+', required=True) + parser.add_argument('--frspec', dest='frspecregex', help='Fraction regex ' + 'to detect spectra fraction numbers', required=True) + parser.add_argument('--firstfr', dest='firstfr', help='First fraction nr', + type=int, required=True) + return parser.parse_args(sys.argv[1:]) + + +if __name__ == '__main__': + main() diff -r 8a30d6e5b97d -r 77ddaee887a8 delta_pi_calc.xml --- a/delta_pi_calc.xml Mon Jul 24 05:25:22 2017 -0400 +++ b/delta_pi_calc.xml Fri Sep 01 03:14:54 2017 -0400 @@ -1,9 +1,9 @@ - + + to peptide table python - to peptide table - + python '$__tool_directory__/peptide_pi_annotator.py' -i '$trainingpi' -p '$peptable' --out '$output' #if $stripcol --stripcol $stripcol diff -r 8a30d6e5b97d -r 77ddaee887a8 pi_db_split.xml --- a/pi_db_split.xml Mon Jul 24 05:25:22 2017 -0400 +++ b/pi_db_split.xml Fri Sep 01 03:14:54 2017 -0400 @@ -1,10 +1,10 @@ - + into pI separated fractions numpy python - + + to resemble spectra fraction scheme + + python + biopython + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + Filters, pools and doubles fractionated databases with a set of identically fractionated spectra files which have been + subjected to pooling and contain reruns. + You may have fraction 1-10 in databases but spectra file fractions 4-7 have been pooled before loading to the MS, + and spectra fraction 2 and 8 have been reran creating fractions 2 and 2a, and 8, 8a and 8b. + This tool pools FASTA databases and duplicates them where needed to line up the databases to your spectra collections. + + + diff -r 8a30d6e5b97d -r 77ddaee887a8 test-data/merged_twice_decoy_fr1-3.fasta --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/merged_twice_decoy_fr1-3.fasta Fri Sep 01 03:14:54 2017 -0400 @@ -0,0 +1,10 @@ +>decoy_protein1 +TFSLFGCSIPNTNVEFSIKLFDVCLLLCNCLFSLIIMIYVII +>decoy_protein2 +TFSLFGCSIPNTNVEFSI +>decoy_protein1 +LNLSKPILSEST +>decoy_protein3 +LFDVCLLLCNCLFSLIIMIYVIIK +>decoy_protein2 +LFDVCLLLCNCLFSLIIMIYVIIKLWLFK diff -r 8a30d6e5b97d -r 77ddaee887a8 test-data/specnames.txt --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/specnames.txt Fri Sep 01 03:14:54 2017 -0400 @@ -0,0 +1,6 @@ +spec_f01.mzML +spec_f02.mzML +spec_f03.mzML +spec_f03b.mzML +spec_f09.mzML +spec_f09b.mzML