# HG changeset patch # User Jim Johnson # Date 1407960093 18000 # Node ID e83e0ce8fb68ca1da0fc59519f7a5e012fae79af # Parent e7e56b51d156e6e4fba4755c88bc96a32fb545cd Add option to reverse the search, find reference field in input field diff -r e7e56b51d156 -r e83e0ce8fb68 find_in_reference.py --- a/find_in_reference.py Wed Feb 05 08:12:47 2014 -0500 +++ b/find_in_reference.py Wed Aug 13 15:01:33 2014 -0500 @@ -41,6 +41,7 @@ parser.add_option('-c','--input_column', dest='input_column', default=None, help='The column for the value in the input file. (first column = 1, default to last column)') parser.add_option('-C','--reference_column', dest='reference_column', default=None, help='The column for the value in the reference file. (first column = 1, default to last column)') parser.add_option( '-I', '--case_insensitive', dest='ignore_case', action="store_true", default=False, help='case insensitive' ) + parser.add_option( '-R', '--reverse_find', dest='reverse_find', action="store_true", default=False, help='find the reference string in the input string' ) parser.add_option( '-k', '--keep', dest='keep', action="store_true", default=False, help='' ) parser.add_option( '-a', '--annotation_columns', dest='annotation_columns', default=None, help='If string is found, add these columns from reference' ) parser.add_option( '-s', '--annotation_separator', dest='annotation_separator', default=';', help='separator character between annotations from different lines' ) @@ -108,12 +109,14 @@ refFile = open(options.reference,'r') for tn,fline in enumerate(refFile): fields = fline.split('\t') - target_string =fields[refcol] + target_string = fields[refcol].rstrip('\r\n') if options.ignore_case: target_string = target_string.upper() + search = search_string if not options.reverse_find else target_string + target = target_string if not options.reverse_find else search_string if options.debug: - print >> sys.stderr, "in: %s %s %s" % (search_string,search_string in target_string,target_string) - if search_string in target_string: + print >> sys.stderr, "in: %s %s %s" % (search,search in target,target) + if search in target: found = True if annotate: annotation = options.annotation_col_sep.join([fields[i] for i in annotation_columns]) diff -r e7e56b51d156 -r e83e0ce8fb68 find_in_reference.xml --- a/find_in_reference.xml Wed Feb 05 08:12:47 2014 -0500 +++ b/find_in_reference.xml Wed Aug 13 15:01:33 2014 -0500 @@ -1,5 +1,5 @@ - + filter peptides that are present in proteins find_in_reference.py --input "$input" --reference "$reference" @@ -7,7 +7,7 @@ --input_column $column.input_column --reference_column $column.reference_column #end if - $case_insensitive + $case_insensitive $reverse_find #if 'novel' in $outputs.__str__ or not 'found' in $outputs.__str__: --output "$novel" #end if @@ -43,6 +43,7 @@ + @@ -100,6 +101,13 @@ + + + + + + + **Find in Reference** diff -r e7e56b51d156 -r e83e0ce8fb68 test-data/found_proteins.tabular --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/found_proteins.tabular Wed Aug 13 15:01:33 2014 -0500 @@ -0,0 +1,3 @@ +sp|Q9BS26|ERP44_HUMAN Endoplasmic reticulum resident protein 44 OS=Homo sapiens GN=ERP44 PE=1 SV=1 MHPAVFLSLPDLRCSLLLLVTWVFTPVTTEITSLDTENIDEILNNADVALVNFYADWCRFSQMLHPIFEEASDVIKEEFPNENQVVFARVDCDQHSDIAQRYRISKYPTLKLFRNGMMMKREYRGQRSVKALADYIRQQKSDPIQEIRDLAEITTLDRSKRNIIGYFEQKDSDNYRVFERVANILHDDCAFLSAFGDVSKPERYSGDNIIYKPPGHSAPDMVYLGAMTNFDVTYNWIQDKCVPLVREITFENGEELTEEGLPFLILFHMKEDTESLEIFQNEVARQLISEKGTINFLHADCDKFRHPLLHIQKTPADCPVIAIDSFRHMYVFGDFKDVLIPGKLKQFVFDLHSGKLHREFHHGPDPTDTAPGEQAQDVASSPPESSFQKLAPSEYRYTLLRDRDEL +sp|P06213|INSR_HUMAN Insulin receptor OS=Homo sapiens GN=INSR PE=1 SV=4 MATGGRRGAAAAPLLVAVAALLLGAAGHLYPGEVCPGMDIRNNLTRLHELENCSVIEGHLQILLMFKTRPEDFRDLSFPKLIMITDYLLLFRVYGLESLKDLFPNLTVIRGSRLFFNYALVIFEMVHLKELGLYNLMNITRGSVRIEKNNELCYLATIDWSRILDSVEDNYIVLNKDDNEECGDICPGTAKGKTNCPATVINGQFVERCWTHSHCQKVCPTICKSHGCTAEGLCCHSECLGNCSQPDDPTKCVACRNFYLDGRCVETCPPPYYHFQDWRCVNFSFCQDLHHKCKNSRRQGCHQYVIHNNKCIPECPSGYTMNSSNLLCTPCLGPCPKVCHLLEGEKTIDSVTSAQELRGCTVINGSLIINIRGGNNLAAELEANLGLIEEISGYLKIRRSYALVSLSFFRKLRLIRGETLEIGNYSFYALDNQNLRQLWDWSKHNLTITQGKLFFHYNPKLCLSEIHKMEEVSGTKGRQERNDIALKTNGDQASCENELLKFSYIRTSFDKILLRWEPYWPPDFRDLLGFMLFYKEAPYQNVTEFDGQDACGSNSWTVVDIDPPLRSNDPKSQNHPGWLMRGLKPWTQYAIFVKTLVTFSDERRTYGAKSDIIYVQTDATNPSVPLDPISVSNSSSQIILKWKPPSDPNGNITHYLVFWERQAEDSELFELDYCLKGLKLPSRTWSPPFESEDSQKHNQSEYEDSAGECCSCPKTDSQILKELEESSFRKTFEDYLHNVVFVPRKTSSGTGAEDPRPSRKRRSLGDVGNVTVAVPTVAAFPNTSSTSVPTSPEEHRPFEKVVNKESLVISGLRHFTGYRIELQACNQDTPEERCSVAAYVSARTMPEAKADDIVGPVTHEIFENNVVHLMWQEPKEPNGLIVLYEVSYRRYGDEELHLCVSRKHFALERGCRLRGLSPGNYSVRIRATSLAGNGSWTEPTYFYVTDYLDVPSNIAKIIIGPLIFVFLFSVVIGSIYLFLRKRQPDGPLGPLYASSNPEYLSASDVFPCSVYVPDEWEVSREKITLLRELGQGSFGMVYEGNARDIIKGEAETRVAVKTVNESASLRERIEFLNEASVMKGFTCHHVVRLLGVVSKGQPTLVVMELMAHGDLKSYLRSLRPEAENNPGRPPPTLQEMIQMAAEIADGMAYLNAKKFVHRDLAARNCMVAHDFTVKIGDFGMTRDIYETDYYRKGGKGLLPVRWMAPESLKDGVFTTSSDMWSFGVVLWEITSLAEQPYQGLSNEQVLKFVMDGGYLDQPDNCPERVTDLMRMCWQFNPKMRPTFLEIVNLLKDDLHPSFPEVSFFHSEENKAPESEELEMEFEDMENVPLDRSSHCQREEAGGRDGGSSLGFKRSYEEHIPYTHMNGGKKNGRILTLPRSNPS +sp|P08100|OPSD_HUMAN Rhodopsin OS=Homo sapiens GN=RHO PE=1 SV=1 MNGTEGPNFYVPFSNATGVVRSPFEYPQYYLAEPWQFSMLAAYMFLLIVLGFPINFLTLYVTVQHKKLRTPLNYILLNLAVADLFMVLGGFTSTLYTSLHGYFVFGPTGCNLEGFFATLGGEIALWSLVVLAIERYVVVCKPMSNFRFGENHAIMGVAFTWVMALACAAPPLAGWSRYIPEGLQCSCGIDYYTLKPEVNNESFVIYMFVVHFTIPMIIIFFCYGQLVFTVKEAAAQQQESATTQKAEKEVTRMVIIMVIAFLICWVPYASVAFYIFTHQGSNFGPIFMTIPAFFAKSAAIYNPVIYIMMNKQFRNCMLTTICCGKNPLGDDEASATVSKTETSQVAPA