# HG changeset patch
# User jay
# Date 1610476809 0
# Node ID 391e7e836fe92dcc3ec1e8fb863a692ab2d4ec61
# Parent f93187136dfbf7407385aeb1d662f534937aa96d
"planemo upload for repository https://github.com/jaidevjoshi83/pdaug commit 45ebf32dcaa1eed91670d3a2491f9cf3dfb535ef"
diff -r f93187136dfb -r 391e7e836fe9 PDAUG_Merge_Dataframes/PDAUG_Merge_Dataframes.py
--- a/PDAUG_Merge_Dataframes/PDAUG_Merge_Dataframes.py Wed Dec 30 02:42:16 2020 +0000
+++ b/PDAUG_Merge_Dataframes/PDAUG_Merge_Dataframes.py Tue Jan 12 18:40:09 2021 +0000
@@ -2,27 +2,38 @@
import pandas as pd
import sys
-files = sys.argv[1]
-out_file = sys.argv[2]
+
+def MergeData(infiles, add_class_label, class_label, OutPut):
+
+ data_frame = pd.DataFrame()
+ if add_class_label == 'True' or add_class_label == 'true':
+ for i, file in enumerate(infiles.split(',')):
+ df1 = pd.read_csv(file,sep='\t')
+ df2 = pd.DataFrame(df1.shape[0]*[i], columns=[class_label])
+ df3 = pd.concat([df1,df2], axis=1)
+ data_frame = pd.concat([data_frame,df3])
+ final_DF = data_frame.fillna(0)
-data_frame = pd.read_csv(files.split(',')[0],sep='\t')
+ else:
+
+ for file in infiles.split(','):
+ df1 = pd.read_csv(file,sep='\t')
+ data_frame = pd.concat([data_frame,df1])
+ final_DF = data_frame.fillna(0)
+
+ final_DF.to_csv(OutPut, sep="\t", index=False)
-for file in files.split(',')[1:]:
-
- df1 = pd.read_csv(file,sep='\t')
- data_frame = pd.concat([data_frame,df1])
-
-final_DF = data_frame.fillna(0)
-
-final_DF.to_csv(out_file,sep="\t", index=False)
+if __name__=="__main__":
-
-
-
+ import argparse
+ parser = argparse.ArgumentParser()
+ parser.add_argument("-I", "--infiles", required=True, default=None, help=".tsv")
+ parser.add_argument("-L", "--add_class_label", required=False, default=False, help="Path to target tsv file")
+ parser.add_argument("-C", "--class_label", required=False, default='class_label', help="Path to target tsv file")
+ parser.add_argument("-O", "--OutPut", required=False, default='Out.tsv', help="Path to target tsv file")
-
+ args = parser.parse_args()
-
-
+ MergeData(args.infiles, args.add_class_label, args.class_label, args.OutPut)
diff -r f93187136dfb -r 391e7e836fe9 PDAUG_Merge_Dataframes/test-data/1.tsv
--- a/PDAUG_Merge_Dataframes/test-data/1.tsv Wed Dec 30 02:42:16 2020 +0000
+++ b/PDAUG_Merge_Dataframes/test-data/1.tsv Tue Jan 12 18:40:09 2021 +0000
@@ -1,2 +1,2 @@
-Algo accuracy presision recall f1 mean_auc
-SVMC 0.608 0.781 0.608 0.537 0.608
+NAME COL1 COL2 COL3 COL4 COL5
+1 15 10 12 5 3
\ No newline at end of file
diff -r f93187136dfb -r 391e7e836fe9 PDAUG_Merge_Dataframes/test-data/2.tsv
--- a/PDAUG_Merge_Dataframes/test-data/2.tsv Wed Dec 30 02:42:16 2020 +0000
+++ b/PDAUG_Merge_Dataframes/test-data/2.tsv Tue Jan 12 18:40:09 2021 +0000
@@ -1,2 +1,2 @@
-Algo accuracy presision recall f1 mean_auc
-SVMC 0.608 0.781 0.608 0.537 0.608
+NAME COL1 COL2 COL3 COL4 COL5
+2 15 10 12 5 3
\ No newline at end of file
diff -r f93187136dfb -r 391e7e836fe9 PDAUG_Merge_Dataframes/test-data/3.tsv
--- a/PDAUG_Merge_Dataframes/test-data/3.tsv Wed Dec 30 02:42:16 2020 +0000
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,2 +0,0 @@
-Algo accuracy presision recall f1 mean_auc
-SVMC 0.608 0.781 0.608 0.537 0.608
diff -r f93187136dfb -r 391e7e836fe9 PDAUG_Merge_Dataframes/test-data/4.tsv
--- a/PDAUG_Merge_Dataframes/test-data/4.tsv Wed Dec 30 02:42:16 2020 +0000
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,2 +0,0 @@
-Algo accuracy presision recall f1 mean_auc
-SVMC 0.608 0.781 0.608 0.537 0.608
diff -r f93187136dfb -r 391e7e836fe9 PDAUG_Merge_Dataframes/test-data/5.tsv
--- a/PDAUG_Merge_Dataframes/test-data/5.tsv Wed Dec 30 02:42:16 2020 +0000
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,2 +0,0 @@
-Algo accuracy presision recall f1 mean_auc
-SVMC 0.608 0.781 0.608 0.537 0.608
diff -r f93187136dfb -r 391e7e836fe9 PDAUG_Merge_Dataframes/test-data/6.tsv
--- a/PDAUG_Merge_Dataframes/test-data/6.tsv Wed Dec 30 02:42:16 2020 +0000
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,2 +0,0 @@
-Algo accuracy presision recall f1 mean_auc
-SVMC 0.608 0.781 0.608 0.537 0.608
diff -r f93187136dfb -r 391e7e836fe9 PDAUG_Merge_Dataframes/test-data/out.tsv
--- a/PDAUG_Merge_Dataframes/test-data/out.tsv Wed Dec 30 02:42:16 2020 +0000
+++ b/PDAUG_Merge_Dataframes/test-data/out.tsv Tue Jan 12 18:40:09 2021 +0000
@@ -1,7 +1,3 @@
-Algo accuracy presision recall f1 mean_auc
-SVMC 0.608 0.7809999999999999 0.608 0.537 0.608
-SVMC 0.608 0.7809999999999999 0.608 0.537 0.608
-SVMC 0.608 0.7809999999999999 0.608 0.537 0.608
-SVMC 0.608 0.7809999999999999 0.608 0.537 0.608
-SVMC 0.608 0.7809999999999999 0.608 0.537 0.608
-SVMC 0.608 0.7809999999999999 0.608 0.537 0.608
+NAME COL1 COL2 COL3 COL4 COL5
+1 15 10 12 5 3
+2 15 10 12 5 3
diff -r f93187136dfb -r 391e7e836fe9 PDAUG_Merge_Dataframes/test-data/out1.tsv
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/PDAUG_Merge_Dataframes/test-data/out1.tsv Tue Jan 12 18:40:09 2021 +0000
@@ -0,0 +1,3 @@
+NAME COL1 COL2 COL3 COL4 COL5 class_label
+1 15 10 12 5 3 0
+2 15 10 12 5 3 1
diff -r f93187136dfb -r 391e7e836fe9 PDAUG_TSVtoFASTA/PDAUG_TSVtoFASTA.py
--- a/PDAUG_TSVtoFASTA/PDAUG_TSVtoFASTA.py Wed Dec 30 02:42:16 2020 +0000
+++ b/PDAUG_TSVtoFASTA/PDAUG_TSVtoFASTA.py Tue Jan 12 18:40:09 2021 +0000
@@ -1,64 +1,72 @@
-import os
-import argparse
+
+import pandas as pd
+
+
+def TSVtoFASTA(infile, method, firstdatafile, seconddatafile, outfile, clmpepid, slcclasslabel, peps):
+
+
+ fn = [firstdatafile, seconddatafile]
-def TSVtoFASTA(InFile, Method, Positive, Negative, OutFile):
-
- if Method == 'WithClassLabel':
-
- f = open(InFile)
- lines = f.readlines()
-
- of1 = open(Positive,'w')
- of2 = open(Negative,'w')
-
- n = 0
- m = 0
-
- l = []
-
- for line in lines[1:]:
- l.append(line.split('\t')[1].strip('\n').strip('\r'))
- l = list(set(l))
-
- print(l)
-
- for line in lines:
+ df = pd.read_csv(infile, sep="\t")
+ if clmpepid == None:
+ pass
+ else:
+ names = df[clmpepid].tolist()
- if l[1] in line.split('\t')[1].strip('\n').strip('\r'):
- n= n+1
- of1.write('>peptide_'+str(n)+'_'+str(l[1])+'\n')
- of1.write(line.split('\t')[0]+'\n')
-
- if l[0] in line.split('\t')[1].strip('\n').strip('\r'):
- m= m+1
- of2.write('>peptide_'+str(m)+'_'+str(l[0])+'\n')
- of2.write(line.split('\t')[0]+'\n')
+ peps = df[peps].tolist()
+
+ if method == "withoutlabel":
+ f = open(outfile,'w')
+ if clmpepid is not None:
+ for i,n in enumerate(peps):
+ f.write(">"+names[i]+'\n')
+ f.write(n+'\n')
+ f.close()
+ else:
+ for i,n in enumerate(peps):
+ f.write(">"+str(i)+'\n')
+ f.write(n+'\n')
+ f.close()
+
+ elif method == "withlabel":
+ labels = df[slcclasslabel].tolist()
- elif Method == 'NoClassLabel':
-
- f = open(InFile)
- lines = f.readlines()
- of1 = open(OutFile,'w')
-
- for i, line in enumerate(lines[1:]):
- of1.write('>peptide_'+str(i)+'\n')
- of1.write(line.split('\t')[0]+'\n')
-
- else:
- pass
+ label = list(set(labels))
+
+ if clmpepid is None:
+ for i, l in enumerate(label):
+ f = open(fn[i],'w')
+ print('ok1')
+ for i, L in enumerate(labels):
+ if l == L:
+ f.write(">"+str(i)+"_"+str(l)+'\n')
+ f.write(peps[i]+'\n')
+ f.close()
+ else:
+ for i, l in enumerate(label):
+ f = open(fn[i],'w')
+ for i, L in enumerate(labels):
+ if l == L:
+ f.write(">"+names[i]+"_"+l+'\n')
+ f.write(peps[i]+'\n')
+ f.close()
if __name__=="__main__":
import argparse
-
parser = argparse.ArgumentParser()
-
parser.add_argument("-I", "--InFile", required=True, default=None, help=".fasta or .tsv")
- parser.add_argument("-P", "--Postvs", required=False, default='FirstDataFile.fasta', help="Path to target tsv file")
- parser.add_argument("-N", "--Negtvs", required=False, default='SecondDataFile.fasta', help="Path to target tsv file")
+ parser.add_argument("-F", "--FirstDataFile", required=False, default='FirstDataFile.fasta', help="Path to target tsv file")
+ parser.add_argument("-S", "--SecondDataFile", required=False, default='SecondDataFile.fasta', help="Path to target tsv file")
parser.add_argument("-O", "--OutFile", required=False, default='OutFile.fasta', help="Path to target tsv file")
parser.add_argument("-M", "--Method", required=True, default=None, help="Path to target tsv file")
+ parser.add_argument("-C", "--ClmPepID", required=False, default=None, help="Peptide Column Name")
+ parser.add_argument("-L", "--SlcClassLabel", required=False, default="Class_label", help="Class Label Column Name")
+ parser.add_argument("-P", "--PeptideColumn", required=True, default=None, help="Class Label Column Name")
args = parser.parse_args()
- TSVtoFASTA(args.InFile, args.Method, args.Postvs, args.Negtvs, args.OutFile)
\ No newline at end of file
+ TSVtoFASTA(args.InFile, args.Method, args.FirstDataFile, args.SecondDataFile, args.OutFile, args.ClmPepID, args.SlcClassLabel, args.PeptideColumn)
+
+
+
diff -r f93187136dfb -r 391e7e836fe9 PDAUG_TSVtoFASTA/PDAUG_TSVtoFASTA.xml
--- a/PDAUG_TSVtoFASTA/PDAUG_TSVtoFASTA.xml Wed Dec 30 02:42:16 2020 +0000
+++ b/PDAUG_TSVtoFASTA/PDAUG_TSVtoFASTA.xml Tue Jan 12 18:40:09 2021 +0000
@@ -3,60 +3,110 @@
Converts tabular peptide sequence data into fasta format
- pandas
- modlamp
+ pandas
-
-
-
-
-
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
-
- Method == "WithClassLabel"
+
+ selmethod['method'] == "withoutlabel"
-
- Method == "WithClassLabel"
+
+ selmethod['method'] == "withlabel"
-
- Method == "NoClassLabel"
+
+ selmethod['method'] == "withlabel"
+
+
-
-
-
-
+
+
+
+
+
-
-
-
+
+
+
+
+
12_AMP
+GLFDIVKKVVGALG
+>13_AMP
+KLLKLLKKKLLK
+>14_AMP
+KLLLLKLLK
+>15_AMP
+GLFDIVKKVVGALG
+>16_AMP
+GLFDIVKKVVGALG
+>17_AMP
+KLLKLLKKKLLK
+>18_AMP
+KLLLLKLLK
+>19_AMP
+GLFDIVKKVVGALG
+>20_AMP
+KLLKLLKKKLLK
+>21_AMP
+KLLLLKLLK
+>22_AMP
+GLFDIVKKVVGALG
\ No newline at end of file
diff -r f93187136dfb -r 391e7e836fe9 PDAUG_TSVtoFASTA/test-data/2.fasta
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/PDAUG_TSVtoFASTA/test-data/2.fasta Tue Jan 12 18:40:09 2021 +0000
@@ -0,0 +1,24 @@
+>0_TM
+GLFDIVKKVVGALG
+>1_TM
+KLLKLLKKKLLK
+>2_TM
+KLLLLKLLK
+>3_TM
+GLFDIVKKVVGALG
+>4_TM
+GLFDIVKKVVGALG
+>5_TM
+KLLKLLKKKLLK
+>6_TM
+KLLLLKLLK
+>7_TM
+GLFDIVKKVVGALG
+>8_TM
+GLFDIVKKVVGALG
+>9_TM
+KLLKLLKKKLLK
+>10_TM
+KLLLLKLLK
+>11_TM
+GLFDIVKKVVGALG
\ No newline at end of file
diff -r f93187136dfb -r 391e7e836fe9 PDAUG_TSVtoFASTA/test-data/out.fasta
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/PDAUG_TSVtoFASTA/test-data/out.fasta Tue Jan 12 18:40:09 2021 +0000
@@ -0,0 +1,46 @@
+>0
+GLFDIVKKVVGALG
+>1
+KLLKLLKKKLLK
+>2
+KLLLLKLLK
+>3
+GLFDIVKKVVGALG
+>4
+GLFDIVKKVVGALG
+>5
+KLLKLLKKKLLK
+>6
+KLLLLKLLK
+>7
+GLFDIVKKVVGALG
+>8
+GLFDIVKKVVGALG
+>9
+KLLKLLKKKLLK
+>10
+KLLLLKLLK
+>11
+GLFDIVKKVVGALG
+>12
+GLFDIVKKVVGALG
+>13
+KLLKLLKKKLLK
+>14
+KLLLLKLLK
+>15
+GLFDIVKKVVGALG
+>16
+GLFDIVKKVVGALG
+>17
+KLLKLLKKKLLK
+>18
+KLLLLKLLK
+>19
+GLFDIVKKVVGALG
+>20
+KLLKLLKKKLLK
+>21
+KLLLLKLLK
+>22
+GLFDIVKKVVGALG
\ No newline at end of file
diff -r f93187136dfb -r 391e7e836fe9 PDAUG_TSVtoFASTA/test-data/test.tsv
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/PDAUG_TSVtoFASTA/test-data/test.tsv Tue Jan 12 18:40:09 2021 +0000
@@ -0,0 +1,24 @@
+Name Peptides Class_label
+Pep1 GLFDIVKKVVGALG TM
+Pep2 KLLKLLKKKLLK TM
+Pep3 KLLLLKLLK TM
+Pep4 GLFDIVKKVVGALG TM
+Pep5 GLFDIVKKVVGALG TM
+Pep6 KLLKLLKKKLLK TM
+Pep7 KLLLLKLLK TM
+Pep8 GLFDIVKKVVGALG TM
+Pep9 GLFDIVKKVVGALG TM
+Pep10 KLLKLLKKKLLK TM
+Pep11 KLLLLKLLK TM
+Pep12 GLFDIVKKVVGALG TM
+Pep13 GLFDIVKKVVGALG AMP
+Pep14 KLLKLLKKKLLK AMP
+Pep15 KLLLLKLLK AMP
+Pep16 GLFDIVKKVVGALG AMP
+Pep17 GLFDIVKKVVGALG AMP
+Pep18 KLLKLLKKKLLK AMP
+Pep19 KLLLLKLLK AMP
+Pep20 GLFDIVKKVVGALG AMP
+Pep21 KLLKLLKKKLLK AMP
+Pep22 KLLLLKLLK AMP
+Pep23 GLFDIVKKVVGALG AMP
\ No newline at end of file
diff -r f93187136dfb -r 391e7e836fe9 PDAUG_TSVtoFASTA/test-data/test1.tsv
--- a/PDAUG_TSVtoFASTA/test-data/test1.tsv Wed Dec 30 02:42:16 2020 +0000
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,24 +0,0 @@
-Peptides Class_label
-GLFDIVKKVVGALG 0
-KLLKLLKKKLLK 0
-KLLLLKLLK 0
-GLFDIVKKVVGALG 0
-GLFDIVKKVVGALG 0
-KLLKLLKKKLLK 0
-KLLLLKLLK 0
-GLFDIVKKVVGALG 0
-GLFDIVKKVVGALG 0
-KLLKLLKKKLLK 0
-KLLLLKLLK 0
-GLFDIVKKVVGALG 0
-GLFDIVKKVVGALG 1
-KLLKLLKKKLLK 1
-KLLLLKLLK 1
-GLFDIVKKVVGALG 1
-GLFDIVKKVVGALG 1
-KLLKLLKKKLLK 1
-KLLLLKLLK 1
-GLFDIVKKVVGALG 1
-KLLKLLKKKLLK 1
-KLLLLKLLK 1
-GLFDIVKKVVGALG 1
\ No newline at end of file
diff -r f93187136dfb -r 391e7e836fe9 PDAUG_TSVtoFASTA/test-data/test2.tsv
--- a/PDAUG_TSVtoFASTA/test-data/test2.tsv Wed Dec 30 02:42:16 2020 +0000
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,24 +0,0 @@
-Peptides
-GLFDIVKKVVGALG
-KLLKLLKKKLLK
-KLLLLKLLK
-GLFDIVKKVVGALG
-GLFDIVKKVVGALG
-KLLKLLKKKLLK
-KLLLLKLLK
-GLFDIVKKVVGALG
-GLFDIVKKVVGALG
-KLLKLLKKKLLK
-KLLLLKLLK
-GLFDIVKKVVGALG
-GLFDIVKKVVGALG
-KLLKLLKKKLLK
-KLLLLKLLK
-GLFDIVKKVVGALG
-GLFDIVKKVVGALG
-KLLKLLKKKLLK
-KLLLLKLLK
-GLFDIVKKVVGALG
-KLLKLLKKKLLK
-KLLLLKLLK
-GLFDIVKKVVGALG
\ No newline at end of file
diff -r f93187136dfb -r 391e7e836fe9 PDAUG_TSVtoFASTA/test-data/test2/Out.fasta
--- a/PDAUG_TSVtoFASTA/test-data/test2/Out.fasta Wed Dec 30 02:42:16 2020 +0000
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,69 +0,0 @@
->peptide_0
-GLFDIVKKVVGALG
-
->peptide_1
-KLLKLLKKKLLK
-
->peptide_2
-KLLLLKLLK
-
->peptide_3
-GLFDIVKKVVGALG
-
->peptide_4
-GLFDIVKKVVGALG
-
->peptide_5
-KLLKLLKKKLLK
-
->peptide_6
-KLLLLKLLK
-
->peptide_7
-GLFDIVKKVVGALG
-
->peptide_8
-GLFDIVKKVVGALG
-
->peptide_9
-KLLKLLKKKLLK
-
->peptide_10
-KLLLLKLLK
-
->peptide_11
-GLFDIVKKVVGALG
-
->peptide_12
-GLFDIVKKVVGALG
-
->peptide_13
-KLLKLLKKKLLK
-
->peptide_14
-KLLLLKLLK
-
->peptide_15
-GLFDIVKKVVGALG
-
->peptide_16
-GLFDIVKKVVGALG
-
->peptide_17
-KLLKLLKKKLLK
-
->peptide_18
-KLLLLKLLK
-
->peptide_19
-GLFDIVKKVVGALG
-
->peptide_20
-KLLKLLKKKLLK
-
->peptide_21
-KLLLLKLLK
-
->peptide_22
-GLFDIVKKVVGALG
-