Mercurial > repos > davidvanzessen > argalaxy_tools
comparison experimental_design/experimental_design.py @ 0:afe85eb6572e draft
Uploaded
| author | davidvanzessen |
|---|---|
| date | Mon, 29 Aug 2016 05:41:20 -0400 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| -1:000000000000 | 0:afe85eb6572e |
|---|---|
"""Merge per-sample tab-separated tables into one experimental-design table.

Command line layout (as read from ``sys.argv``):

* ``argv[1]``     -- identifier of the first sample.
* ``argv[2:-2]``  -- a mix of file paths (any argument containing ``/``) and
  sample identifiers (any argument without ``/``).  Each path is attached to
  the most recently seen identifier.
* ``argv[-2]``    -- not read by this script.
* ``argv[-1]``    -- path of the tab-separated output file.
"""
import sys

import pandas as pd

# Columns written when none of the inputs lacks a "Functionality" column.
_FULL_COLUMNS = [
    u'ID', u'VDJ Frame', u'Top V Gene', u'Top D Gene', u'Top J Gene',
    u'CDR1 Seq', u'CDR1 Length', u'CDR2 Seq', u'CDR2 Length',
    u'CDR3 Seq', u'CDR3 Length', u'CDR3 Seq DNA', u'CDR3 Length DNA',
    u'Strand', u'CDR3 Found How', u'Functionality', 'V-REGION identity %',
    'V-REGION identity nt', 'D-REGION reading frame', 'AA JUNCTION',
    'Functionality comment', 'Sequence', 'FR1-IMGT', 'FR2-IMGT',
    'FR3-IMGT', 'CDR3-IMGT', 'JUNCTION', 'J-REGION', 'FR4-IMGT',
    'P3V-nt nb', 'N1-REGION-nt nb', 'P5D-nt nb', 'P3D-nt nb',
    'N2-REGION-nt nb', 'P5J-nt nb', '3V-REGION trimmed-nt nb',
    '5D-REGION trimmed-nt nb', '3D-REGION trimmed-nt nb',
    '5J-REGION trimmed-nt nb', u'Sample', u'Replicate',
]

# Reduced column set used as soon as one input has no "Functionality" column.
_LIMITED_COLUMNS = [
    u'ID', u'VDJ Frame', u'Top V Gene', u'Top D Gene', u'Top J Gene',
    u'CDR1 Seq', u'CDR1 Length', u'CDR2 Seq', u'CDR2 Length',
    u'CDR3 Seq', u'CDR3 Length', u'CDR3 Seq DNA', u'CDR3 Length DNA',
    u'Strand', u'CDR3 Found How', u'Sample', u'Replicate',
]


def _read_table(path):
    """Read a tab-separated file with every cell kept as a string.

    Malformed rows are skipped with a warning instead of aborting the read.
    """
    try:
        return pd.read_csv(path, sep="\t", dtype=object, on_bad_lines="warn")
    except TypeError:
        # Older pandas releases do not know ``on_bad_lines``; they spell the
        # same behaviour ``error_bad_lines=False``.
        return pd.read_csv(path, sep="\t", dtype=object, error_bad_lines=False)


def main():
    """Combine the input tables named on the command line into one file.

    Every input table gets a ``Sample`` column (its sample identifier) and a
    ``Replicate`` column (1-based position of the file within its sample, as
    a string).  The selected columns of all tables are concatenated and
    written, tab-separated and without an index, to ``sys.argv[-1]``.

    Raises:
        KeyError: if an input table lacks one of the selected columns.
    """
    patients = {}
    files = []
    sample_id = sys.argv[1]
    blast_files = 0

    # Organize files: walk the arguments, switching sample whenever an
    # argument is not a path.
    for arg in sys.argv[2:-2]:
        if "/" not in arg:
            patients[sample_id] = files
            files = []
            sample_id = arg
            continue

        df = _read_table(arg)
        if "Functionality" in df.columns:
            # Single .loc assignment so the write lands in ``df`` itself
            # rather than in a temporary produced by chained indexing.
            not_productive = df["Functionality"] != "productive"
            df.loc[not_productive, "VDJ Frame"] = "In-frame with stop codon"
        else:
            blast_files += 1
        files.append(df)
    patients[sample_id] = files

    columns = list(_FULL_COLUMNS)
    # ``files`` holds the tables of the last sample; it may be empty.
    if files and "N-REGION-nt nb" in files[0].columns:
        columns.insert(30, "N-REGION-nt nb")
    if blast_files != 0:
        print("Has a parsed blastn file, using limited columns.")
        columns = list(_LIMITED_COLUMNS)

    frames = []
    for patient_id, samples in patients.items():
        for replicate, sample in enumerate(samples, 1):
            sample['Sample'] = patient_id
            sample['Replicate'] = str(replicate)
            frames.append(sample[columns])

    # Concatenate once instead of growing a frame row-block by row-block;
    # with no inputs at all, still emit a header-only file.
    if frames:
        result = pd.concat(frames)
    else:
        result = pd.DataFrame(columns=columns)
    result.to_csv(sys.argv[-1], sep="\t", index=False)


if __name__ == "__main__":
    main()
