view experimental_design/experimental_design.py @ 7:54f6756bacb1 draft

Uploaded
author davidvanzessen
date Fri, 16 Dec 2016 09:01:40 -0500
parents afe85eb6572e
children
line wrap: on
line source

import sys
import pandas as pd

# Columns selected from every input table, whichever kind it is.
_COMMON_COLUMNS = [
	'ID', 'VDJ Frame', 'Top V Gene', 'Top D Gene', 'Top J Gene',
	'CDR1 Seq', 'CDR1 Length', 'CDR2 Seq', 'CDR2 Length',
	'CDR3 Seq', 'CDR3 Length', 'CDR3 Seq DNA', 'CDR3 Length DNA',
	'Strand', 'CDR3 Found How',
]

# Extra columns selected only when every input table has a "Functionality" column.
_EXTENDED_COLUMNS = [
	'Functionality', 'V-REGION identity %', 'V-REGION identity nt',
	'D-REGION reading frame', 'AA JUNCTION', 'Functionality comment',
	'Sequence', 'FR1-IMGT', 'FR2-IMGT', 'FR3-IMGT', 'CDR3-IMGT', 'JUNCTION',
	'J-REGION', 'FR4-IMGT', 'P3V-nt nb', 'N1-REGION-nt nb', 'P5D-nt nb',
	'P3D-nt nb', 'N2-REGION-nt nb', 'P5J-nt nb', '3V-REGION trimmed-nt nb',
	'5D-REGION trimmed-nt nb', '3D-REGION trimmed-nt nb',
	'5J-REGION trimmed-nt nb',
]

# Columns added by this script to identify where each row came from.
_SAMPLE_COLUMNS = ['Sample', 'Replicate']


def _read_table(path):
	"""Read a tab-separated file as strings, warning about and skipping malformed lines."""
	try:
		# pandas >= 1.3 spelling; "warn" matches the old
		# error_bad_lines=False behaviour with its default warn_bad_lines=True.
		return pd.read_csv(path, sep="\t", dtype=object, on_bad_lines="warn")
	except TypeError:
		# Older pandas does not know on_bad_lines; use the original keyword.
		return pd.read_csv(path, sep="\t", dtype=object, error_bad_lines=False)


def main():
	"""Merge the input tables into one file, tagging rows with Sample and Replicate.

	Arguments are read from ``sys.argv``:

	* ``argv[1]``    -- id of the first sample.
	* ``argv[2:-2]`` -- a mix of file paths (any argument containing "/") and
	  sample ids (any other argument). Each file belongs to the most recently
	  seen sample id; if an id occurs twice, the later group replaces the earlier.
	* ``argv[-1]``   -- path of the tab-separated output file.

	NOTE(review): ``argv[-2]`` is never read here; confirm against the calling
	wrapper that skipping it is intended.

	Tables with a "Functionality" column get their "VDJ Frame" set to
	"In-frame with stop codon" on every row whose Functionality is not
	"productive". If any table lacks that column, only the common columns are
	written. Exits with an error message when no input file was given.
	"""
	patients = {}
	files = []
	sample_id = sys.argv[1]
	has_blast = False

	# organize files
	for arg in sys.argv[2:-2]:
		if "/" not in arg:
			# A new sample id: store the files collected so far, start a new group.
			patients[sample_id] = files
			files = []
			sample_id = arg
			continue
		df = _read_table(arg)
		if "Functionality" in df.columns:
			# Plain column assignment instead of chained indexing, which may
			# write to a temporary copy and leave the frame unchanged.
			df["VDJ Frame"] = df["VDJ Frame"].where(
				df["Functionality"] == "productive", "In-frame with stop codon")
		else:
			has_blast = True
		files.append(df)
	patients[sample_id] = files

	if has_blast:
		print("Has a parsed blastn file, using limited columns.")
		columns = _COMMON_COLUMNS + _SAMPLE_COLUMNS
	else:
		columns = _COMMON_COLUMNS + _EXTENDED_COLUMNS + _SAMPLE_COLUMNS
		# The first file of the last sample decides whether the optional
		# column is included; it goes right before "N1-REGION-nt nb".
		if files and "N-REGION-nt nb" in files[0].columns:
			columns.insert(columns.index("N1-REGION-nt nb"), "N-REGION-nt nb")

	frames = []
	for patient_id, samples in patients.items():
		for replicate, sample in enumerate(samples, 1):
			sample["Sample"] = patient_id
			sample["Replicate"] = str(replicate)
			frames.append(sample[columns])

	if not frames:
		sys.exit("No input files were given; nothing to merge.")

	# One concat instead of repeated DataFrame.append (quadratic, removed in pandas 2).
	merged = pd.concat(frames)
	merged.to_csv(sys.argv[-1], sep="\t", index=False, index_label="index")

# Run the merge only when executed as a script, not when imported as a module.
if __name__ == "__main__":
	main()