Mercurial > repos > ynewton > extract_tumor_vs_normal_tcga_ids
view get_normal_vs_tumor_sample_ids.py @ 2:82e5ee13fe5a draft
Uploaded
author | ynewton |
---|---|
date | Thu, 17 Jan 2013 20:18:46 -0500 |
parents | 72b0123a8587 |
children |
line wrap: on
line source
"""Split the TCGA sample IDs found in a matrix header into normal and tumor lists.

Only the first (header) line of the tab-separated input matrix is read. The
first column is taken to be the row-label column and is skipped; every other
column label is tested against the two barcode patterns below. Matching IDs
are written one per line to the --out_normals and --out_tumor files.
"""
import optparse
import re

# Example barcode: TCGA-A2-A0D2-01A-21R-A034-07
# The fourth dash-separated field starts with a two-digit code; this script
# treats a leading "1" as a normal sample and a leading "0" as a tumor sample.
# Raw strings keep "\w" and "\d" from being read as (invalid) string escapes.
normal_sample_re = re.compile(r'TCGA-\w\w-\w\w\w\w-1\d\w-.*')
tumor_sample_re = re.compile(r"TCGA-\w\w-\w\w\w\w-0\d\w-.*")


def parse_header_line(line):
    """Return the column labels of a header line, without the first column.

    Only trailing whitespace is removed: stripping the left side as well
    would delete the leading tab of a header whose first cell is empty and
    shift every column by one, silently dropping the first sample ID.
    """
    return line.rstrip().split("\t")[1:]


def classify_samples(sample_ids):
    """Return ``(normal_samples, tumor_samples)``, preserving input order.

    IDs that match neither pattern are left out of both lists.
    """
    normal_samples = [s for s in sample_ids if normal_sample_re.match(s)]
    tumor_samples = [s for s in sample_ids if tumor_sample_re.match(s)]
    return normal_samples, tumor_samples


def read_sample_ids(matrix_path):
    """Read the header line of the matrix at *matrix_path* and return its sample IDs.

    An empty file yields an empty list.
    """
    with open(matrix_path, 'r') as expression_file:
        return parse_header_line(expression_file.readline())


def write_sample_ids(path, sample_ids):
    """Write *sample_ids* to *path*, one per line, followed by a final newline.

    An empty list still produces a single newline, matching what the
    original ``print >> file`` statement emitted.
    """
    with open(path, 'w') as out_file:
        out_file.write("\n".join(sample_ids) + "\n")


def main(argv=None):
    """Parse the command line (``sys.argv[1:]`` when *argv* is None) and run."""
    parser = optparse.OptionParser()
    parser.add_option("--in_matrix", dest="in_matrix", action="store", default="", help="")
    parser.add_option("--out_normals", dest="out_normals", action="store", default="", help="")
    parser.add_option("--out_tumor", dest="out_tumor", action="store", default="", help="")
    opts, _args = parser.parse_args(argv)

    # Fail with a usage message instead of an opaque open("") error.
    if not (opts.in_matrix and opts.out_normals and opts.out_tumor):
        parser.error("--in_matrix, --out_normals and --out_tumor are all required")

    normal_samples, tumor_samples = classify_samples(read_sample_ids(opts.in_matrix))
    write_sample_ids(opts.out_normals, normal_samples)
    write_sample_ids(opts.out_tumor, tumor_samples)


if __name__ == "__main__":
    main()