Mercurial > repos > davidvanzessen > demultiplex_emc
comparison demultiplex.py @ 3:b6d63b9efb8f draft
Uploaded
| author | davidvanzessen |
|---|---|
| date | Fri, 09 Nov 2018 05:52:15 -0500 |
| parents | 36c79869620b |
| children | 146bbd9d58f6 |
comparison
equal
deleted
inserted
replaced
| 2:500c2eee063d | 3:b6d63b9efb8f |
|---|---|
| 15 def sniff_format(file_path): | 15 def sniff_format(file_path): |
| 16 """ | 16 """ |
| 17 Try to guess the file format (fastq/fasta) by looking at the first character of the first line. | 17 Try to guess the file format (fastq/fasta) by looking at the first character of the first line. |
| 18 Should be '@' for fastq and '>' for fasta. | 18 Should be '@' for fastq and '>' for fasta. |
| 19 """ | 19 """ |
| 20 with open(file_path, 'rU') as file_handle: | 20 with open(file_path, 'r') as file_handle: |
| 21 for line in file_handle: | 21 for line in file_handle: |
| 22 if line.startswith("@"): | 22 if line.startswith("@"): |
| 23 return "fastq" | 23 return "fastq" |
| 24 if line.startswith(">"): | 24 if line.startswith(">"): |
| 25 return "fasta" | 25 return "fasta" |
| 26 break | 26 break |
| 27 return None | 27 return None |
| 28 | 28 |
| 29 | 29 |
| 30 def search_barcode_in_first_half(sequence, barcode): | 30 def search_barcode_in_first_half(sequence, barcode): |
| 31 if type(sequence) is Seq: | 31 if type(sequence) is Seq: |
| 32 sequence = str(sequence) | 32 sequence = str(sequence) |
| 33 elif type(sequence) is SeqRecord: | 33 elif type(sequence) is SeqRecord: |
| 34 sequence = str(sequence.seq) | 34 sequence = str(sequence.seq) |
| 146 ) | 146 ) |
| 147 | 147 |
| 148 total_sequences = 0 | 148 total_sequences = 0 |
| 149 sequences_assigned_by_id = defaultdict(int) | 149 sequences_assigned_by_id = defaultdict(int) |
| 150 | 150 |
| 151 with open(input_file_path, 'rU') as input_file_handle, open(discarded_output_file_path, 'w') as discarded_output_handle: | 151 with open(input_file_path, 'r') as input_file_handle, open(discarded_output_file_path, 'w') as discarded_output_handle: |
| 152 for record in SeqIO.parse(input_file_handle, input_format): | 152 for record in SeqIO.parse(input_file_handle, input_format): |
| 153 total_sequences += 1 | 153 total_sequences += 1 |
| 154 for ID, barcode_datas in barcode_data_dict.items(): | 154 for ID, barcode_datas in barcode_data_dict.items(): |
| 155 barcode_position, barcode_data, reverse = search_barcodes_in_sequence(barcode_datas, record) | 155 barcode_position, barcode_data, reverse = search_barcodes_in_sequence(barcode_datas, record) |
| 156 if barcode_position == -1: | 156 if barcode_position == -1: |
