Mercurial > repos > iuc > extract_genomic_dna
comparison extract_genomic_dna.py @ 11:80414c33a59a draft default tip
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/extract_genomic_dna commit 6db2d98b513e4980788fcba49d809c91e5750296
| author | iuc |
|---|---|
| date | Thu, 21 Nov 2024 07:20:29 +0000 |
| parents | e400dcbc60d0 |
| children |
comparison
equal
deleted
inserted
replaced
| 10:5cc8e93ee98f | 11:80414c33a59a |
|---|---|
| 9 from bx.intervals.io import Comment, Header | 9 from bx.intervals.io import Comment, Header |
| 10 | 10 |
| 11 import extract_genomic_dna_utils as egdu # noqa: I100,I202 | 11 import extract_genomic_dna_utils as egdu # noqa: I100,I202 |
| 12 | 12 |
| 13 parser = argparse.ArgumentParser() | 13 parser = argparse.ArgumentParser() |
| 14 parser.add_argument('--input_format', dest='input_format', help="Input dataset format") | 14 parser.add_argument("--input_format", dest="input_format", help="Input dataset format") |
| 15 parser.add_argument('--input', dest='input', help="Input dataset") | 15 parser.add_argument("--input", dest="input", help="Input dataset") |
| 16 parser.add_argument('--genome', dest='genome', help="Input dataset genome build") | 16 parser.add_argument("--genome", dest="genome", help="Input dataset genome build") |
| 17 parser.add_argument('--interpret_features', dest='interpret_features', default=None, help="Interpret features if input format is gff") | 17 parser.add_argument( |
| 18 parser.add_argument('--columns', dest='columns', help="Columns to use in input file") | 18 "--interpret_features", |
| 19 parser.add_argument('--reference_genome_source', dest='reference_genome_source', help="Source of reference genome file") | 19 dest="interpret_features", |
| 20 parser.add_argument('--reference_genome', dest='reference_genome', help="Reference genome file") | 20 default=None, |
| 21 parser.add_argument('--output_format', dest='output_format', help="Output format") | 21 help="Interpret features if input format is gff", |
| 22 parser.add_argument('--fasta_header_type', dest='fasta_header_type', default=None, help="Fasta header format") | 22 ) |
| 23 parser.add_argument('--fasta_header_delimiter', dest='fasta_header_delimiter', default=None, help="Fasta header field delimiter") | 23 parser.add_argument("--columns", dest="columns", help="Columns to use in input file") |
| 24 parser.add_argument('--output', dest='output', help="Output dataset") | 24 parser.add_argument( |
| 25 "--reference_genome_source", | |
| 26 dest="reference_genome_source", | |
| 27 help="Source of reference genome file", | |
| 28 ) | |
| 29 parser.add_argument( | |
| 30 "--reference_genome", dest="reference_genome", help="Reference genome file" | |
| 31 ) | |
| 32 parser.add_argument("--output_format", dest="output_format", help="Output format") | |
| 33 parser.add_argument( | |
| 34 "--fasta_header_type", | |
| 35 dest="fasta_header_type", | |
| 36 default=None, | |
| 37 help="Fasta header format", | |
| 38 ) | |
| 39 parser.add_argument( | |
| 40 "--fasta_header_delimiter", | |
| 41 dest="fasta_header_delimiter", | |
| 42 default=None, | |
| 43 help="Fasta header field delimiter", | |
| 44 ) | |
| 45 parser.add_argument("--output", dest="output", help="Output dataset") | |
| 25 args = parser.parse_args() | 46 args = parser.parse_args() |
| 26 | 47 |
| 27 input_is_gff = args.input_format == 'gff' | 48 input_is_gff = args.input_format == "gff" |
| 28 interpret_features = input_is_gff and args.interpret_features == "yes" | 49 interpret_features = input_is_gff and args.interpret_features == "yes" |
| 29 if len(args.columns.split(',')) == 5: | 50 if len(args.columns.split(",")) == 5: |
| 30 # Bed file. | 51 # Bed file. |
| 31 chrom_col, start_col, end_col, strand_col, name_col = egdu.parse_cols_arg(args.columns) | 52 chrom_col, start_col, end_col, strand_col, name_col = egdu.parse_cols_arg( |
| 53 args.columns | |
| 54 ) | |
| 32 else: | 55 else: |
| 33 # Gff file. | 56 # Gff file. |
| 34 chrom_col, start_col, end_col, strand_col = egdu.parse_cols_arg(args.columns) | 57 chrom_col, start_col, end_col, strand_col = egdu.parse_cols_arg(args.columns) |
| 35 name_col = False | 58 name_col = False |
| 36 | 59 |
| 45 nibs = {} | 68 nibs = {} |
| 46 skipped_lines = 0 | 69 skipped_lines = 0 |
| 47 first_invalid_line = 0 | 70 first_invalid_line = 0 |
| 48 invalid_lines = [] | 71 invalid_lines = [] |
| 49 warnings = [] | 72 warnings = [] |
| 50 warning = '' | 73 warning = "" |
| 51 twobitfile = None | 74 twobitfile = None |
| 52 line_count = 1 | 75 line_count = 1 |
| 53 file_iterator = open(args.input) | 76 file_iterator = open(args.input) |
| 54 if interpret_features: | 77 if interpret_features: |
| 55 file_iterator = egdu.GFFReaderWrapper(file_iterator, fix_strand=False) | 78 file_iterator = egdu.GFFReaderWrapper(file_iterator, fix_strand=False) |
| 56 out = open(args.output, 'wt') | 79 out = open(args.output, "wt") |
| 57 | 80 |
| 58 for feature in file_iterator: | 81 for feature in file_iterator: |
| 59 # Ignore comments, headers. | 82 # Ignore comments, headers. |
| 60 if isinstance(feature, (Header, Comment)): | 83 if isinstance(feature, (Header, Comment)): |
| 61 line_count += 1 | 84 line_count += 1 |
| 68 start = feature.start | 91 start = feature.start |
| 69 end = feature.end | 92 end = feature.end |
| 70 strand = feature.strand | 93 strand = feature.strand |
| 71 else: | 94 else: |
| 72 # Processing lines, either interval or GFF format. | 95 # Processing lines, either interval or GFF format. |
| 73 line = feature.rstrip('\r\n') | 96 line = feature.rstrip("\r\n") |
| 74 if line and not line.startswith("#"): | 97 if line and not line.startswith("#"): |
| 75 fields = line.split('\t') | 98 fields = line.split("\t") |
| 76 try: | 99 try: |
| 77 chrom = fields[chrom_col] | 100 chrom = fields[chrom_col] |
| 78 start = int(fields[start_col]) | 101 start = int(fields[start_col]) |
| 79 end = int(fields[end_col]) | 102 end = int(fields[end_col]) |
| 80 if name_col: | 103 if name_col: |
| 97 if not invalid_lines: | 120 if not invalid_lines: |
| 98 invalid_lines = egdu.get_lines(feature) | 121 invalid_lines = egdu.get_lines(feature) |
| 99 first_invalid_line = line_count | 122 first_invalid_line = line_count |
| 100 skipped_lines += len(invalid_lines) | 123 skipped_lines += len(invalid_lines) |
| 101 continue | 124 continue |
| 102 if strand not in ['+', '-']: | 125 if strand not in ["+", "-"]: |
| 103 strand = '+' | 126 strand = "+" |
| 104 sequence = '' | 127 sequence = "" |
| 105 else: | 128 else: |
| 106 continue | 129 continue |
| 107 # Open sequence file and get sequence for feature/interval. | 130 # Open sequence file and get sequence for feature/interval. |
| 108 if os.path.exists("%s/%s.nib" % (seq_dir, chrom)): | 131 if os.path.exists("%s/%s.nib" % (seq_dir, chrom)): |
| 109 if chrom in nibs: | 132 if chrom in nibs: |
| 110 nib = nibs[chrom] | 133 nib = nibs[chrom] |
| 111 else: | 134 else: |
| 112 nibs[chrom] = nib = bx.seq.nib.NibFile(open("%s/%s.nib" % (seq_path, chrom))) | 135 nibs[chrom] = nib = bx.seq.nib.NibFile( |
| 136 open("%s/%s.nib" % (seq_path, chrom)) | |
| 137 ) | |
| 113 try: | 138 try: |
| 114 sequence = nib.get(start, end - start) | 139 sequence = nib.get(start, end - start) |
| 115 except Exception as e: | 140 except Exception: |
| 116 warning = "Unable to fetch the sequence from '%d' to '%d' for build '%s'. " % (start, end - start, args.genome) | 141 warning = ( |
| 142 "Unable to fetch the sequence from '%d' to '%d' for build '%s'. " | |
| 143 % (start, end - start, args.genome) | |
| 144 ) | |
| 117 warnings.append(warning) | 145 warnings.append(warning) |
| 118 if not invalid_lines: | 146 if not invalid_lines: |
| 119 invalid_lines = egdu.get_lines(feature) | 147 invalid_lines = egdu.get_lines(feature) |
| 120 first_invalid_line = line_count | 148 first_invalid_line = line_count |
| 121 skipped_lines += len(invalid_lines) | 149 skipped_lines += len(invalid_lines) |
| 122 continue | 150 continue |
| 123 elif os.path.isfile(seq_path): | 151 elif os.path.isfile(seq_path): |
| 124 if not(twobitfile): | 152 if not (twobitfile): |
| 125 twobitfile = bx.seq.twobit.TwoBitFile(open(seq_path)) | 153 twobitfile = bx.seq.twobit.TwoBitFile(open(seq_path)) |
| 126 try: | 154 try: |
| 127 if interpret_features: | 155 if interpret_features: |
| 128 # Create sequence from intervals within a feature. | 156 # Create sequence from intervals within a feature. |
| 129 sequence = '' | 157 sequence = "" |
| 130 for interval in feature.intervals: | 158 for interval in feature.intervals: |
| 131 sequence += twobitfile[interval.chrom][interval.start:interval.end] | 159 sequence += twobitfile[interval.chrom][ |
| 160 interval.start: interval.end | |
| 161 ] | |
| 132 else: | 162 else: |
| 133 sequence = twobitfile[chrom][start:end] | 163 sequence = twobitfile[chrom][start:end] |
| 134 except Exception: | 164 except Exception: |
| 135 warning = "Unable to fetch the sequence from '%d' to '%d' for chrom '%s'. " % (start, end - start, chrom) | 165 warning = ( |
| 166 "Unable to fetch the sequence from '%d' to '%d' for chrom '%s'. " | |
| 167 % (start, end - start, chrom) | |
| 168 ) | |
| 136 warnings.append(warning) | 169 warnings.append(warning) |
| 137 if not invalid_lines: | 170 if not invalid_lines: |
| 138 invalid_lines = egdu.get_lines(feature) | 171 invalid_lines = egdu.get_lines(feature) |
| 139 first_invalid_line = line_count | 172 first_invalid_line = line_count |
| 140 skipped_lines += len(invalid_lines) | 173 skipped_lines += len(invalid_lines) |
| 141 continue | 174 continue |
| 142 else: | 175 else: |
| 143 warning = "Chromosome by name '%s' was not found for build '%s'. " % (chrom, args.genome) | 176 warning = "Chromosome by name '%s' was not found for build '%s'. " % ( |
| 177 chrom, | |
| 178 args.genome, | |
| 179 ) | |
| 144 warnings.append(warning) | 180 warnings.append(warning) |
| 145 if not invalid_lines: | 181 if not invalid_lines: |
| 146 invalid_lines = egdu.get_lines(feature) | 182 invalid_lines = egdu.get_lines(feature) |
| 147 first_invalid_line = line_count | 183 first_invalid_line = line_count |
| 148 skipped_lines += len(invalid_lines) | 184 skipped_lines += len(invalid_lines) |
| 149 continue | 185 continue |
| 150 if sequence == '': | 186 if sequence == "": |
| 151 warning = "Chrom: '%s', start: '%d', end: '%d' is either invalid or not present in build '%s'. " % (chrom, start, end, args.genome) | 187 warning = ( |
| 188 "Chrom: '%s', start: '%d', end: '%d' is either invalid or not present in build '%s'. " | |
| 189 % (chrom, start, end, args.genome) | |
| 190 ) | |
| 152 warnings.append(warning) | 191 warnings.append(warning) |
| 153 if not invalid_lines: | 192 if not invalid_lines: |
| 154 invalid_lines = egdu.get_lines(feature) | 193 invalid_lines = egdu.get_lines(feature) |
| 155 first_invalid_line = line_count | 194 first_invalid_line = line_count |
| 156 skipped_lines += len(invalid_lines) | 195 skipped_lines += len(invalid_lines) |
| 159 sequence = egdu.reverse_complement(sequence) | 198 sequence = egdu.reverse_complement(sequence) |
| 160 if args.output_format == "fasta": | 199 if args.output_format == "fasta": |
| 161 if input_is_gff: | 200 if input_is_gff: |
| 162 start, end = egdu.convert_bed_coords_to_gff([start, end]) | 201 start, end = egdu.convert_bed_coords_to_gff([start, end]) |
| 163 if args.fasta_header_type == "bedtools_getfasta_default": | 202 if args.fasta_header_type == "bedtools_getfasta_default": |
| 164 out.write(">%s\n" % egdu.get_bedtools_getfasta_default_header(str(chrom), | 203 out.write( |
| 165 str(start), | 204 ">%s\n" |
| 166 str(end), | 205 % egdu.get_bedtools_getfasta_default_header( |
| 167 strand, | 206 str(chrom), str(start), str(end), strand, includes_strand_col |
| 168 includes_strand_col)) | 207 ) |
| 208 ) | |
| 169 else: | 209 else: |
| 170 # args.fasta_header_type == "char_delimited": | 210 # args.fasta_header_type == "char_delimited": |
| 171 fields = [args.genome, str(chrom), str(start), str(end), strand] | 211 fields = [args.genome, str(chrom), str(start), str(end), strand] |
| 172 field_delimiter = egdu.get_fasta_header_delimiter(args.fasta_header_delimiter) | 212 field_delimiter = egdu.get_fasta_header_delimiter( |
| 213 args.fasta_header_delimiter | |
| 214 ) | |
| 173 meta_data = field_delimiter.join(fields) | 215 meta_data = field_delimiter.join(fields) |
| 174 if name.strip(): | 216 if name.strip(): |
| 175 out.write(">%s %s\n" % (meta_data, name)) | 217 out.write(">%s %s\n" % (meta_data, name)) |
| 176 else: | 218 else: |
| 177 out.write(">%s\n" % meta_data) | 219 out.write(">%s\n" % meta_data) |
| 182 out.write("%s\n" % str(sequence[c:b])) | 224 out.write("%s\n" % str(sequence[c:b])) |
| 183 c = b | 225 c = b |
| 184 else: | 226 else: |
| 185 # output_format == "interval". | 227 # output_format == "interval". |
| 186 if interpret_features: | 228 if interpret_features: |
| 187 meta_data = "\t".join([feature.chrom, | 229 meta_data = "\t".join( |
| 188 "galaxy_extract_genomic_dna", | 230 [ |
| 189 "interval", | 231 feature.chrom, |
| 190 str(feature.start), | 232 "galaxy_extract_genomic_dna", |
| 191 str(feature.end), | 233 "interval", |
| 192 feature.score, | 234 str(feature.start), |
| 193 feature.strand, | 235 str(feature.end), |
| 194 ".", | 236 feature.score, |
| 195 egdu.gff_attributes_to_str(feature.attributes, "GTF")]) | 237 feature.strand, |
| 238 ".", | |
| 239 egdu.gff_attributes_to_str(feature.attributes, "GTF"), | |
| 240 ] | |
| 241 ) | |
| 196 else: | 242 else: |
| 197 # Here fields was set up around line 73. | 243 # Here fields was set up around line 73. |
| 198 meta_data = "\t".join(fields) | 244 meta_data = "\t".join(fields) |
| 199 if input_is_gff: | 245 if input_is_gff: |
| 200 format_str = "%s seq \"%s\";\n" | 246 format_str = '%s seq "%s";\n' |
| 201 else: | 247 else: |
| 202 format_str = "%s\t%s\n" | 248 format_str = "%s\t%s\n" |
| 203 out.write(format_str % (meta_data, str(sequence))) | 249 out.write(format_str % (meta_data, str(sequence))) |
| 204 # Update line count. | 250 # Update line count. |
| 205 if isinstance(feature, egdu.GFFFeature): | 251 if isinstance(feature, egdu.GFFFeature): |
| 212 warn_msg = "%d warnings, 1st is: " % len(warnings) | 258 warn_msg = "%d warnings, 1st is: " % len(warnings) |
| 213 warn_msg += warnings[0] | 259 warn_msg += warnings[0] |
| 214 print(warn_msg) | 260 print(warn_msg) |
| 215 if skipped_lines: | 261 if skipped_lines: |
| 216 # Error message includes up to the first 10 skipped lines. | 262 # Error message includes up to the first 10 skipped lines. |
| 217 print('Skipped %d invalid lines, 1st is #%d, "%s"' % (skipped_lines, first_invalid_line, '\n'.join(invalid_lines[:10]))) | 263 print( |
| 264 'Skipped %d invalid lines, 1st is #%d, "%s"' | |
| 265 % (skipped_lines, first_invalid_line, "\n".join(invalid_lines[:10])) | |
| 266 ) | |
| 218 | 267 |
| 219 if args.reference_genome_source == "history": | 268 if args.reference_genome_source == "history": |
| 220 os.remove(seq_path) | 269 os.remove(seq_path) |
