extract_genomic_dna: extract_genomic

comparison extract_genomic_dna.py @ 11:80414c33a59a draft default tip

planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/extract_genomic_dna commit 6db2d98b513e4980788fcba49d809c91e5750296

author	iuc
date	Thu, 21 Nov 2024 07:20:29 +0000
parents	e400dcbc60d0
children

comparison

equal deleted inserted replaced

-:5cc8e93ee98f
+:80414c33a59a
 from bx.intervals.io import Comment, Header
 import extract_genomic_dna_utils as egdu  # noqa: I100,I202
 parser = argparse.ArgumentParser()
-parser.add_argument('--input_format', dest='input_format', help="Input dataset format")
+parser.add_argument("--input_format", dest="input_format", help="Input dataset format")
-parser.add_argument('--input', dest='input', help="Input dataset")
+parser.add_argument("--input", dest="input", help="Input dataset")
-parser.add_argument('--genome', dest='genome', help="Input dataset genome build")
+parser.add_argument("--genome", dest="genome", help="Input dataset genome build")
-parser.add_argument('--interpret_features', dest='interpret_features', default=None, help="Interpret features if input format is gff")
+parser.add_argument(
-parser.add_argument('--columns', dest='columns', help="Columns to use in input file")
+"--interpret_features",
-parser.add_argument('--reference_genome_source', dest='reference_genome_source', help="Source of reference genome file")
+dest="interpret_features",
-parser.add_argument('--reference_genome', dest='reference_genome', help="Reference genome file")
+default=None,
-parser.add_argument('--output_format', dest='output_format', help="Output format")
+help="Interpret features if input format is gff",
-parser.add_argument('--fasta_header_type', dest='fasta_header_type', default=None, help="Fasta header format")
+)
-parser.add_argument('--fasta_header_delimiter', dest='fasta_header_delimiter', default=None, help="Fasta header field delimiter")
+parser.add_argument("--columns", dest="columns", help="Columns to use in input file")
-parser.add_argument('--output', dest='output', help="Output dataset")
+parser.add_argument(
+"--reference_genome_source",
+dest="reference_genome_source",
+help="Source of reference genome file",
+)
+parser.add_argument(
+"--reference_genome", dest="reference_genome", help="Reference genome file"
+)
+parser.add_argument("--output_format", dest="output_format", help="Output format")
+parser.add_argument(
+"--fasta_header_type",
+dest="fasta_header_type",
+default=None,
+help="Fasta header format",
+)
+parser.add_argument(
+"--fasta_header_delimiter",
+dest="fasta_header_delimiter",
+default=None,
+help="Fasta header field delimiter",
+)
+parser.add_argument("--output", dest="output", help="Output dataset")
 args = parser.parse_args()
-input_is_gff = args.input_format == 'gff'
+input_is_gff = args.input_format == "gff"
 interpret_features = input_is_gff and args.interpret_features == "yes"
-if len(args.columns.split(',')) == 5:
+if len(args.columns.split(",")) == 5:
 # Bed file.
-chrom_col, start_col, end_col, strand_col, name_col = egdu.parse_cols_arg(args.columns)
+chrom_col, start_col, end_col, strand_col, name_col = egdu.parse_cols_arg(
+args.columns
+)
 else:
 # Gff file.
 chrom_col, start_col, end_col, strand_col = egdu.parse_cols_arg(args.columns)
 name_col = False
 nibs = {}
 skipped_lines = 0
 first_invalid_line = 0
 invalid_lines = []
 warnings = []
-warning = ''
+warning = ""
 twobitfile = None
 line_count = 1
 file_iterator = open(args.input)
 if interpret_features:
 file_iterator = egdu.GFFReaderWrapper(file_iterator, fix_strand=False)
-out = open(args.output, 'wt')
+out = open(args.output, "wt")
 for feature in file_iterator:
 # Ignore comments, headers.
 if isinstance(feature, (Header, Comment)):
 line_count += 1
 start = feature.start
 end = feature.end
 strand = feature.strand
 else:
 # Processing lines, either interval or GFF format.
-line = feature.rstrip('\r\n')
+line = feature.rstrip("\r\n")
 if line and not line.startswith("#"):
-fields = line.split('\t')
+fields = line.split("\t")
 try:
 chrom = fields[chrom_col]
 start = int(fields[start_col])
 end = int(fields[end_col])
 if name_col:
 if not invalid_lines:
 invalid_lines = egdu.get_lines(feature)
 first_invalid_line = line_count
 skipped_lines += len(invalid_lines)
 continue
-if strand not in ['+', '-']:
+if strand not in ["+", "-"]:
-strand = '+'
+strand = "+"
-sequence = ''
+sequence = ""
 else:
 continue
 # Open sequence file and get sequence for feature/interval.
 if os.path.exists("%s/%s.nib" % (seq_dir, chrom)):
 if chrom in nibs:
 nib = nibs[chrom]
 else:
-nibs[chrom] = nib = bx.seq.nib.NibFile(open("%s/%s.nib" % (seq_path, chrom)))
+nibs[chrom] = nib = bx.seq.nib.NibFile(
+open("%s/%s.nib" % (seq_path, chrom))
+)
 try:
 sequence = nib.get(start, end - start)
-except Exception as e:
+except Exception:
-warning = "Unable to fetch the sequence from '%d' to '%d' for build '%s'. " % (start, end - start, args.genome)
+warning = (
+"Unable to fetch the sequence from '%d' to '%d' for build '%s'. "
+% (start, end - start, args.genome)
+)
 warnings.append(warning)
 if not invalid_lines:
 invalid_lines = egdu.get_lines(feature)
 first_invalid_line = line_count
 skipped_lines += len(invalid_lines)
 continue
 elif os.path.isfile(seq_path):
-if not(twobitfile):
+if not (twobitfile):
 twobitfile = bx.seq.twobit.TwoBitFile(open(seq_path))
 try:
 if interpret_features:
 # Create sequence from intervals within a feature.
-sequence = ''
+sequence = ""
 for interval in feature.intervals:
-sequence += twobitfile[interval.chrom][interval.start:interval.end]
+sequence += twobitfile[interval.chrom][
+interval.start: interval.end
+]
 else:
 sequence = twobitfile[chrom][start:end]
 except Exception:
-warning = "Unable to fetch the sequence from '%d' to '%d' for chrom '%s'. " % (start, end - start, chrom)
+warning = (
+"Unable to fetch the sequence from '%d' to '%d' for chrom '%s'. "
+% (start, end - start, chrom)
+)
 warnings.append(warning)
 if not invalid_lines:
 invalid_lines = egdu.get_lines(feature)
 first_invalid_line = line_count
 skipped_lines += len(invalid_lines)
 continue
 else:
-warning = "Chromosome by name '%s' was not found for build '%s'. " % (chrom, args.genome)
+warning = "Chromosome by name '%s' was not found for build '%s'. " % (
+chrom,
+args.genome,
+)
 warnings.append(warning)
 if not invalid_lines:
 invalid_lines = egdu.get_lines(feature)
 first_invalid_line = line_count
 skipped_lines += len(invalid_lines)
 continue
-if sequence == '':
+if sequence == "":
-warning = "Chrom: '%s', start: '%d', end: '%d' is either invalid or not present in build '%s'. " % (chrom, start, end, args.genome)
+warning = (
+"Chrom: '%s', start: '%d', end: '%d' is either invalid or not present in build '%s'. "
+% (chrom, start, end, args.genome)
+)
 warnings.append(warning)
 if not invalid_lines:
 invalid_lines = egdu.get_lines(feature)
 first_invalid_line = line_count
 skipped_lines += len(invalid_lines)
 sequence = egdu.reverse_complement(sequence)
 if args.output_format == "fasta":
 if input_is_gff:
 start, end = egdu.convert_bed_coords_to_gff([start, end])
 if args.fasta_header_type == "bedtools_getfasta_default":
-out.write(">%s\n" % egdu.get_bedtools_getfasta_default_header(str(chrom),
+out.write(
-str(start),
+">%s\n"
-str(end),
+% egdu.get_bedtools_getfasta_default_header(
-strand,
+str(chrom), str(start), str(end), strand, includes_strand_col
-includes_strand_col))
+)
+)
 else:
 # args.fasta_header_type == "char_delimited":
 fields = [args.genome, str(chrom), str(start), str(end), strand]
-field_delimiter = egdu.get_fasta_header_delimiter(args.fasta_header_delimiter)
+field_delimiter = egdu.get_fasta_header_delimiter(
+args.fasta_header_delimiter
+)
 meta_data = field_delimiter.join(fields)
 if name.strip():
 out.write(">%s %s\n" % (meta_data, name))
 else:
 out.write(">%s\n" % meta_data)
 out.write("%s\n" % str(sequence[c:b]))
 c = b
 else:
 # output_format == "interval".
 if interpret_features:
-meta_data = "\t".join([feature.chrom,
+meta_data = "\t".join(
-"galaxy_extract_genomic_dna",
+[
-"interval",
+feature.chrom,
-str(feature.start),
+"galaxy_extract_genomic_dna",
-str(feature.end),
+"interval",
-feature.score,
+str(feature.start),
-feature.strand,
+str(feature.end),
-".",
+feature.score,
-egdu.gff_attributes_to_str(feature.attributes, "GTF")])
+feature.strand,
+".",
+egdu.gff_attributes_to_str(feature.attributes, "GTF"),
+]
+)
 else:
 # Here fields was set up around line 73.
 meta_data = "\t".join(fields)
 if input_is_gff:
-format_str = "%s seq \"%s\";\n"
+format_str = '%s seq "%s";\n'
 else:
 format_str = "%s\t%s\n"
 out.write(format_str % (meta_data, str(sequence)))
 # Update line count.
 if isinstance(feature, egdu.GFFFeature):
 warn_msg = "%d warnings, 1st is: " % len(warnings)
 warn_msg += warnings[0]
 print(warn_msg)
 if skipped_lines:
 # Error message includes up to the first 10 skipped lines.
-print('Skipped %d invalid lines, 1st is #%d, "%s"' % (skipped_lines, first_invalid_line, '\n'.join(invalid_lines[:10])))
+print(
+'Skipped %d invalid lines, 1st is #%d, "%s"'
+% (skipped_lines, first_invalid_line, "\n".join(invalid_lines[:10]))
+)
 if args.reference_genome_source == "history":
 os.remove(seq_path)

Mercurial > repos > iuc > extract_genomic_dna

comparison extract_genomic_dna.py @ 11:80414c33a59a draft default tip