Mercurial > repos > artbio > lumpy_smoove
view vcf2hrdetect.py @ 11:5a326a6fa105 draft default tip
planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/main/tools/lumpy_smoove commit 8b10e8fc832f8ca7c32479e20d5edbd62088a3aa
author | artbio |
---|---|
date | Fri, 17 Oct 2025 17:21:17 +0000 |
parents | 7dcf61950215 |
children |
line wrap: on
line source
#!/usr/bin/env python import argparse import re import sys def create_arg_parser(): """Creates and returns the argument parser.""" parser = argparse.ArgumentParser( description=( "Convert a VCF file from lumpy-smoove to a tabular format " "compatible with the HRDetect pipeline." ) ) parser.add_argument( 'vcf_file', help='Path to the input VCF file.' ) parser.add_argument( 'output_file', help='Path to the output tabular file.' ) return parser def parse_breakend_alt(alt_field): """ Parses the ALT field for a breakend and returns chromosome and position. Args: alt_field (str): The ALT field (column 5) of a VCF line. Returns: tuple: A tuple containing (chromosome, position) or (None, None) if parsing fails. """ # Search for patterns ]chr:pos] or [chr:pos[ pattern = ( r"\](?P<chrom1>[^:]+):(?P<pos1>\d+)\]|" r"\[(?P<chrom2>[^:]+):(?P<pos2>\d+)\[" ) match = re.search(pattern, alt_field) if not match: return None, None groups = match.groupdict() chrom = groups['chrom1'] or groups['chrom2'] pos = groups['pos1'] or groups['pos2'] return chrom, pos def process_vcf(vcf_path, output_path): """ Reads a VCF file, converts it, and writes the result to a tabular file. Args: vcf_path (str): Path to the input VCF file. output_path (str): Path to the output tabular file. """ header = ["chr1", "pos1", "chr2", "pos2", "type"] try: with open(vcf_path, 'r') as infile, open(output_path, 'w') as outfile: outfile.write("\t".join(header) + "\n") for line in infile: if line.startswith('#'): continue fields = line.strip().split('\t') if len(fields) < 8: continue chrom1 = fields[0] pos1 = fields[1] info = fields[7] # Attempt to extract the structural variant type from the info svtype_match = re.search(r'SVTYPE=([^;]+)', info) if not svtype_match: continue # Skip lines without SVTYPE tag svtype = svtype_match.group(1) if svtype == "BND": # Breakend (INV or TRA) alt_field = fields[4] chrom2, pos2 = parse_breakend_alt(alt_field) if not (chrom2 and pos2): continue event_type = "INV" if chrom1 == chrom2 else "TRA" row = [chrom1, pos1, chrom2, pos2, event_type] outfile.write("\t".join(row) + "\n") else: # Other SV types (DEL, DUP, etc.) end_match = re.search(r'END=([^;]+)', info) if not end_match: continue pos2 = end_match.group(1) chrom2 = chrom1 row = [chrom1, pos1, chrom2, pos2, svtype] outfile.write("\t".join(row) + "\n") except FileNotFoundError: print(f"Error: File '{vcf_path}' not found.", file=sys.stderr) sys.exit(1) except IOError as e: print(f"IO Error: {e}", file=sys.stderr) sys.exit(1) def main(): """Main function of the script.""" parser = create_arg_parser() args = parser.parse_args() process_vcf(args.vcf_file, args.output_file) if __name__ == '__main__': main()