Mercurial > repos > tduigou > seq_from_db
changeset 2:11a3752feb0a draft default tip
planemo upload for repository https://github.com/brsynth/galaxytools/tree/main/tools commit 7f5d8b62d749a0c41110cd9c04e0254e4fd44893-dirty
| author | tduigou |
|---|---|
| date | Wed, 15 Oct 2025 12:33:41 +0000 |
| parents | 7680420caf9f |
| children | |
| files | get_db_info.py seq_form_db.xml test-data/test_missing_input.csv |
| diffstat | 3 files changed, 129 insertions(+), 22 deletions(-) [+] |
line wrap: on
line diff
--- a/get_db_info.py Wed Jul 23 09:44:50 2025 +0000 +++ b/get_db_info.py Wed Oct 15 12:33:41 2025 +0000 @@ -93,7 +93,7 @@ raise Exception("Database connection failed after timeout.") -def fetch_annotations(csv_file, sequence_column, annotation_columns, db_uri, table_name, fragment_column_name, output): +def fetch_annotations(csv_file, sequence_column, annotation_columns, db_uri, table_name, fragment_column_name, output, output_report): """Fetch annotations from the database and save the result as GenBank files.""" db_uri = fix_db_uri(db_uri) df = pd.read_csv(csv_file, sep=',', header=None) @@ -130,10 +130,14 @@ db_fragments = set(fragment_map.keys()) missing_fragments = sorted(list(csv_fragments - db_fragments)) - if missing_fragments: - raise ValueError( - f" Missing fragments in DB: {', '.join(missing_fragments)}" - ) + + # Write report file + with open(output_report, "w") as report_file: + if missing_fragments: + for frag in missing_fragments: + report_file.write(f"{frag}\n") + else: + report_file.write("") # === CONTINUE WITH GB FILE CREATION === for _, row in df.iterrows(): @@ -164,8 +168,14 @@ try: for annotated_row in annotated_data: backbone_id = annotated_row["Backbone"] + for fragment in annotated_row["Fragments"]: fragment_id = fragment["id"] + + # Skip generation for missing fragments + if fragment_id in missing_fragments: + continue + sequence = fragment.get(sequence_column, "") annotation = fragment.get(annotation_columns, "") @@ -182,7 +192,7 @@ k: str(fragment[k]) for k in annotation_columns if k in fragment } - # LOCUS line extraction from annotation (copy-paste the LOCUS from annotation) + # LOCUS line extraction from annotation locus_line_match = re.search(r"LOCUS\s+.+", annotation) if locus_line_match: locus_line = locus_line_match.group() @@ -190,41 +200,36 @@ print(f"LOCUS info missing for fragment {fragment_id}") locus_line = f"LOCUS {fragment_id: <20} {len(sequence)} bp DNA linear UNK 01-JAN-2025" - # Format sequence as per GenBank standards 
(with ORIGIN and line breaks) + # Format sequence if "ORIGIN" in sequence: origin_block = sequence.strip() else: - # Format sequence as per GenBank standards (with ORIGIN and line breaks) formatted_sequence = "ORIGIN\n" seq_str = str(record.seq) - for i in range(0, len(seq_str), 60): # 60 bases per line + for i in range(0, len(seq_str), 60): line_seq = seq_str[i:i + 60] formatted_sequence += f"{str(i + 1).rjust(9)} { ' '.join([line_seq[j:j+10] for j in range(0, len(line_seq), 10)]) }\n" origin_block = formatted_sequence.strip() - # Find and copy the FEATURES section directly from annotation + # Extract FEATURES section features_section = "" features_start = annotation.find("FEATURES") if features_start != -1: features_section = annotation[features_start:] - # Writing the GenBank file + # Write GenBank file if not os.path.exists(output): os.makedirs(output) gb_filename = os.path.join(output, f"{fragment_id}.gb") with open(gb_filename, "w") as f: - # Write the LOCUS line f.write(locus_line + "\n") - # Write DEFINITION, ACCESSION, and other annotations f.write(f"DEFINITION {record.description}\n") f.write(f"ACCESSION {record.id}\n") f.write(f"VERSION DB\n") f.write(f"KEYWORDS .\n") f.write(f"SOURCE .\n") - # Write the FEATURES section directly from annotation f.write(features_section) - # Write the ORIGIN section f.write(origin_block + "\n") f.write("//\n") @@ -244,6 +249,7 @@ parser.add_argument("--fragment_column", required=False, help="Fragment column name in the database") parser.add_argument("--output", required=True, help="Output dir for gb files") parser.add_argument("--json_conf", required=False, help="JSON config file with DB parameters") + parser.add_argument("--report", required=True, help="Output report for fragments checking in DB") args = parser.parse_args() # get param and chek for json @@ -287,7 +293,7 @@ time.sleep(2) # Fetch annotations from the database and save as gb - fetch_annotations(args.input, sequence_column, annotation_column, db_uri, table, 
fragment_column, args.output) + fetch_annotations(args.input, sequence_column, annotation_column, db_uri, table, fragment_column, args.output, args.report) if __name__ == "__main__": main()
--- a/seq_form_db.xml Wed Jul 23 09:44:50 2025 +0000 +++ b/seq_form_db.xml Wed Oct 15 12:33:41 2025 +0000 @@ -1,8 +1,8 @@ <tool id="seq_form_db" name="Get sequences Data From DB" version="@TOOL_VERSION@+galaxy@VERSION_SUFFIX@" profile="21.09"> <description>Import fragment's data from an accessible DB and export it as .gb files</description> <macros> - <token name="@VERSION_SUFFIX@">1</token> - <token name="@TOOL_VERSION@">0.2.0</token> + <token name="@VERSION_SUFFIX@">2</token> + <token name="@TOOL_VERSION@">0.3.0</token> </macros> <requirements> <requirement type="package" version="2.2.3">pandas</requirement> @@ -25,6 +25,7 @@ --json_conf '$json_use.json_conf' #end if --output 'outdir' + --report '$report' ]]></command> <inputs> <param name="input" type="data" format="csv" label="Input CSV File" /> @@ -46,6 +47,7 @@ <collection name="output_gb" type="list" label="GenBank Files collection" > <discover_datasets pattern="(?P<name>.*).gb" format="genbank" directory="outdir" /> </collection> + <data name='report' format='txt' label='missing fragments' /> </outputs> <tests> <!--manual parameters--> @@ -121,6 +123,11 @@ </assert_contents> </element> </output_collection> + <output name='report'> + <assert_contents> + <has_n_lines n="0" /> + </assert_contents> + </output> </test> <!--JSON parameters--> <test> @@ -191,6 +198,96 @@ </assert_contents> </element> </output_collection> + <output name='report'> + <assert_contents> + <has_n_lines n="0" /> + </assert_contents> + </output> + </test> + <!--test missing fragments--> + <test> + <param name="input" value="test_missing_input.csv" /> + <conditional name="json_use"> + <param name='use_json_paramers' value='false' /> + <param name="table" value="sample" /> + <param name="sequence_column" value="sequence" /> + <param name="annotation_columns" value="annotation" /> + <param name="fragment_column" value="fragment" /> + <param name="db_uri" value="postgresql://postgres:RK17@localhost:5432/test_fragments_db" /> + </conditional> 
+ <output_collection name="output_gb" type="list" count="12"> + <element name="part_A"> + <assert_contents> + <has_n_lines min="10" /> + </assert_contents> + </element> + <element name="part_B"> + <assert_contents> + <has_n_lines min="10" /> + </assert_contents> + </element> + <element name="part_C"> + <assert_contents> + <has_n_lines min="10" /> + </assert_contents> + </element> + <element name="part_D"> + <assert_contents> + <has_n_lines min="10" /> + </assert_contents> + </element> + <element name="part_E"> + <assert_contents> + <has_n_lines min="10" /> + </assert_contents> + </element> + <element name="part_F"> + <assert_contents> + <has_n_lines min="10" /> + </assert_contents> + </element> + <element name="part_G"> + <assert_contents> + <has_n_lines min="10" /> + </assert_contents> + </element> + <element name="part_H"> + <assert_contents> + <has_n_lines min="10" /> + </assert_contents> + </element> + <element name="part_I"> + <assert_contents> + <has_n_lines min="10" /> + </assert_contents> + </element> + <element name="part_J"> + <assert_contents> + <has_n_lines min="10" /> + </assert_contents> + </element> + <element name="part_K"> + <assert_contents> + <has_n_lines min="10" /> + </assert_contents> + </element> + <element name="part_L"> + <assert_contents> + <has_n_lines min="10" /> + </assert_contents> + </element> + </output_collection> + <output name='report'> + <assert_contents> + <has_n_lines n="6" /> + <has_line_matching expression="ACP10001AaCbbBS" /> + <has_line_matching expression="NEW20001BbDccKT" /> + <has_line_matching expression="XYZ10003AaCbbBS" /> + <has_line_matching expression="CFP10002AaCbbBS" /> + <has_line_matching expression="ALT30005CcEddLM" /> + <has_line_matching expression="QWE10004AaCbbBS" /> + </assert_contents> + </output> </test> </tests>
--- a/test-data/test_missing_input.csv Wed Jul 23 09:44:50 2025 +0000
+++ b/test-data/test_missing_input.csv Wed Oct 15 12:33:41 2025 +0000
@@ -1,4 +1,8 @@
-Sample-1,ACP10001AaCbbBS,NEW20001BbDccKT,XYZ10003AaCbbBS
-Sample-2,CFP10002AaCbbBS,ACP10001AaCbbBS,ALT30005CcEddLM
-Sample-3,XYZ10003AaCbbBS,ALT30005CcEddLM,ACP10001AaCbbBS
-Sample-4,QWE10004AaCbbBS,NEW20001BbDccKT,CFP10002AaCbbBS
+Sample-1,ACP10001AaCbbBS,NEW20001BbDccKT,XYZ10003AaCbbBS,,,
+Sample-2,CFP10002AaCbbBS,ACP10001AaCbbBS,ALT30005CcEddLM,,,
+Sample-3,XYZ10003AaCbbBS,ALT30005CcEddLM,ACP10001AaCbbBS,,,
+Sample-4,QWE10004AaCbbBS,NEW20001BbDccKT,CFP10002AaCbbBS,,,
+construct_3,construct_1,part_L,part_J,part_K,,
+construct_4,construct_2,part_L,part_J,part_K,,
+construct_1,part_A,part_B,part_C,part_D,part_E,part_F
+construct_2,part_A,part_G,part_H,part_I,part_F,
