comparison get_db_info.py @ 2:11a3752feb0a draft default tip

planemo upload for repository https://github.com/brsynth/galaxytools/tree/main/tools commit 7f5d8b62d749a0c41110cd9c04e0254e4fd44893-dirty
author tduigou
date Wed, 15 Oct 2025 12:33:41 +0000
parents 7680420caf9f
children
comparison
equal deleted inserted replaced
1:7680420caf9f 2:11a3752feb0a
91 print("Database not ready, retrying...") 91 print("Database not ready, retrying...")
92 time.sleep(2) 92 time.sleep(2)
93 raise Exception("Database connection failed after timeout.") 93 raise Exception("Database connection failed after timeout.")
94 94
95 95
96 def fetch_annotations(csv_file, sequence_column, annotation_columns, db_uri, table_name, fragment_column_name, output): 96 def fetch_annotations(csv_file, sequence_column, annotation_columns, db_uri, table_name, fragment_column_name, output, output_report):
97 """Fetch annotations from the database and save the result as GenBank files.""" 97 """Fetch annotations from the database and save the result as GenBank files."""
98 db_uri = fix_db_uri(db_uri) 98 db_uri = fix_db_uri(db_uri)
99 df = pd.read_csv(csv_file, sep=',', header=None) 99 df = pd.read_csv(csv_file, sep=',', header=None)
100 100
101 engine = create_engine(db_uri) 101 engine = create_engine(db_uri)
128 if fragment_str not in all_ids: 128 if fragment_str not in all_ids:
129 csv_fragments.add(fragment_str) 129 csv_fragments.add(fragment_str)
130 130
131 db_fragments = set(fragment_map.keys()) 131 db_fragments = set(fragment_map.keys())
132 missing_fragments = sorted(list(csv_fragments - db_fragments)) 132 missing_fragments = sorted(list(csv_fragments - db_fragments))
133 if missing_fragments: 133
134 raise ValueError( 134 # Write report file
135 f" Missing fragments in DB: {', '.join(missing_fragments)}" 135 with open(output_report, "w") as report_file:
136 ) 136 if missing_fragments:
137 for frag in missing_fragments:
138 report_file.write(f"{frag}\n")
139 else:
140 report_file.write("")
137 141
138 # === CONTINUE WITH GB FILE CREATION === 142 # === CONTINUE WITH GB FILE CREATION ===
139 for _, row in df.iterrows(): 143 for _, row in df.iterrows():
140 annotated_row = {"Backbone": row[0], "Fragments": []} 144 annotated_row = {"Backbone": row[0], "Fragments": []}
141 for col in df.columns: 145 for col in df.columns:
162 166
163 # GenBank file generation per fragment 167 # GenBank file generation per fragment
164 try: 168 try:
165 for annotated_row in annotated_data: 169 for annotated_row in annotated_data:
166 backbone_id = annotated_row["Backbone"] 170 backbone_id = annotated_row["Backbone"]
171
167 for fragment in annotated_row["Fragments"]: 172 for fragment in annotated_row["Fragments"]:
168 fragment_id = fragment["id"] 173 fragment_id = fragment["id"]
174
175 # Skip generation for missing fragments
176 if fragment_id in missing_fragments:
177 continue
178
169 sequence = fragment.get(sequence_column, "") 179 sequence = fragment.get(sequence_column, "")
170 annotation = fragment.get(annotation_columns, "") 180 annotation = fragment.get(annotation_columns, "")
171 181
172 # Create the SeqRecord 182 # Create the SeqRecord
173 record = SeqRecord( 183 record = SeqRecord(
180 # Add annotations to GenBank header 190 # Add annotations to GenBank header
181 record.annotations = { 191 record.annotations = {
182 k: str(fragment[k]) for k in annotation_columns if k in fragment 192 k: str(fragment[k]) for k in annotation_columns if k in fragment
183 } 193 }
184 194
185 # LOCUS line extraction from annotation (copy-paste the LOCUS from annotation) 195 # LOCUS line extraction from annotation
186 locus_line_match = re.search(r"LOCUS\s+.+", annotation) 196 locus_line_match = re.search(r"LOCUS\s+.+", annotation)
187 if locus_line_match: 197 if locus_line_match:
188 locus_line = locus_line_match.group() 198 locus_line = locus_line_match.group()
189 else: 199 else:
190 print(f"LOCUS info missing for fragment {fragment_id}") 200 print(f"LOCUS info missing for fragment {fragment_id}")
191 locus_line = f"LOCUS {fragment_id: <20} {len(sequence)} bp DNA linear UNK 01-JAN-2025" 201 locus_line = f"LOCUS {fragment_id: <20} {len(sequence)} bp DNA linear UNK 01-JAN-2025"
192 202
193 # Format sequence as per GenBank standards (with ORIGIN and line breaks) 203 # Format sequence
194 if "ORIGIN" in sequence: 204 if "ORIGIN" in sequence:
195 origin_block = sequence.strip() 205 origin_block = sequence.strip()
196 else: 206 else:
197 # Format sequence as per GenBank standards (with ORIGIN and line breaks)
198 formatted_sequence = "ORIGIN\n" 207 formatted_sequence = "ORIGIN\n"
199 seq_str = str(record.seq) 208 seq_str = str(record.seq)
200 for i in range(0, len(seq_str), 60): # 60 bases per line 209 for i in range(0, len(seq_str), 60):
201 line_seq = seq_str[i:i + 60] 210 line_seq = seq_str[i:i + 60]
202 formatted_sequence += f"{str(i + 1).rjust(9)} { ' '.join([line_seq[j:j+10] for j in range(0, len(line_seq), 10)]) }\n" 211 formatted_sequence += f"{str(i + 1).rjust(9)} { ' '.join([line_seq[j:j+10] for j in range(0, len(line_seq), 10)]) }\n"
203 origin_block = formatted_sequence.strip() 212 origin_block = formatted_sequence.strip()
204 213
205 # Find and copy the FEATURES section directly from annotation 214 # Extract FEATURES section
206 features_section = "" 215 features_section = ""
207 features_start = annotation.find("FEATURES") 216 features_start = annotation.find("FEATURES")
208 if features_start != -1: 217 if features_start != -1:
209 features_section = annotation[features_start:] 218 features_section = annotation[features_start:]
210 219
211 # Writing the GenBank file 220 # Write GenBank file
212 if not os.path.exists(output): 221 if not os.path.exists(output):
213 os.makedirs(output) 222 os.makedirs(output)
214 223
215 gb_filename = os.path.join(output, f"{fragment_id}.gb") 224 gb_filename = os.path.join(output, f"{fragment_id}.gb")
216 with open(gb_filename, "w") as f: 225 with open(gb_filename, "w") as f:
217 # Write the LOCUS line
218 f.write(locus_line + "\n") 226 f.write(locus_line + "\n")
219 # Write DEFINITION, ACCESSION, and other annotations
220 f.write(f"DEFINITION {record.description}\n") 227 f.write(f"DEFINITION {record.description}\n")
221 f.write(f"ACCESSION {record.id}\n") 228 f.write(f"ACCESSION {record.id}\n")
222 f.write(f"VERSION DB\n") 229 f.write(f"VERSION DB\n")
223 f.write(f"KEYWORDS .\n") 230 f.write(f"KEYWORDS .\n")
224 f.write(f"SOURCE .\n") 231 f.write(f"SOURCE .\n")
225 # Write the FEATURES section directly from annotation
226 f.write(features_section) 232 f.write(features_section)
227 # Write the ORIGIN section
228 f.write(origin_block + "\n") 233 f.write(origin_block + "\n")
229 f.write("//\n") 234 f.write("//\n")
230 235
231 except Exception as e: 236 except Exception as e:
232 print(f"Error saving GenBank files: {e}") 237 print(f"Error saving GenBank files: {e}")
242 parser.add_argument("--db_uri", required=False, help="Database URI connection string") 247 parser.add_argument("--db_uri", required=False, help="Database URI connection string")
243 parser.add_argument("--table", required=False, help="Table name in the database") 248 parser.add_argument("--table", required=False, help="Table name in the database")
244 parser.add_argument("--fragment_column", required=False, help="Fragment column name in the database") 249 parser.add_argument("--fragment_column", required=False, help="Fragment column name in the database")
245 parser.add_argument("--output", required=True, help="Output dir for gb files") 250 parser.add_argument("--output", required=True, help="Output dir for gb files")
246 parser.add_argument("--json_conf", required=False, help="JSON config file with DB parameters") 251 parser.add_argument("--json_conf", required=False, help="JSON config file with DB parameters")
252 parser.add_argument("--report", required=True, help="Output report for fragments checking in DB")
247 args = parser.parse_args() 253 args = parser.parse_args()
248 254
249 # get param and chek for json 255 # get param and chek for json
250 config_params = {} 256 config_params = {}
251 use_json = args.use_json_paramers == 'true' 257 use_json = args.use_json_paramers == 'true'
285 raise e 291 raise e
286 else: 292 else:
287 time.sleep(2) 293 time.sleep(2)
288 294
289 # Fetch annotations from the database and save as gb 295 # Fetch annotations from the database and save as gb
290 fetch_annotations(args.input, sequence_column, annotation_column, db_uri, table, fragment_column, args.output) 296 fetch_annotations(args.input, sequence_column, annotation_column, db_uri, table, fragment_column, args.output, args.report)
291 297
292 if __name__ == "__main__": 298 if __name__ == "__main__":
293 main() 299 main()