changeset 0:1bc09d59c6d7 draft

Uploaded
author dcouvin
date Fri, 03 Sep 2021 22:33:14 +0000
parents
children d22e2cd2dab5
files SRArunInfo.pl SRArunInfo.xml
diffstat 2 files changed, 168 insertions(+), 0 deletions(-) [+]
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/SRArunInfo.pl	Fri Sep 03 22:33:14 2021 +0000
@@ -0,0 +1,131 @@
+#!/usr/bin/perl
+
+use strict;
+use warnings;
+
+################################################################
+# Script allowing to get SRA run information using accession ID
+# wget command is required to use this script
+################################################################
+
+# example of use: perl SRArunInfo.pl SRR7693877,SRR9850824,SRR9850830 summary_Runs.tsv
+#             OR: perl SRArunInfo.pl list_accessions.txt summary_Runs.tsv
+# (list_accessions.txt holds one accession per line)
+
+# Center names (e.g. abbreviation SC means: "The Wellcome Trust Sanger Institute")
+my %hashCenter = (
+  "SC" => "The Wellcome Trust Sanger Institute",
+  "BI" => "Broad Institute",
+);
+
+# Markup surrounding the country value in the FullXml HTML report. The second
+# pattern is the fallback for reports without a Country tag; its \s+ accepts
+# one or more blanks between "sea" and "region".
+my $COUNTRY_RE = qr{Country&gt;</b>(.*?)<b>};
+my $GEO_LOC_RE = qr{</b>geographic location \(country and/or sea\s+region\)<b>&lt;/TAG&gt;</b></div><div class="xml-tag"><b>&lt;VALUE&gt;</b>(.*?)<b>&lt;};
+
+# Download $url into $file with wget. The list form of system() bypasses the
+# shell, so no character of the URL or file name is interpreted as a command.
+# A failed download is only reported on STDERR.
+sub fetch {
+  my ($url, $file) = @_;
+  system('wget', '-q', '-O', $file, $url) == 0
+    or warn "wget could not retrieve $url (status $?)\n";
+  return;
+}
+
+# Return the country found in the HTML report $file, or "ND" when none is
+# present. The first non-empty Country tag wins; otherwise the first non-empty
+# "geographic location" sample attribute is used.
+sub extract_country {
+  my ($file) = @_;
+  my $fallback;
+  open(my $html, '<', $file) or die "open $file : $!";
+  while (my $line = <$html>) {
+    chomp $line;
+    if ($line =~ $COUNTRY_RE and length $1) {
+      my $country = $1;
+      close($html);
+      return $country;
+    }
+    if (!defined $fallback and $line =~ $GEO_LOC_RE and length $1) {
+      $fallback = $1;
+    }
+  }
+  close($html) or die "close file error : $!";
+  return defined $fallback ? $fallback : "ND";
+}
+
+# Return the comma separated fields of the last line of the runinfo CSV $file
+# that mentions $run (empty list when no line does).
+# NOTE: a quoted field containing a comma would shift the columns.
+sub read_run_info {
+  my ($file, $run) = @_;
+  my @fields = ();
+  open(my $csv, '<', $file) or die "open $file : $!";
+  while (my $line = <$csv>) {
+    chomp $line;
+    @fields = split(/,/, $line) if $line =~ /\Q$run\E/;
+  }
+  close($csv) or die "close file error : $!";
+  return @fields;
+}
+
+print "Welcome to SRA run Info\n";
+
+die "Usage: perl SRArunInfo.pl <accessions|list_file> <summary_file>\n"
+  unless @ARGV >= 2;
+
+my ($runs, $summary) = @ARGV[0, 1];   # e.g. $summary = "summary_Runs.tsv"
+
+# Accessions come from a comma separated list, a file (one per line) or one ID
+my @tabRuns = ();
+if ($runs =~ m/,/) {
+  @tabRuns = split(/,/, $runs);
+}
+elsif (-e $runs) {
+  open(my $handle, '<', $runs) or die "open $runs : $!";
+  @tabRuns = <$handle>;
+  close($handle);
+}
+else {
+  @tabRuns = ($runs);
+}
+
+# Trim blanks/line endings and drop empty entries (trailing comma, blank line)
+s/^\s+|\s+$//g for @tabRuns;
+@tabRuns = grep { length } @tabRuns;
+
+open(my $sum, '>', $summary) or die "open : $!";
+
+# Header line of the tab separated summary
+print {$sum} "Run\tRelease_Date\tBases (bp)\tAssembly_Name\tTaxonomyID\tScientific_Name (or species)\tCenter_Name\tConsent\tCountry\tLibrary_Strategy\tLibrary_Selection\tLibrary_Source\tLibrary_Layout\tPlatform\tModel\n";
+
+for my $run (@tabRuns) {
+  # The accession ends up in file names and URLs: accept only safe characters
+  if ($run !~ /\A[\w.-]+\z/) {
+    warn "Skipping invalid accession '$run'\n";
+    next;
+  }
+  my ($csvFile, $htmlFile) = ("./$run.csv", "./$run.html");
+
+  # runinfo CSV (one line of fields per run) and FullXml HTML report (country)
+  fetch("http://trace.ncbi.nlm.nih.gov/Traces/sra/sra.cgi?save=efetch&db=sra&rettype=runinfo&term=$run", $csvFile);
+  fetch("https://www.ncbi.nlm.nih.gov//sra?term=($run)%20NOT%20cluster_dbgap%5BPROP%5D&report=FullXml", $htmlFile);
+
+  my $country2 = extract_country($htmlFile);
+  my @tabCSV   = read_run_info($csvFile, $run);
+
+  # Remove only the two files downloaded for this run
+  unlink($csvFile, $htmlFile);
+
+  # Columns absent from the CSV line are written as empty strings
+  my @col = map { defined $tabCSV[$_] ? $tabCSV[$_] : "" } (0 .. 44);
+  my $tmpCenter = exists $hashCenter{$col[41]} ? $hashCenter{$col[41]} : "";
+
+  print {$sum} join("\t", $run, @col[1, 4, 8, 27, 28], "$col[41] ($tmpCenter)",
+                    $col[44], $country2, @col[12, 13, 14, 15, 18, 19]), "\n";
+}
+
+close($sum) or die "close file error : $!";
+
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/SRArunInfo.xml	Fri Sep 03 22:33:14 2021 +0000
@@ -0,0 +1,37 @@
+<tool id="sra_run_info" name="SRA run Info" version="1.0">
+  <description>Provide information from SRR/ERR run accessions</description>
+ <!-- SRArunInfo.pl calls wget, which must be available on the execution host.
+ <requirements>
+  <requirement type="package" version=""></requirement>
+</requirements> -->
+ <!-- Every substituted value is single-quoted so the shell passes it as one word. -->
+    <command detect_errors="aggressive"><![CDATA[
+
+	perl '${__tool_directory__}/SRArunInfo.pl' '$input' '$output'
+
+	]]></command>
+
+ <inputs>
+    <param name="input" type="text" area="true" value='SRR7693877,SRR9850824,SRR9850830' label="SRR/ERR accession number" help="Comma separated list of IDs (without spaces between IDs)"/>
+    
+  <!--<param format="tabular" name="input_files" type="data" multiple="true" label="Annotation files"/>-->
+ </inputs>
+
+ <outputs>
+	 <data format="tabular" name="output" label="Summary file"/>
+	 <!--<data format="tabular" name="output2" label="HTML file"/>-->
+ </outputs>
+
+ <help><![CDATA[
+SRArunInfo.pl is a Perl script allowing to provide information from SRR/ERR run accessions
+
+This script belongs to the getSequenceInfo supplementary tools
+
+Example of input: SRR7693877,SRR9850824,SRR9850830
+
+- GitHub: https://github.com/karubiotools/getSequenceInfo/tree/master/supplementary_tools
+]]>
+ </help>
+
+
+</tool>