#!/usr/bin/perl
#
# Provenance: this script was recovered from an HG changeset patch.
#   User dcouvin
#   Date 1630708394 0
#   Node ID 1bc09d59c6d76fc47fe9a85b2056a0cf889aff4b
#   Uploaded
#   diff -r 000000000000 -r 1bc09d59c6d7 SRArunInfo.pl
#   --- /dev/null Thu Jan 01 00:00:00 1970 +0000
#   +++ b/SRArunInfo.pl Fri Sep 03 22:33:14 2021 +0000
#   (original hunk: @@ -0,0 +1,131 @@)

use strict;
use warnings;

################################################################
# Script allowing to get SRA run information using accession ID
# wget command is required to use this script
################################################################
#
# Usage:
#   perl SRArunInfo.pl SRR7693877,SRR9850824,SRR9850830 summary_Runs.tsv
#   perl SRArunInfo.pl list_accessions.txt summary_Runs.tsv
#
# Arguments:
#   1. a comma-separated list of run accessions, OR the path of a file
#      holding one accession per line, OR a single accession;
#   2. the path of the tab-separated summary file to write.
#
# For every accession the script downloads two files into the current
# directory (<run>.csv and <run>.html), writes one row to the summary
# file, and removes the two downloaded files again.

# Center names (e.g. abbreviation SC means: "The Wellcome Trust Sanger Institute")
my %center_name_for = (
    SC => 'The Wellcome Trust Sanger Institute',
    BI => 'Broad Institute',
);

# Header of the summary file, in output order.
my @SUMMARY_COLUMNS = (
    'Run',
    'Release_Date',
    'Bases (bp)',
    'Assembly_Name',
    'TaxonomyID',
    'Scientific_Name (or species)',
    'Center_Name',
    'Consent',
    'Country',
    'Library_Strategy',
    'Library_Selection',
    'Library_Source',
    'Library_Layout',
    'Platform',
    'Model',
);

# Positions (0-based) in the run-info CSV row of the values written to the
# summary, in output order, excluding the run accession and the country.
my @RUNINFO_INDEXES = (1, 4, 8, 27, 28, 41, 44, 12, 13, 14, 15, 18, 19);

# NOTE(review): the copy of the script this was recovered from had lost
# part of the markup inside its country delimiters (the end delimiter of
# the first pattern was an empty string). The report is saved from a web
# page, so the XML may be entity-escaped there; both the raw (< >) and the
# escaped (&lt; &gt;) spellings are accepted. Confirm against a real report.
my $TAG_OPEN  = qr/(?:<|&lt;)/;
my $TAG_CLOSE = qr/(?:>|&gt;)/;

# A non-empty run of text on one line that stops before the next tag.
my $TEXT = qr/(?:(?!&lt;)[^<>\r\n])+?/;

# Primary source: the text that follows "Country>" up to the next tag.
my $COUNTRY_ELEMENT_RE = qr{
    Country $TAG_CLOSE \s* ($TEXT) \s* $TAG_OPEN
}x;

# Fallback source: the VALUE that follows the "geographic location" TAG.
# The attribute name is quoted with \Q..\E because it contains parentheses,
# which would otherwise be read as a capture group and never match.
my $GEO_ATTRIBUTE_RE = qr{
    \Qgeographic location (country and/or sea region)\E
    $TAG_OPEN /TAG $TAG_CLOSE
    (?: \s | <[^>]*> )*          # line break or markup between the elements
    $TAG_OPEN VALUE $TAG_CLOSE
    \s* ($TEXT) \s* $TAG_OPEN
}x;

# Turn the first command-line argument into a list of accessions.
# Surrounding whitespace (including a trailing CR) is trimmed and empty
# entries are dropped.
sub read_accessions {
    my ($spec) = @_;

    my @accessions;
    if ($spec =~ /,/) {
        @accessions = split /,/, $spec;
    }
    elsif (-e $spec) {
        open my $list_fh, '<', $spec or die "open $spec: $!";
        @accessions = <$list_fh>;
        close $list_fh or die "close $spec: $!";
    }
    else {
        @accessions = ($spec);
    }

    s/\A\s+|\s+\z//g for @accessions;
    return grep { length } @accessions;
}

# Download $url into $path with wget. The list form of system() is used so
# that no shell ever sees the accession. Returns true on success; a failure
# is reported with warn() and left to the caller's file handling.
sub fetch_to_file {
    my ($url, $path) = @_;

    my $status = system('wget', '-q', '-O', $path, $url);
    return 1 if $status == 0;

    if ($status == -1) {
        warn "could not run wget: $!\n";
    }
    else {
        warn "wget failed for $url (exit status " . ($status >> 8) . ")\n";
    }
    return 0;
}

# Return the whole content of $path as one string (dies if unreadable).
sub slurp_file {
    my ($path) = @_;

    open my $fh, '<', $path or die "open $path: $!";
    my $content = do { local $/; <$fh> };
    close $fh or die "close file error : $!";

    return defined $content ? $content : '';
}

# Find the country in the report text. The first match wins; the fallback
# pattern is only tried when the primary one finds nothing. Returns "ND"
# when neither matches.
sub extract_country {
    my ($report) = @_;

    my ($country) = $report =~ $COUNTRY_ELEMENT_RE;
    return $country if $country;

    ($country) = $report =~ $GEO_ATTRIBUTE_RE;
    if ($country) {
        print "Country2 = " . $country . "\n";
        return $country;
    }

    return "ND";
}

# Return the comma-separated fields of the last line of $path that contains
# $accession (literal match), or an empty list when no line does.
# Fields are split on every comma; quoted commas are not recognised.
sub read_runinfo_fields {
    my ($path, $accession) = @_;

    my @fields;
    open my $fh, '<', $path or die "open $path: $!";
    while (my $line = <$fh>) {
        $line =~ s/\r?\n\z//;
        next if index($line, $accession) < 0;
        @fields = split /,/, $line, -1;
    }
    close $fh or die "close file error : $!";

    return @fields;
}

print "Welcome to SRA run Info\n";

my ($runs, $summary) = @ARGV;
if (!defined $runs || !length $runs || !defined $summary || !length $summary) {
    die "Usage: perl SRArunInfo.pl <accessions|list_file> <summary.tsv>\n";
}

open my $sum_fh, '>', $summary or die "open $summary: $!";
print {$sum_fh} join("\t", @SUMMARY_COLUMNS), "\n";

RUN:
for my $run (read_accessions($runs)) {

    # The accession becomes part of two file names and two URLs.
    if ($run !~ /\A[A-Za-z0-9_.-]+\z/) {
        warn "Skipping invalid accession '$run'\n";
        next RUN;
    }

    my $csv_file  = "./$run.csv";
    my $html_file = "./$run.html";

    fetch_to_file(
        "http://trace.ncbi.nlm.nih.gov/Traces/sra/sra.cgi?save=efetch&db=sra&rettype=runinfo&term=$run",
        $csv_file,
    );
    fetch_to_file(
        "https://www.ncbi.nlm.nih.gov//sra?term=($run)%20NOT%20cluster_dbgap%5BPROP%5D&report=FullXml",
        $html_file,
    );

    # Everything below is computed afresh for each run, so a run with no
    # usable data can never inherit values from the previous one.
    my $country = extract_country(slurp_file($html_file));
    my @fields  = read_runinfo_fields($csv_file, $run);

    # Remove only the files this script created.
    unlink $csv_file, $html_file;

    # Missing columns are written as empty strings.
    my (
        $release_date, $bases,    $assembly,  $taxonomy_id, $species,
        $center,       $consent,  $strategy,  $selection,   $source,
        $layout,       $platform, $model,
    ) = map { defined $_ ? $_ : '' } @fields[@RUNINFO_INDEXES];

    my $center_full = $center_name_for{$center} ? $center_name_for{$center} : "";

    print {$sum_fh} join(
        "\t",
        $run,       $release_date, $bases,   $assembly,
        $taxonomy_id, $species,    "$center ($center_full)",
        $consent,   $country,      $strategy, $selection,
        $source,    $layout,       $platform, $model,
    ), "\n";
}

close $sum_fh or die "close file error : $!";

# The same changeset also added a second file:
#   diff -r 000000000000 -r 1bc09d59c6d7 SRArunInfo.xml
#   --- /dev/null Thu Jan 01 00:00:00 1970 +0000
#   +++ b/SRArunInfo.xml Fri Sep 03 22:33:14 2021 +0000
#   (original hunk: @@ -0,0 +1,37 @@)
# Only its description text survives in this copy:
#   "Provide information from SRR/ERR run accessions"