Mercurial > repos > qfabrepo > metadegalaxy_pear_stats

--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/pear_stats.py	Mon Sep 14 04:50:28 2020 +0000
@@ -0,0 +1,61 @@
+#!/usr/bin/env python
+import sys
+import argparse
+
+parser = argparse.ArgumentParser(
+    description="Parse multiple Pear statistic log to a tabular format\n" +
+                "Example:\n python pear_stats.py -i \"file1.log,file2.log\" -s \"samplename1 samplename2\" -o outputfile")
+parser.add_argument("-v","--version",action="version",version="%(prog)s 1.0")
+parser.add_argument("-i","--input",dest="inputfilelist",default=False,help="a list of input file")
+parser.add_argument("-s","--samplename", dest="samplename",default=False,help="a list of input filename")
+parser.add_argument("-o","--outfile",dest="outputfile",default=False,help="Pear statistic output")
+
+
+if(len(sys.argv) == 1):
+       parser.print_help(sys.stderr)
+       sys.exit()
+
+args = parser.parse_args()
+
+tags = ['Assembled reads','Discarded reads','Not assembled reads']
+LINESTART=30
+LINEEND  =LINESTART+2
+
+
+inputfiles=args.inputfilelist.split(',')
+inputfilenames=args.samplename.split(',')
+outputfile=open(args.outputfile,'w')
+
+allAssembled = 0
+
+def processfile(instr):
+	result=[]
+	with open(instr,'r') as f:
+		for linenum,line in enumerate(f):
+			if LINESTART <= linenum <= LINEEND:
+				ix = linenum-LINESTART
+				if (line.startswith(tags[ix])):
+					result.append(line.rstrip())
+					if (ix == 0):
+						token = line.strip().split('(')[1]
+						token = token.replace("%)","")
+						global allAssembled
+						allAssembled += float(token)
+					else:
+						print("ARGH!:", line)
+	return(result)
+
+for element in range(0,len(inputfiles)):
+    output=processfile(inputfiles[element])
+    output.insert(0,inputfilenames[element])
+    outputfile.write("\t".join(output))
+    outputfile.write("\n")
+
+averageAssembled = allAssembled / len(inputfiles)
+
+averageAssembledOut=["The above assessment has been performed on 1000 randomly selected reads per sample file.\nAverage % of overlapping paired-end reads =",str(averageAssembled),"\nIf the average percentage is greater than 50%, you can consider using workflow 16S_biodiversity_for_overlap_PE.\nHowever, if the average percentage is less than 50%, use 16S_biodiversity_nonoverlap_PE."]
+
+
+outputfile.write("\n\n\n")
+outputfile.write("\t".join(averageAssembledOut))
+outputfile.close()
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/pear_stats.xml	Mon Sep 14 04:50:28 2020 +0000
@@ -0,0 +1,100 @@
+<tool id="pearStat" name="PEAR Statistics" version="1.0.0">
+  <description>Generate paired-end read overlap statistics from PEAR log files</description>
+    <version_command>
+	    python ${__tool_directory__}/pear_stats.py --version
+	</version_command>
+  <command detect_errors="aggressive"><![CDATA[
+      ## Comma-separated dataset paths of the collection elements, in collection order.
+      #set $files = ','.join([str($file) for $file in $input])
+
+      ## Comma-separated element identifiers; the script uses them as sample names.
+      #set $names = ','.join([str($name) for $name in $input.keys()])
+
+      ## Single-quote every substituted value so that paths or names containing
+      ## spaces or shell metacharacters reach the script as a single argument.
+      python '$__tool_directory__/pear_stats.py' -i '$files' -s '$names' -o '$output'
+   ]]> </command>
+  <inputs>
+    <param name="input" type="data_collection" format="txt" collection_type="list" label="PEAR LOG FILE"/>
+  </inputs>
+  <outputs>
+    <!--<data format="tabular" name="output" label="${tool.name}.${input.display_name}"/>-->
+    <data format="tabular" name="output" label="${tool.name}.log"/>
+  </outputs>
+  <tests>
+    <test>
+	  <param name="input">
+	  	<collection type="list">
+	  		   <element name="F3D0" value="F3D0.log" />
+		   	   <element name="F3D1" value="F3D1.log" />
+      	</collection>
+	   </param>
+      <output name="output" file="test_output.txt"/>
+    </test>
+  </tests>
+
+  <help>
+**What it does**
+
+Creates a summary log file by extracting the "Assembled reads", "Discarded reads" and "Not assembled reads" lines from each PEAR_ log.
+
+.. _PEAR: https://sco.h-its.org/exelixis/web/software/pear/doc.html
+
+-----
+
+=======
+Example
+=======
+
+PEAR v0.9.6 [January 15, 2015]
+
+Citation - PEAR: a fast and accurate Illumina Paired-End reAd mergeR
+Zhang et al (2014) Bioinformatics 30(5): 614-620 | doi:10.1093/bioinformatics/btt593
+
+==================================== =======================================
+Field                                Parameter and output
+------------------------------------ ---------------------------------------
+Forward reads file.................: /mnt/galaxy/files/014/dataset_14938.dat
+Reverse reads file.................: /mnt/galaxy/files/014/dataset_14939.dat
+PHRED..............................: 33
+Using empirical frequencies........: NO
+Statistical method.................: OES
+Maximum assembly length............: 999999
+Minimum assembly length............: 50
+p-value............................: 0.010000
+Quality score threshold (trimming).: 0
+Minimum read size after trimming...: 1
+Maximal ratio of uncalled bases....: 1.000000
+Minimum overlap....................: 10
+Scoring method.....................: Scaled score
+Threads............................: 2
+
+Allocating memory..................: 200,000,000 bytes
+Assemblying reads: 0%
+Assemblying reads: 100%
+
+Assembled reads ...................: 906 / 990 (91.515%)
+Discarded reads ...................: 0 / 990 (0.000%)
+Not assembled reads ...............: 84 / 990 (8.485%)
+Assembled reads file...............: pear.assembled.fastq
+Discarded reads file...............: pear.discarded.fastq
+Unassembled forward reads file.....: pear.unassembled.forward.fastq
+Unassembled reverse reads file.....: pear.unassembled.reverse.fastq
+==================================== =======================================
+
+
+-----
+
+=========
+Resources
+=========
+
+**Wrapper Authors**
+
+QFAB Bioinformatics (support@qfab.org)
+
+  </help>
+  <citations>
+       <citation type="doi">10.1093/bioinformatics/btt593</citation>
+  </citations>
+</tool>
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/F3D0.log	Mon Sep 14 04:50:28 2020 +0000
@@ -0,0 +1,36 @@
+ ____  _____    _    ____
+|  _ \| ____|  / \  |  _ \
+| |_) |  _|   / _ \ | |_) |
+|  __/| |___ / ___ \|  _ <
+|_|   |_____/_/   \_\_| \_\
+
+PEAR v0.9.10 [May 30, 2016]
+
+Citation - PEAR: a fast and accurate Illumina Paired-End reAd mergeR
+Zhang et al (2014) Bioinformatics 30(5): 614-620 | doi:10.1093/bioinformatics/btt593
+
+Forward reads file.................: /mnt/galaxy/files/011/dataset_11542.dat
+Reverse reads file.................: /mnt/galaxy/files/011/dataset_11543.dat
+PHRED..............................: 33
+Using empirical frequencies........: NO
+Statistical method.................: OES
+Maximum assembly length............: 999999
+Minimum assembly length............: 50
+p-value............................: 0.010000
+Quality score threshold (trimming).: 0
+Minimum read size after trimming...: 1
+Maximal ratio of uncalled bases....: 1.000000
+Minimum overlap....................: 10
+Scoring method.....................: Scaled score
+Threads............................: 2
+
+Allocating memory..................: 200,000,000 bytes
+Assemblying reads: 0%
Assemblying reads: 100%
+
+Assembled reads ...................: 91 / 99 (91.919%)
+Discarded reads ...................: 0 / 99 (0.000%)
+Not assembled reads ...............: 8 / 99 (8.081%)
+Assembled reads file...............: pear.assembled.fastq
+Discarded reads file...............: pear.discarded.fastq
+Unassembled forward reads file.....: pear.unassembled.forward.fastq
+Unassembled reverse reads file.....: pear.unassembled.reverse.fastq
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/F3D1.log	Mon Sep 14 04:50:28 2020 +0000
@@ -0,0 +1,36 @@
+ ____  _____    _    ____
+|  _ \| ____|  / \  |  _ \
+| |_) |  _|   / _ \ | |_) |
+|  __/| |___ / ___ \|  _ <
+|_|   |_____/_/   \_\_| \_\
+
+PEAR v0.9.10 [May 30, 2016]
+
+Citation - PEAR: a fast and accurate Illumina Paired-End reAd mergeR
+Zhang et al (2014) Bioinformatics 30(5): 614-620 | doi:10.1093/bioinformatics/btt593
+
+Forward reads file.................: /mnt/galaxy/files/011/dataset_11546.dat
+Reverse reads file.................: /mnt/galaxy/files/011/dataset_11547.dat
+PHRED..............................: 33
+Using empirical frequencies........: NO
+Statistical method.................: OES
+Maximum assembly length............: 999999
+Minimum assembly length............: 50
+p-value............................: 0.010000
+Quality score threshold (trimming).: 0
+Minimum read size after trimming...: 1
+Maximal ratio of uncalled bases....: 1.000000
+Minimum overlap....................: 10
+Scoring method.....................: Scaled score
+Threads............................: 2
+
+Allocating memory..................: 200,000,000 bytes
+Assemblying reads: 0%
Assemblying reads: 100%
+
+Assembled reads ...................: 89 / 97 (91.753%)
+Discarded reads ...................: 0 / 97 (0.000%)
+Not assembled reads ...............: 8 / 97 (8.247%)
+Assembled reads file...............: pear.assembled.fastq
+Discarded reads file...............: pear.discarded.fastq
+Unassembled forward reads file.....: pear.unassembled.forward.fastq
+Unassembled reverse reads file.....: pear.unassembled.reverse.fastq
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/test_output.txt	Mon Sep 14 04:50:28 2020 +0000
@@ -0,0 +1,9 @@
+F3D0	Assembled reads ...................: 91 / 99 (91.919%)	Discarded reads ...................: 0 / 99 (0.000%)	Not assembled reads ...............: 8 / 99 (8.081%)
+F3D1	Assembled reads ...................: 89 / 97 (91.753%)	Discarded reads ...................: 0 / 97 (0.000%)	Not assembled reads ...............: 8 / 97 (8.247%)
+
+
+
+The above assessment has been performed on 1000 randomly selected reads per sample file.
+Average % of overlapping paired-end reads =	91.836
+If the average percentage is greater than 50%, you can consider using workflow 16S_biodiversity_for_overlap_PE.
+However, if the average percentage is less than 50%, use 16S_biodiversity_nonoverlap_PE.
\ No newline at end of file