/*
 * Source: NGSrich_0.5.5/src/filters/ReadFilter.java @ 0:89ad0a9cca52 (default, tip)
 *
 * Uploaded
 * author:   pfrommolt
 * date:     Mon, 21 Nov 2011 08:12:19 -0500
 * parents:  (none listed)
 * children: (none listed)
 */

package filters;

import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileWriter;
import java.io.IOException;
import java.util.Scanner;
import datastructures.ReadLine;

public class ReadFilter extends Filter{

	File input, output;
	
	/**
	* Constructs a SamAdapter object. The output of an adaption is written to the
	* given file outputFileName.
	* 
	* @param inputFileName the name of the read alignment input file.
	* @param outputFileName the name of the output file containing the reduced 
	* format of the read alignment file. It must ends with ".red".
	*/
	public ReadFilter(String inputFileName, String outputFileName) {
		super(inputFileName, outputFileName);
		input = new File(inputFileName);
		output = new File(outputFileName);
	}
	
	/**
	 * <P>
	 * Uses ReadLine zu reduce each line of the read alignment file to following format:<BR>
	 * &#60;name> &#60;chrom> &#60;start> &#60;end> (tab delimited).
	 * </P>
	 * In the following we list the 12 fields of the sam-alignment-file. We mark the fields we are 
	 * interessted in with (!!):
	 * <PRE>
	 * 1. &#60;QNAME>	: Query pair NAME if paired; or Query NAME if unpaired 	(Ex: 6:105:18438:14421)	(!!)
	 * 2. &#60;FLAG>	: bitwise FLAG a₀a₁a₂a₃a₄a₅a₆a₇a₈a₉a₁₀		(Ex: 0 forward, 16 reverse strand)
	 *		  a₀ : 	the read is paired in sequencing, (no matter whether it is mapped in a pair)
	 *		  a₁ : 	the read is mapped in a proper pair	
	 *		  a₂ : 	the query sequence itself is unmapped
	 *		  a₃ : 	the mate is unmapped
	 *		  a₄ : 	strand of the query (0 for forward; 1 for reverse strand)
	 *		  a₅ : 	strand of the mate
	 *		  a₆ : 	the read is the first read in a pair
	 *		  a₇ : 	the read is the second read in a pair
	 *		  a₈ : 	the alignment is not primary
	 *		  a₉ : 	the read fails platform/vendor quality checks
	 *  		  a₁₀: 	the read is either a PCR duplicate or an optical duplicate
	 * 3. &#60;RNAME>	: Reference sequence NAME					(Ex: chr10)			(!!)
	 * 4. &#60;POS>		: 1-based leftmost POSition/coordinate of the clipped sequence 	(Ex: 60041)			(!!)
	 * 5. &#60;MAPQ>	: MAPping Quality 						(Ex: 0)
	 *		 			  (phred-scaled posterior probability that the mapping position of this read is incorrect)
	 * 6. &#60;CIGAR>	: extended CIGAR string						(Ex: 150M)
	 * 7. &#60;MRNM>	: Mate Reference sequence NaMe; “=” if the same as &#60;RNAME>	(Ex:*)
	 * 8. &#60;MPOS>	: 1-based leftmost Mate POSition of the clipped sequence	(Ex: 0)
	 * 9. &#60;ISIZE>	: inferred Insert SIZE						(Ex: 0)
	 * 10. &#60;SEQ>	: query SEQuence; “=” for a match to the reference; n/N/. for ambiguity; cases are not maintained (!!)
	 *					(Ex:	TGTTGTTGTTATTTCTGAATGACATTTACTTTGCTGCTCTTTATTTTGCG
	 *							TATTTAAAACTATTAGATCGTGTGATTATATTTGACAGGTCTTAATTGAC
	 *							GCGCTGTTCAGCCCTTTGAGTTCGGTTGAGTTTTGTGTTGGAGAATTTTC)
	 * 11. &#60;QUAL>	: query QUALity; ASCII-33 gives the Phred base quality
	 *					(Ex: 	/.8349-7:95@=8999;1:=;===AABD:=@A;>AD:E:9@==69<;@B3CBC@B8B;B89=8=3;@@@.:->>B?
	 *							C4CBB8EDGDD8GDEEDEEE8EBA9B???=B;,8:+5;;A??>?#############################)
	 * 12. [&#60;TAG>:&#60;VTYPE>:&#60;VALUE> [...]]: TAG/Value TYPE/match <VTYPE> (space allowed)
	 * 			(Ex:	XT:A:R	NM:i:2	X0:i:2	X1:i:0)
	 * </PRE>
	 */
	public void filter() {
		FileWriter fw = null;
		Scanner s = null;
		
		try {
			s= new Scanner(input);
		} catch (FileNotFoundException e) {
			System.err.println("sam file not found");
			e.printStackTrace();
		}
		
		try {
			if(output == null){
				output = new File(input.getName().
										substring(0,input.getName().lastIndexOf("."))+".rsam");
			}
			
			fw = new FileWriter(output);
		
		} catch (IOException e) {
			System.err.println("Error generating rsam file");
			e.printStackTrace();
		}
		
		String rawline;
		ReadLine line = null;
		
		do{
			rawline = s.nextLine();
		}while(rawline.startsWith("@"));
		
		do{
			try {
				line = new ReadLine(rawline);
				fw.write(line+"\r\n");
			} catch (IOException e) {
				System.err.println("Error writing reduced form of:\n"+rawline);
				e.printStackTrace();
			}
			if(s.hasNextLine())
				rawline = s.nextLine();
		}while(s.hasNextLine());
		
		
		try {
			fw.write(line +"\r\n");
		} catch (IOException e) {
			System.err.println("Error writing reduced form of:\n"+line);
			e.printStackTrace();
		}
		
		try {
			fw.close();
		} catch (IOException e) {
			System.err.println("Error closing file");
			e.printStackTrace();
		}
		s.close();
		
		System.out.println("READS FILE:");
		System.out.println(input.getAbsolutePath()+" reduced to "+
				output.getAbsolutePath());
		sort();
	}
	

	public void sort() {
		Runtime rt = Runtime.getRuntime();
		try {
			String rawOutput = output.getAbsolutePath();
			String outputName = output.getName();
			String pathname = output.getParentFile().getAbsolutePath()+"/"+outputName+"Sorted";
			
			output = new File(pathname);
			String tmpD=output.getParentFile().getAbsolutePath();

			if(!output.exists())output.createNewFile();
			String command = "sort -k2,2 -k3n,3 -T "+tmpD+" "+rawOutput;
			Process p = rt.exec(command);
			Scanner ps = new Scanner(p.getInputStream());

			FileWriter fw = new FileWriter(output);
			while(ps.hasNextLine()){
				String nextLine = ps.nextLine();
				fw.write(nextLine+"\n");
			}
			fw.close();

			Scanner psStdErr=new Scanner(p.getErrorStream());
			while(psStdErr.hasNextLine()){
				String errLine=psStdErr.nextLine();
				System.out.println(errLine);
			}
		
			new File(rawOutput).delete();
			new File(pathname).renameTo(new File(rawOutput));
			System.out.println("Reduced file "+new File(rawOutput).getAbsolutePath()+" sorted\n");
			
		} catch (IOException e1) {
			e1.printStackTrace();
		}
	}

	
	public String toString(){
		return "ReadFilter";
	}

	
}