Splitting Fastq files for high-throughput computing

Posted on 2015-08-06

Most of the time, reads from massively parallel sequencers (e.g. Illumina) are treated as individual records and undergo the same analytic pipeline (adaptor trimming, mapping, etc.) in parallel. Run time can therefore be improved if high-throughput computing (HTC) resources are available. As encouraged by the HTC architecture, large files are split into smaller packages and processed in a massively parallel way. In the world of genetics, a large fastq file can often be split into smaller fastq files that are then distributed to the computing nodes. Splitting files thus becomes an important step whenever results come off the sequencer. However, raw fastq files often come in gzip format, and the native UNIX split command can neither read gzip input nor write gzip output. Thus, I have written a C++ program to make this more efficient. This program supports gzip I/O. The code is hosted with my other fastq-tools on GitHub.

usage: bin/splitFastq -i <fqfile> -n <# of record per file> -o <prefix> [-z]
[options]
-i    <fastq file> can be gzipped
-n    <number of record in each splitted file> default: 10000000
-o    <prefix>
-z    optional: gzip output

	#include <string.h>
	#include <unistd.h>

	#include <cstdlib>
	#include <cstring>
	#include <fstream>
	#include <iostream>
	#include <sstream>
	#include <string>

	#include <gzstream.h>

	using namespace std;
	// Split a fastq file into numbered chunk files, each holding a fixed
	// number of records, with optional gzip-compressed output.

	// Format a chunk index as the numeric suffix used in output file names.
	//
	// Values below 10 are left-padded with a single '0' ("00" ... "09") so
	// that the first hundred chunk names sort in order; values of 10 and
	// above are returned unpadded.
	//
	// filenum: zero-based index of the output chunk
	// returns: the index rendered as text
	std::string fixfilenum(int filenum)
	{
		std::ostringstream convert;
		// Pad single-digit values only. The earlier test (filenum > 10) also
		// padded 10 itself, yielding "010" instead of "10".
		if (filenum < 10)
		{
			convert << '0';
		}
		convert << filenum;
		return convert.str();
	}

	// Split a fastq file into uncompressed chunk files.
	//
	// The input is read line by line through igzstream (the tool's usage
	// text states it may be gzipped). Output files are named
	// <filePrefix>_<NN>.fastq, where NN comes from fixfilenum(), and each
	// receives recordNum records (4 lines per record); the last file gets
	// whatever remains. Progress and errors are reported on stderr.
	//
	// fqFile:     path of the input fastq file
	// filePrefix: prefix of every output file name
	// recordNum:  records per output file; a value <= 0 never starts a new
	//             file, so all lines land in a single output
	void splitFastq(char *fqFile, std::string filePrefix, int recordNum)
	{
		std::cerr << "From " << fqFile << "...." << std::endl;
		std::cerr << "Splitting " << recordNum << " records per file" << std::endl;

		// A fastq record is 4 lines. Widen before multiplying so that a large
		// record count cannot overflow int.
		const long long maxLine = static_cast<long long>(recordNum) * 4;
		long long lineCount = 0;
		int filenum = 0;
		std::string filename;

		igzstream in(fqFile);
		if (!in)
		{
			// Without this check an unreadable input was indistinguishable
			// from an empty one.
			std::cerr << "Cannot open " << fqFile << std::endl;
			return;
		}

		std::ofstream outFile;
		for (std::string line; std::getline(in, line);)
		{
			// The current chunk is full: finish it before starting the next.
			if (lineCount != 0 && lineCount == maxLine)
			{
				outFile.close();
				std::cerr << "written " << filename << std::endl;
				lineCount = 0;
				filenum++;
			}
			// First line of a chunk: open the file that will hold it.
			if (lineCount == 0)
			{
				filename = filePrefix + "_" + fixfilenum(filenum) + ".fastq";
				outFile.open(filename.c_str());
				if (!outFile)
				{
					std::cerr << "Cannot open " << filename << " for writing" << std::endl;
					return;
				}
			}
			outFile << line << '\n';
			lineCount++;
		}

		// Report a final file only if one was actually opened; an empty input
		// produces no output file at all.
		if (!filename.empty())
		{
			outFile.close();
			std::cerr << "written " << filename << std::endl;
		}
	}


	// Split a fastq file into gzip-compressed chunk files.
	//
	// The input is read line by line through igzstream (the tool's usage
	// text states it may be gzipped). Output files are written through
	// ogzstream and named <filePrefix>_<NN>.fastq.gz, where NN comes from
	// fixfilenum(); each receives recordNum records (4 lines per record) and
	// the last file gets whatever remains. Progress and errors are reported
	// on stderr.
	//
	// fqFile:     path of the input fastq file
	// filePrefix: prefix of every output file name
	// recordNum:  records per output file; a value <= 0 never starts a new
	//             file, so all lines land in a single output
	void splitFastqZip(char *fqFile, std::string filePrefix, int recordNum)
	{
		std::cerr << "From " << fqFile << "...." << std::endl;
		std::cerr << "Splitting " << recordNum << " records per file" << std::endl;

		// A fastq record is 4 lines. Widen before multiplying so that a large
		// record count cannot overflow int.
		const long long maxLine = static_cast<long long>(recordNum) * 4;
		long long lineCount = 0;
		int filenum = 0;
		std::string filename;

		igzstream in(fqFile);
		if (!in)
		{
			// Without this check an unreadable input was indistinguishable
			// from an empty one.
			std::cerr << "Cannot open " << fqFile << std::endl;
			return;
		}

		ogzstream outFile;
		for (std::string line; std::getline(in, line);)
		{
			// The current chunk is full: finish it before starting the next.
			if (lineCount != 0 && lineCount == maxLine)
			{
				outFile.close();
				std::cerr << "written " << filename << std::endl;
				lineCount = 0;
				filenum++;
			}
			// First line of a chunk: open the file that will hold it.
			if (lineCount == 0)
			{
				filename = filePrefix + "_" + fixfilenum(filenum) + ".fastq.gz";
				outFile.open(filename.c_str());
				if (!outFile)
				{
					std::cerr << "Cannot open " << filename << " for writing" << std::endl;
					return;
				}
			}
			outFile << line << '\n';
			lineCount++;
		}

		// Report a final file only if one was actually opened; an empty input
		// produces no output file at all.
		if (!filename.empty())
		{
			outFile.close();
			std::cerr << "written " << filename << std::endl;
		}
	}

	// Write the command-line help text to stderr.
	//
	// programname: executable name shown on the leading "usage:" line
	void usage(std::string programname)
	{
		// Fixed help lines that follow the "usage:" line, in print order.
		static const char *const optionHelp[] = {
			"[options]",
			"-i <fastq file>",
			"-n <number of record in each splitted file> default: 10000000",
			"-o <prefix>",
			"-z optional: gzip output"
		};
		const int helpLines = sizeof(optionHelp) / sizeof(optionHelp[0]);

		std::cerr << "usage: " << programname;
		std::cerr << " -i <fqfile> -n <# of record per file> -o <prefix> [-z]" << std::endl;
		for (int idx = 0; idx < helpLines; ++idx)
		{
			std::cerr << optionHelp[idx] << std::endl;
		}
	}

	// Entry point: parse the command line and split the input fastq file.
	//
	// Options: -i <fastq file>, -n <records per file>, -o <prefix>,
	//          -z (write gzip-compressed output)
	// Returns 1 after printing the usage text when the arguments are
	// missing or invalid, and 0 once the chosen split function has run.
	int main(int argc, char **argv){
		// Stays NULL until -i is seen, so a missing -i can be detected below
		// instead of reading an uninitialized pointer.
		char *fqFile = NULL;
		int c, recordNum = 10000000;
		int gz = 0;

		std::string programname = argv[0];
		std::string filePrefix = "";
		if (argc == 1){
			usage(programname);
			return 1;
		}

		// Silence getopt's own diagnostics; errors are reported in the '?' case.
		opterr = 0;
		while ((c = getopt(argc, argv, "i:n:o:z")) != -1){
			switch (c){
				case 'i':
					fqFile = optarg;
					break;
				case 'n':
				{
					// Accept only a complete, positive decimal integer.
					// Extraction fails on garbage or overflow, and eof() is
					// false when trailing characters remain.
					std::istringstream parser(optarg);
					int parsed = 0;
					if (!(parser >> parsed) || !parser.eof() || parsed <= 0){
						std::cerr << "option n needs a positive integer!" << std::endl;
						usage(programname);
						return 1;
					}
					recordNum = parsed;
					break;
				}
				case 'o':
					filePrefix = optarg;
					break;
				case 'z':
					gz = 1;
					break;
				case '?':
					// optopt holds the offending option character.
					if (optopt == 'n' || optopt == 'i' || optopt == 'o'){
						std::cerr << "option n, i, o need arguments!" << std::endl;
					}
					usage(programname);
					return 1;
				default:
					abort();
			}
		}

		// Both an input file and an output prefix are mandatory.
		if (filePrefix.empty() || fqFile == NULL || strcmp(fqFile, "") == 0)
		{
			usage(programname);
			return 1;
		}

		// Dispatch on the requested output format.
		if (gz == 0)
		{
			splitFastq(fqFile, filePrefix, recordNum);
		}
		else
		{
			splitFastqZip(fqFile, filePrefix, recordNum);
		}
		return 0;
	}

view raw splitFastq.cpp hosted with ❤ by GitHub