Most of the time, reads from massively parallel sequencers (e.g. Illumina) are treated as individual records and undergo the same analytic pipeline (adaptor trimming, mapping, etc.) in parallel. Run time can be improved if high-throughput computing (HTC) resources are available. As encouraged by the HTC architecture, large files are split into smaller packages and run in a massively parallel way. In the world of genetics, a large fastq file can often be split into smaller fastq files that are then thrown to the computing nodes. In this way, splitting files becomes an important step whenever results come down from the sequencer. However, raw fastq files often come in gzip format, and the native UNIX split command cannot take gzip input or produce gzip output. Thus, I have written a C++ program to make this more effective. This program supports gzip I/O. The code is hosted with my other fastq-tools on GitHub.
usage: bin/splitFastq -i <fqfile> -n <# of record per file> -o <prefix> [-z]
[options]
-i <fastq file> can be gzipped
-n <number of record in each splitted file> default: 10000000
-o <prefix>
-z optional: gzip output
#include <string.h>
#include <unistd.h>
#include <cerrno>
#include <climits>
#include <cstdlib>
#include <cstring>
#include <fstream>
#include <iostream>
#include <sstream>
#include <string>
#include <gzstream.h>
using namespace std; | |
// Split a (possibly gzipped) fastq file into smaller files that each
// hold a fixed number of records.
// Format a chunk index for use in an output file name.
//
// Indices below 10 are left-padded with a single '0' ("00" .. "09") so
// that the generated file names have at least two digits; indices of
// 10 and above are printed unchanged ("10", "11", ..., "100").
//
// filenum: chunk index; the callers in this file only pass values >= 0.
// returns: the decimal text of filenum, zero-padded to two digits.
std::string fixfilenum(int filenum)
{
    std::ostringstream convert;
    // Pad only single-digit indices; 10 already has two digits
    // (the previous `> 10` test turned it into "010").
    if (filenum < 10)
    {
        convert << '0';
    }
    convert << filenum;
    return convert.str();
}
void splitFastq(char *fqFile, string filePrefix, int recordNum) | |
{ | |
// open fastq file for kseq parsing | |
cerr << "From " << fqFile << "...." << endl; | |
cerr << "Splitting " << recordNum << " records per file" << endl; | |
int maxLine = recordNum * 4; | |
int lineCount = 0, filenum = 0; | |
string filename; | |
igzstream in(fqFile); | |
ofstream outFile; | |
for (string line; getline(in,line);) | |
{ | |
if (lineCount == 0) | |
{ | |
filename = filePrefix + "_" + fixfilenum(filenum) + ".fastq"; | |
outFile.open(filename.c_str()); | |
outFile << line << '\n'; | |
} | |
else if (lineCount == maxLine) | |
{ | |
outFile.close(); | |
cerr << "written " << filename << endl; | |
lineCount = 0; | |
filenum ++; | |
filename = filePrefix + "_" + fixfilenum(filenum) + ".fastq"; | |
outFile.open(filename.c_str()); | |
outFile << line << '\n'; | |
} | |
else | |
{ | |
outFile << line << '\n'; | |
} | |
lineCount ++; | |
} | |
outFile.close(); | |
cerr << "written " << filename << endl; | |
} | |
void splitFastqZip(char *fqFile, string filePrefix, int recordNum) | |
{ | |
// open fastq file for kseq parsing | |
cerr << "From " << fqFile << "...." << endl; | |
cerr << "Splitting " << recordNum << " records per file" << endl; | |
int maxLine = recordNum * 4; | |
int lineCount = 0, filenum = 0; | |
string filename; | |
igzstream in(fqFile); | |
ogzstream outFile; | |
for (string line; getline(in,line);) | |
{ | |
if (lineCount == 0) | |
{ | |
filename = filePrefix + "_" + fixfilenum(filenum) + ".fastq.gz"; | |
outFile.open(filename.c_str()); | |
outFile << line << '\n'; | |
} | |
else if (lineCount == maxLine) | |
{ | |
outFile.close(); | |
cerr << "written " << filename << endl; | |
lineCount = 0; | |
filenum ++; | |
filename = filePrefix + "_" + fixfilenum(filenum) + ".fastq.gz"; | |
outFile.open(filename.c_str()); | |
outFile << line << '\n'; | |
} | |
else | |
{ | |
outFile << line << '\n'; | |
} | |
lineCount ++; | |
} | |
outFile.close(); | |
cerr << "written " << filename << endl; | |
} | |
// print usage | |
// Write the command-line help text to stderr.
//
// programname: name shown in the "usage:" line (main passes argv[0]).
void usage(std::string programname)
{
    // Fixed lines that follow the "usage:" synopsis, one per output line.
    static const char *const kOptionLines[] = {
        "[options]",
        "-i <fastq file>",
        "-n <number of record in each splitted file> default: 10000000",
        "-o <prefix>",
        "-z optional: gzip output"
    };
    const std::size_t lineTotal = sizeof(kOptionLines) / sizeof(kOptionLines[0]);

    std::cerr << "usage: " << programname
              << " -i <fqfile> -n <# of record per file> -o <prefix> [-z]"
              << std::endl;
    for (std::size_t idx = 0; idx < lineTotal; ++idx)
    {
        std::cerr << kOptionLines[idx] << std::endl;
    }
}
// main function | |
int main(int argc, char **argv){ | |
char *fqFile; | |
int c, recordNum = 10000000; | |
int gz = 0; | |
string programname = argv[0]; | |
string filePrefix = ""; | |
if (argc == 1){ | |
usage(programname); | |
return 1; | |
} | |
opterr = 0; | |
// print usage if not enough argumnets | |
while ((c = getopt(argc, argv, "i:n:o:z")) != -1){ | |
switch (c){ | |
case 'i': | |
fqFile = optarg; | |
break; | |
case 'n': | |
recordNum = atoi(optarg); | |
break; | |
case 'o': | |
filePrefix = optarg; | |
break; | |
case 'z': | |
gz = 1; | |
break; | |
case '?': | |
if (optopt == 'n' || optopt == 'i' || optopt== 'o'){ | |
cerr << "option n, i, p need arguments!" << endl; | |
usage(programname); | |
} | |
else { | |
usage(programname); | |
} | |
return 1; | |
default: | |
abort(); | |
} | |
} | |
if (filePrefix == "" || strcmp(fqFile,"") == 0) | |
{ | |
usage(programname); | |
return 1; | |
} | |
// pass variable to fnuction | |
if (gz == 0) | |
{ | |
splitFastq(fqFile, filePrefix, recordNum); | |
} | |
else | |
{ | |
splitFastqZip(fqFile, filePrefix, recordNum); | |
} | |
return 0; | |
} |