Skip to content
Open
Show file tree
Hide file tree
Changes from 4 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -39,4 +39,4 @@ fastp

# Test Output
*.json
*.html
*.html
11 changes: 4 additions & 7 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -4,8 +4,8 @@ DIR_OBJ := ./obj

PREFIX ?= /usr/local
BINDIR ?= $(PREFIX)/bin
INCLUDE_DIRS ?=
LIBRARY_DIRS ?=
INCLUDE_DIRS ?=
LIBRARY_DIRS ?=

SRC := $(wildcard ${DIR_SRC}/*.cpp)
OBJ := $(patsubst %.cpp,${DIR_OBJ}/%.o,$(notdir ${SRC}))
Expand All @@ -16,7 +16,7 @@ BIN_TARGET := ${TARGET}

CXX ?= g++
CXXFLAGS := -std=c++11 -pthread -g -O3 -MD -MP -I. -I${DIR_INC} $(foreach includedir,$(INCLUDE_DIRS),-I$(includedir)) ${CXXFLAGS}
LIBS := -lisal -ldeflate -lhwy -lpthread
LIBS := -lisal -ldeflate -lzstd -lhwy -lpthread

UNAME_S := $(shell uname -s)
ifeq ($(UNAME_S),Linux)
Expand All @@ -27,10 +27,7 @@ else
FIND_STATIC = $(firstword $(foreach d,$(LIBRARY_DIRS),$(wildcard $(d)/lib$(1).a)) $(wildcard /usr/local/lib/lib$(1).a /opt/homebrew/lib/lib$(1).a))
STATIC_LIBS :=
DYNAMIC_LIBS :=
$(foreach lib,isal deflate hwy,\
$(if $(call FIND_STATIC,$(lib)),\
$(eval STATIC_LIBS += $(call FIND_STATIC,$(lib))),\
$(eval DYNAMIC_LIBS += -l$(lib))))
$(foreach lib,isal deflate zstd hwy, $(if $(call FIND_STATIC,$(lib)), $(eval STATIC_LIBS += $(call FIND_STATIC,$(lib))), $(eval DYNAMIC_LIBS += -l$(lib))))
LD_FLAGS := $(foreach librarydir,$(LIBRARY_DIRS),-L$(librarydir)) $(STATIC_LIBS) $(DYNAMIC_LIBS) -lpthread $(LD_FLAGS)
endif

Expand Down
2 changes: 2 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -76,6 +76,7 @@ If you use fastp in your work, you can cite fastp as: *Shifu Chen. fastp 1.0: A
11. support long reads (data from PacBio / Nanopore devices).
12. support reading from STDIN and writing to STDOUT
13. support interleaved input
14. support reading Zstandard-compressed FASTQ/FASTA files (`.zst` / `.zstd`)
14. support ultra-fast FASTQ-level deduplication
15. ...

Expand Down Expand Up @@ -182,6 +183,7 @@ make -j INCLUDE_DIRS=/opt/homebrew/include LIBRARY_DIRS=/opt/homebrew/lib
* for PE data, you should also specify read2 input by `-I` or `--in2`, and specify read2 output by `-O` or `--out2`.
* if you don't specify the output file names, no output files will be written, but the QC will still be done for both data before and after filtering.
* the output will be gzip-compressed if its file name ends with `.gz`
* the input can be gzip-compressed (`.gz`) or Zstandard-compressed (`.zst`, `.zstd`); compression is auto-detected from the extension
## output to STDOUT
`fastp` supports streaming the passing-filter reads to STDOUT, so that it can be passed to other compressors like `bzip2`, or be passed to aligners like `bwa` and `bowtie2`.
* specify `--stdout` to enable this mode to stream output to STDOUT
Expand Down
121 changes: 115 additions & 6 deletions src/fastqreader.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -50,13 +50,24 @@ FastqReader::FastqReader(string filename, bool hasQuality, bool phred64){
mHasNoLineBreakAtEnd = false;
mGzipInputUsedBytes = 0;
mReadPool = NULL;
mUseZstd = false;
mZstdFinished = false;
mZstdStream = NULL;
mZstdInput.src = NULL;
mZstdInput.size = 0;
mZstdInput.pos = 0;
mZstdInputUsedBytes = 0;
init();
}

FastqReader::~FastqReader(){
close();
delete[] mFastqBuf;
delete[] mGzipInputBuffer;
if(mZstdStream){
ZSTD_freeDStream(mZstdStream);
mZstdStream = NULL;
}
}

bool FastqReader::hasNoLineBreakAtEnd() {
Expand All @@ -69,11 +80,13 @@ void FastqReader::setReadPool(ReadPool* rp) {


bool FastqReader::bufferFinished() {
if(mZipped) {
return eof() && mGzipState.avail_in == 0;
} else {
if(!mZipped)
return eof();
}

if(mUseZstd)
return mZstdFinished && mZstdInput.pos == mZstdInput.size;

return eof() && mGzipState.avail_in == 0;
}

void FastqReader::readToBufIgzip(){
Expand Down Expand Up @@ -139,10 +152,67 @@ void FastqReader::readToBufIgzip(){
}
}

void FastqReader::readToBufZstd(){
mBufDataLen = 0;
if(mZstdFinished)
return;

ZSTD_outBuffer outBuf;
outBuf.dst = mFastqBuf;
outBuf.size = mGzipOutputBufferSize;
outBuf.pos = 0;

while(outBuf.pos == 0){
if(mZstdInput.pos == mZstdInput.size){
size_t readBytes = fread(mGzipInputBuffer, 1, mGzipInputBufferSize, mFile);
if(readBytes == 0){
if(eof()){
mZstdFinished = true;
break;
} else {
error_exit("zstd: read error on file: " + mFilename);
}
}
mZstdInput.src = mGzipInputBuffer;
mZstdInput.size = readBytes;
mZstdInput.pos = 0;
mZstdInputUsedBytes += readBytes;
}

size_t ret = ZSTD_decompressStream(mZstdStream, &outBuf, &mZstdInput);
if(ZSTD_isError(ret)){
error_exit("zstd: decompression error for file: " + mFilename + ", error: " + string(ZSTD_getErrorName(ret)));
}

if(ret == 0){
if(mZstdInput.pos < mZstdInput.size || !eof()){
size_t resetRet = ZSTD_initDStream(mZstdStream);
if(ZSTD_isError(resetRet)){
error_exit("zstd: failed to reset stream for file: " + mFilename + ", error: " + string(ZSTD_getErrorName(resetRet)));
}
} else {
mZstdFinished = true;
}
}

if(eof() && mZstdInput.pos == mZstdInput.size && ret != 0){
error_exit("zstd: unexpected eof found in file: " + mFilename);
}

if(mZstdFinished || outBuf.pos > 0)
break;
}

mBufDataLen = outBuf.pos;
}

void FastqReader::readToBuf() {
mBufDataLen = 0;
if(mZipped) {
readToBufIgzip();
if(mUseZstd)
readToBufZstd();
else
readToBufIgzip();
} else {
if(!eof())
mBufDataLen = fread(mFastqBuf, 1, FQ_BUF_SIZE, mFile);
Expand Down Expand Up @@ -173,6 +243,26 @@ void FastqReader::init(){
}
mZipped = true;
}
else if (ends_with(mFilename, ".zst") || ends_with(mFilename, ".zstd")){
mFile = fopen(mFilename.c_str(), "rb");
if(mFile == NULL) {
error_exit("Failed to open file: " + mFilename);
}
mZstdStream = ZSTD_createDStream();
if(mZstdStream == NULL) {
error_exit("zstd: failed to allocate decompressor for file: " + mFilename);
}
size_t ret = ZSTD_initDStream(mZstdStream);
if(ZSTD_isError(ret)){
error_exit("zstd: failed to init decompressor for file: " + mFilename + ", error: " + string(ZSTD_getErrorName(ret)));
}
mZipped = true;
mUseZstd = true;
mZstdFinished = false;
mZstdInput.src = mGzipInputBuffer;
mZstdInput.size = 0;
mZstdInput.pos = 0;
}
else {
if(mFilename == "/dev/stdin") {
mFile = stdin;
Expand All @@ -189,7 +279,10 @@ void FastqReader::init(){

void FastqReader::getBytes(size_t& bytesRead, size_t& bytesTotal) {
if(mZipped) {
bytesRead = mGzipInputUsedBytes - mGzipState.avail_in;
if(mUseZstd)
bytesRead = mZstdInputUsedBytes - (mZstdInput.size - mZstdInput.pos);
else
bytesRead = mGzipInputUsedBytes - mGzipState.avail_in;
} else {
bytesRead = ftell(mFile);//mFile.tellg();
}
Expand Down Expand Up @@ -362,6 +455,22 @@ bool FastqReader::isZipFastq(string filename) {
return true;
else if (ends_with(filename, ".fa.gz"))
return true;
else if (ends_with(filename, ".fastq.zst"))
return true;
else if (ends_with(filename, ".fq.zst"))
return true;
else if (ends_with(filename, ".fastq.zstd"))
return true;
else if (ends_with(filename, ".fq.zstd"))
return true;
else if (ends_with(filename, ".fasta.zst"))
return true;
else if (ends_with(filename, ".fa.zst"))
return true;
else if (ends_with(filename, ".fasta.zstd"))
return true;
else if (ends_with(filename, ".fa.zstd"))
return true;
else
return false;
}
Expand Down
7 changes: 7 additions & 0 deletions src/fastqreader.h
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,7 @@ SOFTWARE.
#else
#include "igzip_lib.h"
#endif
#include <zstd.h>
#include "readpool.h"

class FastqReader{
Expand Down Expand Up @@ -65,6 +66,7 @@ class FastqReader{
void clearLineBreaks(char* line);
void readToBuf();
void readToBufIgzip();
void readToBufZstd();
bool bufferFinished();

private:
Expand All @@ -87,6 +89,11 @@ class FastqReader{
bool mHasQuality;
bool mPhred64;
ReadPool* mReadPool;
bool mUseZstd;
bool mZstdFinished;
ZSTD_DStream* mZstdStream;
ZSTD_inBuffer mZstdInput;
size_t mZstdInputUsedBytes;

};

Expand Down
Binary file added testdata/R1.fq.zst
Binary file not shown.
Binary file added testdata/R2.fq.zst
Binary file not shown.