diff --git a/.gitignore b/.gitignore index 54264884e..8acc0f525 100644 --- a/.gitignore +++ b/.gitignore @@ -12,6 +12,8 @@ src/utils/version/version_git.h .cproject nbproject sandbox +.directory +.vscode/* test/general/o test/genomecov/chip.bam test/genomecov/one_block.bam diff --git a/src/genomeCoverageBed/genomeCoverageBed.cpp b/src/genomeCoverageBed/genomeCoverageBed.cpp index 0b6a3464f..e5deedd91 100644 --- a/src/genomeCoverageBed/genomeCoverageBed.cpp +++ b/src/genomeCoverageBed/genomeCoverageBed.cpp @@ -22,7 +22,8 @@ BedGenomeCoverage::BedGenomeCoverage(string bedFile, string genomeFile, bool only_5p_end, bool only_3p_end, bool pair_chip, bool haveSize, int fragmentSize, bool dUTP, bool eachBaseZeroBased, - bool add_gb_track_line, string gb_track_line_opts) { + bool add_gb_track_line, string gb_track_line_opts, + int extensionSize, bool tn5) { _bedFile = bedFile; _genomeFile = genomeFile; @@ -45,6 +46,8 @@ BedGenomeCoverage::BedGenomeCoverage(string bedFile, string genomeFile, _dUTP = dUTP; _add_gb_track_line = add_gb_track_line; _gb_track_line_opts = gb_track_line_opts; + _extensionSize = extensionSize; + _tn5 = tn5; _currChromName = ""; _currChromSize = 0 ; @@ -141,14 +144,47 @@ void BedGenomeCoverage::AddCoverage(CHRPOS start, CHRPOS end) { } -void BedGenomeCoverage::AddBlockedCoverage(const vector &bedBlocks) { +void BedGenomeCoverage::AddBlockedCoverage(const vector &bedBlocks,string strand) { vector::const_iterator bedItr = bedBlocks.begin(); vector::const_iterator bedEnd = bedBlocks.end(); + bool isEmpty=(bedItr==bedEnd); + bool isFirst=true; + int pos_start=0; + int pos_end=0; for (; bedItr != bedEnd; ++bedItr) { + // I need to Add the Coverage of the previous step as the final step has + // additional modifications + if(!isFirst){ + if (pos_start<0) { + AddCoverage(0,pos_end); + } + else + AddCoverage(pos_start,pos_end); + } // the end - 1 must be done because BamAncillary::getBamBlocks // returns ends uncorrected for the genomeCoverageBed data structure. // ugly, but necessary. - AddCoverage(bedItr->start, bedItr->end - 1); + pos_start=bedItr->start; + pos_end=bedItr->end - 1; + if (isFirst) { + if (_tn5 && (strand=="+")){ + pos_start = pos_start+4; + } + pos_start = pos_start - _extensionSize; + isFirst=false; + } + } + if (!isEmpty){ + // I modify the last block + if (_tn5 && (strand=="-")){ + pos_end = pos_end-5; + } + pos_end = pos_end + _extensionSize; + if (pos_start<0) { + AddCoverage(0,pos_end); + } + else + AddCoverage(pos_start,pos_end); } } @@ -189,18 +225,41 @@ void BedGenomeCoverage::CoverageBed() { if (_obeySplits == true) { bedVector bedBlocks; // vec to store the discrete BED "blocks" GetBedBlocks(a, bedBlocks); - AddBlockedCoverage(bedBlocks); + AddBlockedCoverage(bedBlocks,a.strand); } else if (_only_5p_end) { CHRPOS pos = ( a.strand=="+" ) ? a.start : a.end-1; - AddCoverage(pos,pos); + if (_tn5) { + pos = ( a.strand=="+" ) ? pos+4 : pos-5; + } + if ( pos<_extensionSize ) { //sometimes extensionSize is bigger :( + AddCoverage(0, pos+_extensionSize); + } + else { + AddCoverage(pos-_extensionSize, pos+_extensionSize ); + } } else if (_only_3p_end) { CHRPOS pos = ( a.strand=="-" ) ? a.start : a.end-1; - AddCoverage(pos,pos); + if ( pos<_extensionSize ) { //sometimes extensionSize is bigger :( + AddCoverage(0, pos+_extensionSize); + } + else { + AddCoverage(pos-_extensionSize, pos+_extensionSize ); + } } else { - AddCoverage(a.start, a.end-1); + CHRPOS pos_start=a.start; + CHRPOS pos_end=a.end-1; + if (_tn5) { + pos_start = ( a.strand=="+" ) ? pos_start+4 : pos_start; + pos_end = ( a.strand=="-" ) ? pos_end-5 : pos_end; + } + if ( pos_start<_extensionSize ) { + AddCoverage(0,pos_end+_extensionSize); + } + else + AddCoverage(pos_start-_extensionSize,pos_end+_extensionSize); } } } @@ -326,20 +385,52 @@ void BedGenomeCoverage::CoverageBam(string bamFile) { } else */ if (bam.IsFirstMate() && bam.IsReverseStrand()) { //prolong to the mate to the left - AddCoverage(bam.MatePosition, end); + int pos_start=bam.MatePosition; + int pos_end=end; + if (_tn5) { + pos_start = pos_start+4; + pos_end = pos_end-5; + } + if ( pos_start<_extensionSize ) { + AddCoverage(0,pos_end+_extensionSize); + } + else + AddCoverage(pos_start-_extensionSize,pos_end+_extensionSize); } else if (bam.IsFirstMate() && bam.IsMateReverseStrand()) { //prolong to the mate to the right - AddCoverage(start, start + abs(bam.InsertSize) - 1); + int pos_start=start; + int pos_end=start + abs(bam.InsertSize) - 1; + if (_tn5) { + pos_start = pos_start+4; + pos_end = pos_end-5; + } + if ( pos_start<_extensionSize ) { + AddCoverage(0,pos_end+_extensionSize); + } + else + AddCoverage(pos_start-_extensionSize,pos_end+_extensionSize); } } else if (_haveSize) { if(bam.IsReverseStrand()) { - if(end<_fragmentSize) { //sometimes fragmentSize is bigger :( - AddCoverage(0, end); + int pos=end; + if (_tn5){ + pos=pos-5; + } + if(pos<(_fragmentSize+_extensionSize)) { //sometimes fragmentSize is bigger :( + AddCoverage(0, pos); } else { - AddCoverage(end + 1 - _fragmentSize, end ); + AddCoverage(pos + 1 - _fragmentSize - _extensionSize, pos + _extensionSize); } } else { - AddCoverage(start,start+_fragmentSize - 1); + int pos=start; + if (_tn5){ + pos=pos+4; + } + if(pos<_extensionSize){ + AddCoverage(0,pos+_fragmentSize - 1+_extensionSize); + } + else + AddCoverage(pos-_extensionSize,pos+_fragmentSize - 1+_extensionSize); } } else // add coverage accordingly. @@ -353,15 +444,29 @@ void BedGenomeCoverage::CoverageBam(string bamFile) { else { // "D" true, "N" false GetBamBlocks(bam, refs.at(bam.RefID).RefName, bedBlocks, true, false); } - AddBlockedCoverage(bedBlocks); + string readStrand = ( !bam.IsReverseStrand() ) ? "+" : "-"; + AddBlockedCoverage(bedBlocks, readStrand); } else if (_only_5p_end) { CHRPOS pos = ( !bam.IsReverseStrand() ) ? start : end; - AddCoverage(pos,pos); + if (_tn5) { + pos = ( !bam.IsReverseStrand() ) ? pos+4 : pos-5; + } + if ( pos<_extensionSize ) { //sometimes extensionSize is bigger :( + AddCoverage(0, pos+_extensionSize); + } + else { + AddCoverage(pos-_extensionSize, pos+_extensionSize ); + } } else if (_only_3p_end) { CHRPOS pos = ( bam.IsReverseStrand() ) ? start : end; - AddCoverage(pos,pos); + if ( pos<_extensionSize ) { //sometimes extensionSize is bigger :( + AddCoverage(0, pos+_extensionSize); + } + else { + AddCoverage(pos-_extensionSize, pos+_extensionSize ); + } } } // close the BAM diff --git a/src/genomeCoverageBed/genomeCoverageBed.h b/src/genomeCoverageBed/genomeCoverageBed.h index 723883c26..c10c8dd9c 100644 --- a/src/genomeCoverageBed/genomeCoverageBed.h +++ b/src/genomeCoverageBed/genomeCoverageBed.h @@ -50,7 +50,8 @@ class BedGenomeCoverage { bool only_5p_end, bool only_3p_end, bool pair_chip,bool haveSize, int fragmentSize, bool dUTP, bool eachBaseZeroBased, - bool add_gb_track_line, string gb_track_line_opts); + bool add_gb_track_line, string gb_track_line_opts, + int extensionSize, bool tn5); // destructor ~BedGenomeCoverage(void); @@ -79,6 +80,8 @@ class BedGenomeCoverage { bool _add_gb_track_line; string _gb_track_line_opts; string _requestedStrand; + int _extensionSize; + bool _tn5; BedFile *_bed; NewGenomeFile *_genome; @@ -102,7 +105,7 @@ class BedGenomeCoverage { void ResetChromCoverage(); void StartNewChrom (const string& chrom); void AddCoverage (CHRPOS start, CHRPOS end); - void AddBlockedCoverage(const vector &bedBlocks); + void AddBlockedCoverage(const vector &bedBlocks, string strand); void PrintFinalCoverage(); void PrintEmptyChromosomes(); void PrintTrackDefinitionLine(); diff --git a/src/genomeCoverageBed/genomeCoverageMain.cpp b/src/genomeCoverageBed/genomeCoverageMain.cpp index 942d6ef59..81a1226bf 100644 --- a/src/genomeCoverageBed/genomeCoverageMain.cpp +++ b/src/genomeCoverageBed/genomeCoverageMain.cpp @@ -33,6 +33,7 @@ int genomecoverage_main(int argc, char* argv[]) { string bedFile; string genomeFile; int max = INT_MAX; + int extensionSize = 0; float scale = 1.0; float fragmentSize = 146; //Nucleosome :) @@ -53,6 +54,7 @@ int genomecoverage_main(int argc, char* argv[]) { bool only_5p_end = false; bool only_3p_end = false; bool add_gb_track_line = false; + bool tn5 = false; string gb_track_opts; string requestedStrand = "X"; @@ -173,6 +175,18 @@ int genomecoverage_main(int argc, char* argv[]) { showHelp = true; } } + else if(PARAMETER_CHECK("-ext", 4, parameterLength)) { + if ((i+1) < argc) { + extensionSize = atoi(argv[i+1]); + i++; + } else { + cerr << "*****ERROR: -ext options requires an integer value" << endl; + showHelp = true; + } + } + else if(PARAMETER_CHECK("-tn5", 4, parameterLength)) { + tn5 = true; + } else { cerr << endl << "*****ERROR: Unrecognized parameter: " << argv[i] << " *****" << endl << endl; showHelp = true; @@ -207,6 +221,16 @@ int genomecoverage_main(int argc, char* argv[]) { showHelp = true; } + /*if ( tn5 && obeySplits) { + cerr << endl << "*****" << endl << "*****ERROR: Use -split can't be used with -tn5." << endl << "*****" << endl; + showHelp = true; + } + + if ( (extensionSize>0) && obeySplits) { + cerr << endl << "*****" << endl << "*****ERROR: Use -split can't be used with -ext." << endl << "*****" << endl; + showHelp = true; + }*/ + if (add_gb_track_line && !(bedGraph||bedGraphAll)) { cerr << endl << "*****" << endl << "*****ERROR: Using -trackline requires bedGraph output (use -bg or -bga)." << endl << "*****" << endl; showHelp = true; @@ -225,7 +249,8 @@ int genomecoverage_main(int argc, char* argv[]) { only_5p_end, only_3p_end, pair_chip, haveSize, fragmentSize, dUTP, eachBaseZeroBased, - add_gb_track_line, gb_track_opts); + add_gb_track_line, gb_track_opts, + extensionSize, tn5); delete bc; } else { @@ -280,7 +305,7 @@ void genomecoverage_help(void) { cerr << "\t-fs\t\t" << "Force to use provided fragment size instead of read length" << endl; cerr << "\t\t\tWorks for BAM files only" << endl; - cerr << "\t-du\t\t" << "Change strand af the mate read (so both reads from the same strand) useful for strand specific" << endl; + cerr << "\t-du\t\t" << "Change strand of the mate read (so both reads from the same strand) useful for strand specific" << endl; cerr << "\t\t\tWorks for BAM files only" << endl; cerr << "\t-5\t\t" << "Calculate coverage of 5\" positions (instead of entire interval)." << endl << endl; @@ -303,7 +328,8 @@ void genomecoverage_help(void) { cerr <<"\t\t\t http://genome.ucsc.edu/goldenPath/help/bedgraph.html" << endl; cerr <<"\t\t\t- NOTE: When adding a trackline definition, the output BedGraph can be easily" << endl; cerr <<"\t\t\t uploaded to the Genome Browser as a custom track," << endl; - cerr <<"\t\t\t BUT CAN NOT be converted into a BigWig file (w/o removing the first line)." << endl << endl; + //cerr <<"\t\t\t BUT CAN NOT be converted into a BigWig file (w/o removing the first line)." << endl << endl; + // With v 4 there is no issue. cerr << "\t-trackopts\t"<<"Writes additional track line definition parameters in the first line." << endl; cerr <<"\t\t\t- Example:" << endl; @@ -311,6 +337,16 @@ void genomecoverage_help(void) { cerr <<"\t\t\t Note the use of single-quotes if you have spaces in your parameters." << endl; cerr <<"\t\t\t- (TEXT)" << endl << endl; + cerr << "\t-ext\t\t"<<"Extends the coverage in both directions of the desired number of bases." << endl; + cerr << "\t\t\tUseful when you have very sparse data like when you use -5 or -3." << endl; + cerr << "\t\t\t- Default is 0; i.e., no extension." << endl; + cerr << "\t\t\t- (INT)" << endl << endl; + + cerr << "\t-tn5\t\t"<<"Shifts the 5' to match the insertion site of the Tn5." << endl; + cerr << "\t\t\tIt will shift the 5' extremity of 5bp to the left for reverse strand" << endl; + cerr << "\t\t\tand 4bp to the right for forward strand." << endl; + cerr << "\t\t\tUseful when you are working with ATAC-seq data and you want to see the footprint." << endl; + cerr << "Notes: " << endl; cerr << "\t(1) The genome file should tab delimited and structured as follows:" << endl; cerr << "\t " << endl << endl; diff --git a/test/genomecov/test-genomecov.sh b/test/genomecov/test-genomecov.sh index 15ec8a292..a011a2030 100755 --- a/test/genomecov/test-genomecov.sh +++ b/test/genomecov/test-genomecov.sh @@ -260,8 +260,6 @@ $BT genomecov -ibam chip.bam -bg -fs 100 > obs check obs exp rm obs exp -rm one_block.bam two_blocks.bam three_blocks.bam sam-w-del.bam pair-chip.bam chip.bam - ################################################################## # Make sure empty bam doesn't cause failure ################################################################## @@ -286,4 +284,29 @@ CRAM_REFERENCE=test_ref.fa $BT genomecov -ibam empty.cram > obs check obs exp rm obs exp +################################################################## +# Test chip with tn5 +################################################################## +echo -e " genomecov.t18...\c" +echo \ +"chr1 5 76 1 +chr1 225 295 1" > exp +$BT genomecov -ibam chip.bam -bg -tn5 > obs +check obs exp +rm obs exp + +################################################################## +# Test chip with ext +################################################################## +echo -e " genomecov.t19...\c" +echo \ +"chr1 0 86 1 +chr1 215 310 1" > exp +$BT genomecov -ibam chip.bam -bg -ext 10 > obs +check obs exp +rm obs exp + + +rm one_block.bam two_blocks.bam three_blocks.bam sam-w-del.bam pair-chip.bam chip.bam + [[ $FAILURES -eq 0 ]] || exit 1;