Issues #535, #1350: fixed a long-standing problem that resulted in a seg-fault when mapping to the rabbit genome. Issue #1223: fixed the N_unmapped value reported in ReadsPerGene.out.tab. The single-end (i.e. partially mapped) alignments are not excluded from N_unmapped. dev_EoI_2.7.9a_2021-09-30

alexdobin · alexdobin · commit 12beb0c52367 · 2021-09-30T15:12:27.000-04:00
diff --git a/CHANGES.md b/CHANGES.md
@@ -1,3 +1,5 @@
+* Issue #1223: fixed the N_unmapped value reported in ReadsPerGene.out.tab. The single-end (i.e. partially mapped) alignments are not excluded from N_unmapped.
+* Issues #535, #1350: fixed a long-standing problem that resulted in a seg-fault when mapping to the rabbit genome.
 * Issue #1316: fixed the seg-fault which occurred if --soloType CB_samTagOut and --soloCBwhitelist None are used together.
 * Changed Solo summary statistics outputs in Barcodes.stats and Features.stats files.
 * Implemented --soloCBmatchWLtype ParseBio_ED3 to allow multiple mismatches and one insertion+deletion for --soloType CB_UMI_Complex.
diff --git a/bin/Linux_x86_64/STAR b/bin/Linux_x86_64/STAR
diff --git a/bin/Linux_x86_64/STARlong b/bin/Linux_x86_64/STARlong
diff --git a/bin/Linux_x86_64_static/STAR b/bin/Linux_x86_64_static/STAR
diff --git a/bin/Linux_x86_64_static/STARlong b/bin/Linux_x86_64_static/STARlong
diff --git a/extras/doc-latex/parametersDefault.tex b/extras/doc-latex/parametersDefault.tex
@@ -18,6 +18,13 @@
 \optName{runMode}
   \optValue{alignReads}
   \optLine{string: type of the run.} 
+\begin{optOptTable}
+  \optOpt{alignReads}   \optOptLine{map reads}
+  \optOpt{genomeGenerate}   \optOptLine{generate genome files}
+  \optOpt{inputAlignmentsFromBAM}   \optOptLine{input alignments from BAM. Presently only works with --outWigType and --bamRemoveDuplicates options.}
+  \optOpt{liftOver}   \optOptLine{lift-over of GTF files (--sjdbGTFfile) between genome assemblies using chain file(s) from --genomeChainFiles.}
+  \optOpt{soloCellFiltering  {\textless}/path/to/raw/count/dir/{\textgreater}   {\textless}/path/to/output/prefix{\textgreater}}   \optOptLine{STARsolo cell filtering ("calling") without remapping, followed by the path to raw count directory and output (filtered) prefix}
+\end{optOptTable}
 \optName{runThreadN}
   \optValue{1}
   \optLine{int: number of threads to run STAR} 
@@ -357,7 +364,8 @@
   \optLine{***STARsolo:} 
 \begin{optOptTable}
   \optOpt{CR CY UR UY}   \optOptLine{sequences and quality scores of cell barcodes and UMIs for the solo* demultiplexing.}
-  \optOpt{GX GN}   \optOptLine{gene ID and gene name.}
+  \optOpt{GX GN}   \optOptLine{gene ID and gene name for unique-gene reads.}
+  \optOpt{gx gn}   \optOptLine{gene IDs and gene names for unique- and multi-gene reads.}
   \optOpt{CB UB}   \optOptLine{error-corrected cell barcodes and UMIs for solo* demultiplexing. Requires --outSAMtype BAM SortedByCoordinate.}
   \optOpt{sM}   \optOptLine{assessment of CB and UMI.}
   \optOpt{sS}   \optOptLine{sequence of the entire barcode (CB,UMI,adapter).}
@@ -879,7 +887,7 @@
   \optLine{string(s): type of single-cell RNA-seq} 
 \begin{optOptTable}
   \optOpt{CB{\textunderscore}UMI{\textunderscore}Simple}   \optOptLine{(a.k.a. Droplet) one UMI and one Cell Barcode of fixed length in read2, e.g. Drop-seq and 10X Chromium.}
-  \optOpt{CB{\textunderscore}UMI{\textunderscore}Complex}   \optOptLine{one UMI of fixed length, but multiple Cell Barcodes of varying length, as well as adapters sequences are allowed in read2 only, e.g. inDrop.}
+  \optOpt{CB{\textunderscore}UMI{\textunderscore}Complex}   \optOptLine{multiple Cell Barcodes of varying length, one UMI of fixed length and one adapter sequence of fixed length are allowed in read2 only (e.g. inDrop, ddSeq).}
   \optOpt{CB{\textunderscore}samTagOut}   \optOptLine{output Cell Barcode as CR and/or CB SAM tag. No UMI counting. --readFilesIn cDNA{\textunderscore}read1 [cDNA{\textunderscore}read2 if paired-end] CellBarcode{\textunderscore}read . Requires --outSAMtype BAM Unsorted [and/or SortedByCoordinate]}
   \optOpt{SmartSeq}   \optOptLine{Smart-seq: each cell in a separate FASTQ (paired- or single-end), barcodes are corresponding read-groups, no UMI sequences, alignments deduplicated according to alignment start and end (after extending soft-clipped bases)}
 \end{optOptTable}
@@ -933,7 +941,7 @@
   \optLine{--soloCBposition  3{\textunderscore}9{\textunderscore}3{\textunderscore}14} 
 \optName{soloAdapterSequence}
   \optValue{-}
-  \optLine{string:                 adapter sequence to anchor barcodes.} 
+  \optLine{string:                 adapter sequence to anchor barcodes. Only one adapter sequence is allowed.} 
 \optName{soloAdapterMismatchesNmax}
   \optValue{1}
   \optLine{int{\textgreater}0:                  maximum number of mismatches allowed in adapter sequence.} 
@@ -949,6 +957,7 @@
 \begin{optOptTable}
   \optOpt{1MM{\textunderscore}multi{\textunderscore}pseudocounts}   \optOptLine{same as 1MM{\textunderscore}Multi, but pseudocounts of 1 are added to all whitelist barcodes.}
   \optOpt{1MM{\textunderscore}multi{\textunderscore}Nbase{\textunderscore}pseudocounts}   \optOptLine{same as 1MM{\textunderscore}multi{\textunderscore}pseudocounts, multimatching to WL is allowed for CBs with N-bases. This option matches best with CellRanger {\textgreater}= 3.0.0}
+  \optOpt{ParseBio{\textunderscore}ED3}   \optOptLine{allow up to edit distance of 3 for each of the barcodes. May include one deletion + one insertion. Only works with --soloType CB{\textunderscore}UMI{\textunderscore}Complex. Matches to multiple passlist barcodes are not allowed. Similar to ParseBio Split-seq pipeline.}
 \end{optOptTable}
 \optName{soloInputSAMattrBarcodeSeq}
   \optValue{-}
@@ -974,16 +983,19 @@
 \begin{optOptTable}
   \optOpt{Gene}   \optOptLine{genes: reads match the gene transcript}
   \optOpt{SJ}   \optOptLine{splice junctions: reported in SJ.out.tab}
-  \optOpt{GeneFull}   \optOptLine{full genes: count all reads overlapping genes' exons and introns}
+  \optOpt{GeneFull}   \optOptLine{full gene (pre-mRNA): count all reads overlapping genes' exons and introns}
+  \optOpt{GeneFull{\textunderscore}ExonOverIntron}   \optOptLine{full gene (pre-mRNA): count all reads overlapping genes' exons and introns: prioritize 100\% overlap with exons}
+  \optOpt{GeneFull{\textunderscore}Ex50pAS}   \optOptLine{full gene (pre-mRNA): count all reads overlapping genes' exons and introns: prioritize {\textgreater}50\% overlap with exons. Do not count reads with 100\% exonic overlap in the antisense direction.}
 \end{optOptTable}
 \optName{soloMultiMappers}
   \optValue{Unique}
   \optLine{string(s): counting method for reads mapping to multiple genes           } 
 \begin{optOptTable}
   \optOpt{Unique}   \optOptLine{count only reads that map to unique genes}
   \optOpt{Uniform}   \optOptLine{uniformly distribute multi-genic UMIs to all genes}
-  \optOpt{Rescue}   \optOptLine{distribute UMIs proportionally to unique+uniform counts (~ first iteartion of EM)}
+  \optOpt{Rescue}   \optOptLine{distribute UMIs proportionally to unique+uniform counts ({\textasciitilde} first iteration of EM)}
   \optOpt{PropUnique}   \optOptLine{distribute UMIs proportionally to unique mappers, if present, and uniformly if not.}
+  \optOpt{EM}   \optOptLine{multi-gene UMIs are distributed using Expectation Maximization algorithm}
 \end{optOptTable}
 \optName{soloUMIdedup}
   \optValue{1MM{\textunderscore}All}
diff --git a/source/ReadAlign_alignBAM.cpp b/source/ReadAlign_alignBAM.cpp
@@ -99,8 +99,10 @@ int ReadAlign::alignBAM(Transcript const &trOut, uint nTrOut, uint iTrOut, uint
     int32 gxgnGene = -1; //gene to output in GX/GN
     if (P.outSAMattrPresent.GX || P.outSAMattrPresent.GN) {
         auto annFeat = readAnnot.annotFeatures[P.pSolo.samAttrFeature];
-        if (annFeat.fSet.size()==1 && annFeat.fAlign[iTrOut].size()==1)
+        if (annFeat.fSet.size()==1 && iTrOut < annFeat.fAlign.size() && annFeat.fAlign[iTrOut].size()==1) {
+            //                        2nd condition needed when this function is called to output transcriptomic alignments, where GX/GN are not allowed, and iTrOut can be large
             gxgnGene = *annFeat.fAlign[iTrOut].begin();
+        };
     };
 
     for (uint imate=0;imate < (alignType<0 ? nMates:P.readNmates);imate++) { //not readNends: this is about alignments
diff --git a/source/ReadAlign_maxMappableLength2strands.cpp b/source/ReadAlign_maxMappableLength2strands.cpp
@@ -49,27 +49,35 @@ uint ReadAlign::maxMappableLength2strands(uint pieceStartIn, uint pieceLengthIn,
         };
 
         // define upper bound for suffix array range search.
+        bool iSA2good = true;
         if (mapGen.genomeSAindexStart[Lind-1]+ind1+1 < mapGen.genomeSAindexStart[Lind]) {//we are not at the end of the SA
-            iSA2=((mapGen.SAi[mapGen.genomeSAindexStart[Lind-1]+ind1+1] & mapGen.SAiMarkNmask) & mapGen.SAiMarkAbsentMask) - 1;
+            iSA2 = mapGen.SAi[mapGen.genomeSAindexStart[Lind-1]+ind1+1];
+            if ( (iSA2 & mapGen.SAiMarkAbsentMaskC) == 0) {
+                iSA2 = (iSA2 & mapGen.SAiMarkNmask) - 1;
+            } else {
+                iSA2 = mapGen.nSA-1; //safe, but can probably do better
+                iSA2good = false;
+            };
         } else {
             iSA2=mapGen.nSA-1;
+            iSA2good = false;
         };
 
-
     //#define SA_SEARCH_FULL
 
     #ifdef SA_SEARCH_FULL
         //full search of the array even if the index search gave maxL
         maxL=0;
         Nrep = maxMappableLength(mapGen, Read1, pieceStart, pieceLength, iSA1 & mapGen.SAiMarkNmask, iSA2, dirR, maxL, indStartEnd);
     #else
-        if (Lind < P.pGe.gSAindexNbases && (iSA1 & mapGen.SAiMarkNmaskC)==0 ) {//no need for SA search
+        bool iSA1noN = (iSA1 & mapGen.SAiMarkNmaskC)==0;
+        if (Lind < P.pGe.gSAindexNbases && iSA1noN && iSA2good) {//no need for SA search
             // very short seq, already found hits in suffix array w/o having to search the genome for extensions.
             indStartEnd[0]=iSA1;
             indStartEnd[1]=iSA2;
             Nrep=indStartEnd[1]-indStartEnd[0]+1;
             maxL=Lind;
-        } else if (iSA1==iSA2) {//unique align already, just find maxL
+        } else if (iSA1==iSA2 && iSA1noN && iSA2good) {//unique align already, just find maxL
             if ((iSA1 & mapGen.SAiMarkNmaskC)!=0) {
                 ostringstream errOut;
                 errOut  << "BUG: in ReadAlign::maxMappableLength2strands";
@@ -79,12 +87,12 @@ uint ReadAlign::maxMappableLength2strands(uint pieceStartIn, uint pieceLengthIn,
             Nrep=1;
             bool comparRes;
             maxL=compareSeqToGenome(mapGen, Read1, pieceStart, pieceLength, Lind, iSA1, dirR, comparRes);
-        } else {//SA search, pieceLength>maxL
-            if ( (iSA1 & mapGen.SAiMarkNmaskC)==0 ) {//no N in the prefix
-                maxL=Lind;
+        } else {//need SA search, pieceLength>maxL
+            if (iSA2good && iSA1noN) {
+                maxL = Lind; //Lind bases were already matched
             } else {
                 maxL=0;
-            };
+            };        
             Nrep = maxMappableLength(mapGen, Read1, pieceStart, pieceLength, iSA1 & mapGen.SAiMarkNmask, iSA2, dirR, maxL, indStartEnd);
         };
     #endif
diff --git a/source/SoloReadBarcodeStats.h b/source/SoloReadBarcodeStats.h
@@ -5,11 +5,11 @@
 class SoloReadBarcodeStats {
 public:
     vector<string> names;
-    enum {      noNoAdapter,  noNoUMI,    nNoCB,   noNinCB,   noNinUMI,   noUMIhomopolymer,  noNoWLmatch,   noTooManyMM,   noTooManyWLmatches,   yesWLmatchExact,   yesWLmatchWithMM,  nStats};
+    enum {      noNoAdapter,  noNoUMI,    noNoCB,   noNinCB,   noNinUMI,   noUMIhomopolymer,  noNoWLmatch,   noTooManyMM,   noTooManyWLmatches,   yesWLmatchExact,   yesOneWLmatchWithMM,   yesMultWLmatchWithMM, nStats};
     uint64 V[nStats];    
     SoloReadBarcodeStats() 
     {
-        names={"noNoAdapter", "noNoUMI", "nNoCB", "noNinCB", "noNinUMI", "noUMIhomopolymer","noNoWLmatch", "noTooManyMM", "noTooManyWLmatches", "yesWLmatchExact", "yesWLmatchWithMM"};
+        names={"noNoAdapter", "noNoUMI", "noNoCB", "noNinCB", "noNinUMI", "noUMIhomopolymer","noNoWLmatch", "noTooManyMM", "noTooManyWLmatches", "yesWLmatchExact", "yesOneWLmatchWithMM", "yesMultWLmatchWithMM"};
         for (uint32 ii=0; ii<nStats; ii++)
             V[ii]=0;
     };
diff --git a/source/SoloReadBarcode_getCBandUMI.cpp b/source/SoloReadBarcode_getCBandUMI.cpp
@@ -82,7 +82,6 @@ void SoloReadBarcode::matchCBtoWL(string &cbSeq1, string &cbQual1, vector<uint64
     } else if (cbMatch1==1) {//1 match, no need to record the quality
         cbMatchString1 = to_string(cbMatchInd1[0]);
     } else if (!pSolo.CBmatchWL.mm1_multi) {//>1 matches, but this is not allowed
-        //stats.V[stats.noTooManyWLmatches]++;
         cbMatch1=-3;
         cbMatchInd1.clear();
         cbMatchString1="";
@@ -93,20 +92,18 @@ void SoloReadBarcode::addStats(const int32 cbMatch1)
 {
     if (!pSolo.cbWLyes) //no stats if no WL
         return;
-    
-    if (cbMatch1>1) {
-        stats.V[stats.noTooManyWLmatches]++;
-        return;
-    };
-    
+       
     switch (cbMatch1) {
-        case 0:
+        case 0://exact matches
             cbReadCountExact[cbMatchInd[0]]++;//note that this simply counts reads per exact CB, no checks of genes or UMIs
             stats.V[stats.yesWLmatchExact]++;
             break;
-        case 1:
-            stats.V[stats.yesWLmatchWithMM]++;
+        case 1: //one WL match counted here, but they may still get rejected in SoloReadFeature_inputRecords.cpp
+            stats.V[stats.yesOneWLmatchWithMM]++;
             break;
+        default: //multiple WL matches are counted here, but they may still get rejected in SoloReadFeature_inputRecords.cpp
+            stats.V[stats.yesMultWLmatchWithMM]++;
+            break;            
         case -1 :
             stats.V[stats.noNoWLmatch]++;
             break;
@@ -117,7 +114,7 @@ void SoloReadBarcode::addStats(const int32 cbMatch1)
             stats.V[stats.noTooManyWLmatches]++;
             break;
         case -11 :            
-            stats.V[stats.nNoCB]++;//CB sequence cannot be extracted
+            stats.V[stats.noNoCB]++;//CB sequence cannot be extracted
             break;
         case -12 :
             stats.V[stats.noTooManyMM]++;            
diff --git a/source/SoloReadFeatureStats.h b/source/SoloReadFeatureStats.h
@@ -5,18 +5,18 @@
 class SoloReadFeatureStats {
 public:
     vector<string> names;
-    enum {      noUnmapped,  noNoFeature,  noTooManyWLmatches,  noNoExactWLmatch,  yesWLmatch,  yessubWLmatchExact, yesWLmatch_UniqueFeature,  yesWLmatch_MultiFeature,  yessubWLmatch_MultiFeatureMultiGenomic,  yesCellBarcodes,  yesUMIs, nStats};
+    enum {      noUnmapped,  noNoFeature,  noTooManyWLmatches,  noMMtoWLwithoutExact,  yesWLmatch,  yessubWLmatchExact, yesWLmatch_UniqueFeature,  yesWLmatch_MultiFeature,  yessubWLmatch_MultiFeatureMultiGenomic,  yesCellBarcodes,  yesUMIs, nStats};
     uint64 V[nStats];    
     SoloReadFeatureStats() 
     {
-        names={"noUnmapped","noNoFeature","noTooManyWLmatches","noNoExactWLmatch","yesWLmatch","yessubWLmatchExact","yesWLmatch_UniqueFeature","yesWLmatch_MultiFeature","yessubWLmatch_MultiFeatureMultiGenomic","yesCellBarcodes","yesUMIs"};
+        names={"noUnmapped","noNoFeature","noTooManyWLmatches","noMMtoWLwithoutExact","yesWLmatch","yessubWLmatchExact","yesWLmatch_UniqueFeature","yesWLmatch_MultiFeature","yessubWLmatch_MultiFeatureMultiGenomic","yesCellBarcodes","yesUMIs"};
         for (uint32 ii=0; ii<nStats; ii++)
             V[ii]=0;
     };
     
     uint64 numInvalidBarcodes()
     {
-        return V[noTooManyWLmatches]+V[noNoExactWLmatch];
+        return V[noTooManyWLmatches]+V[noMMtoWLwithoutExact];
     };
     
     uint64 numMappedToTranscriptome()
diff --git a/source/SoloReadFeature_inputRecords.cpp b/source/SoloReadFeature_inputRecords.cpp
@@ -27,7 +27,7 @@ void SoloReadFeature::inputRecords(uint32 **cbP, uint32 cbPstride, vector<uint32
             *streamReads >> cb;
 
             if ( pSolo.CBmatchWL.oneExact && cbmatch==1 && cbReadCountTotal[cb]==0 && feature!=(uint32)(-1) ) {//single 1MM match, no exact matches to this CB
-                stats.V[stats.noNoExactWLmatch]++;
+                stats.V[stats.noMMtoWLwithoutExact]++;
                 continue;
             };
 
diff --git a/source/Transcriptome.cpp b/source/Transcriptome.cpp
@@ -153,7 +153,7 @@ void Transcriptome::quantsOutput() {
     ofstream qOut(P.quant.geCount.outFile);
     qOut << "N_unmapped";
     for (int itype=0; itype<quants->geneCounts.nType; itype++) {
-        qOut << "\t" <<g_statsAll.unmappedAll;
+        qOut << "\t" <<g_statsAll.unmappedMismatch + g_statsAll.unmappedShort + g_statsAll.unmappedOther + g_statsAll.unmappedMulti;
     };
     qOut << "\n";
 
diff --git a/source/VERSION b/source/VERSION
@@ -1 +1 @@
-#define STAR_VERSION "dev_EoI_2.7.9a_2021-09-10"
+#define STAR_VERSION "dev_EoI_2.7.9a_2021-09-30"
diff --git a/source/parametersDefault b/source/parametersDefault
@@ -14,10 +14,9 @@ sysShell            -
 ### Run Parameters
 runMode                         alignReads
     string: type of the run.
-
                                 alignReads             ... map reads
                                 genomeGenerate         ... generate genome files
-                                inputAlignmentsFromBAM ... input alignments from BAM. Presently only works with --outWigType and --bamRemoveDuplicates.
+                                inputAlignmentsFromBAM ... input alignments from BAM. Presently only works with --outWigType and --bamRemoveDuplicates options.
                                 liftOver               ... lift-over of GTF files (--sjdbGTFfile) between genome assemblies using chain file(s) from --genomeChainFiles.
                                 soloCellFiltering  </path/to/raw/count/dir/>   </path/to/output/prefix>    ... STARsolo cell filtering ("calling") without remapping, followed by the path to raw count directory and output (filtered) prefix
 
diff --git a/source/parametersDefault.xxd b/source/parametersDefault.xxd

Original file line number	Diff line number	Diff line change
`@@ -1,3 +1,5 @@`
	`1`	`+* Issue #1223: fixed the N_unmapped value reported in ReadsPerGene.out.tab. The single-end (i.e. partially mapped) alignments are not excluded from N_unmapped.`
	`2`	`+* Issues #535, #1350: fixed a long-standing problem that resulted in a seg-fault when mapping to the rabbit genome.`
`1`	`3`	`* Issue #1316: fixed the seg-fault which occurred if --soloType CB_samTagOut and --soloCBwhitelist None are used together.`
`2`	`4`	`* Changed Solo summary statistics outputs in Barcodes.stats and Features.stats files.`
`3`	`5`	`* Implemented --soloCBmatchWLtype ParseBio_ED3 to allow multiple mismatches and one insertion+deletion for --soloType CB_UMI_Complex.`