Skip to content

Commit 12beb0c

Browse files
committed
Issues #535, #1350: fixed a long-standing problem that resulted in a seg-fault whem mapping to the rabbit genome. Issue #1223: fixed the N_unmapped value reported in ReadsPerGene.out.tab. The single-end (i.e. partially mapped alignment are not excluded from N_unmapped. dev_EoI_2.7.9a_2021-09-30
1 parent 2f34c8b commit 12beb0c

16 files changed

Lines changed: 4209 additions & 4189 deletions

CHANGES.md

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,5 @@
1+
* Issue #1223: fixed the N_unmapped value reported in ReadsPerGene.out.tab. The single-end (i.e. partially mapped alignment are not excluded from N_unmapped.
2+
* Issues #535, #1350: fixed a long-standing problem that resulted in a seg-fault whem mapping to the rabbit genome.
13
* Issue #1316: fixed the seg-fault which occurred if --soloType CB_samTagOut and --soloCBwhitelist None are used together.
24
* Changed Solo summary statistics outputs in Barcodes.stats and Features.stats files.
35
* Implemented --soloCBmatchWLtype ParseBio_ED3 to allow multiple mismatches and one insertion+deletion for --soloType CB_UMI_Complex.

bin/Linux_x86_64/STAR

14.1 KB
Binary file not shown.

bin/Linux_x86_64/STARlong

18.1 KB
Binary file not shown.

bin/Linux_x86_64_static/STAR

14 KB
Binary file not shown.

bin/Linux_x86_64_static/STARlong

18 KB
Binary file not shown.

extras/doc-latex/parametersDefault.tex

Lines changed: 17 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,13 @@
1818
\optName{runMode}
1919
\optValue{alignReads}
2020
\optLine{string: type of the run.}
21+
\begin{optOptTable}
22+
\optOpt{alignReads} \optOptLine{map reads}
23+
\optOpt{genomeGenerate} \optOptLine{generate genome files}
24+
\optOpt{inputAlignmentsFromBAM} \optOptLine{input alignments from BAM. Presently only works with --outWigType and --bamRemoveDuplicates options.}
25+
\optOpt{liftOver} \optOptLine{lift-over of GTF files (--sjdbGTFfile) between genome assemblies using chain file(s) from --genomeChainFiles.}
26+
\optOpt{soloCellFiltering {\textless}/path/to/raw/count/dir/{\textgreater} {\textless}/path/to/output/prefix{\textgreater}} \optOptLine{STARsolo cell filtering ("calling") without remapping, followed by the path to raw count directory and output (filtered) prefix}
27+
\end{optOptTable}
2128
\optName{runThreadN}
2229
\optValue{1}
2330
\optLine{int: number of threads to run STAR}
@@ -357,7 +364,8 @@
357364
\optLine{***STARsolo:}
358365
\begin{optOptTable}
359366
\optOpt{CR CY UR UY} \optOptLine{sequences and quality scores of cell barcodes and UMIs for the solo* demultiplexing.}
360-
\optOpt{GX GN} \optOptLine{gene ID and gene name.}
367+
\optOpt{GX GN} \optOptLine{gene ID and gene name for unique-gene reads.}
368+
\optOpt{gx gn} \optOptLine{gene IDs and gene names for unique- and multi-gene reads.}
361369
\optOpt{CB UB} \optOptLine{error-corrected cell barcodes and UMIs for solo* demultiplexing. Requires --outSAMtype BAM SortedByCoordinate.}
362370
\optOpt{sM} \optOptLine{assessment of CB and UMI.}
363371
\optOpt{sS} \optOptLine{sequence of the entire barcode (CB,UMI,adapter).}
@@ -879,7 +887,7 @@
879887
\optLine{string(s): type of single-cell RNA-seq}
880888
\begin{optOptTable}
881889
\optOpt{CB{\textunderscore}UMI{\textunderscore}Simple} \optOptLine{(a.k.a. Droplet) one UMI and one Cell Barcode of fixed length in read2, e.g. Drop-seq and 10X Chromium.}
882-
\optOpt{CB{\textunderscore}UMI{\textunderscore}Complex} \optOptLine{one UMI of fixed length, but multiple Cell Barcodes of varying length, as well as adapters sequences are allowed in read2 only, e.g. inDrop.}
890+
\optOpt{CB{\textunderscore}UMI{\textunderscore}Complex} \optOptLine{multiple Cell Barcodes of varying length, one UMI of fixed length and one adapter sequence of fixed length are allowed in read2 only (e.g. inDrop, ddSeq).}
883891
\optOpt{CB{\textunderscore}samTagOut} \optOptLine{output Cell Barcode as CR and/or CB SAm tag. No UMI counting. --readFilesIn cDNA{\textunderscore}read1 [cDNA{\textunderscore}read2 if paired-end] CellBarcode{\textunderscore}read . Requires --outSAMtype BAM Unsorted [and/or SortedByCoordinate]}
884892
\optOpt{SmartSeq} \optOptLine{Smart-seq: each cell in a separate FASTQ (paired- or single-end), barcodes are corresponding read-groups, no UMI sequences, alignments deduplicated according to alignment start and end (after extending soft-clipped bases)}
885893
\end{optOptTable}
@@ -933,7 +941,7 @@
933941
\optLine{--soloCBposition 3{\textunderscore}9{\textunderscore}3{\textunderscore}14}
934942
\optName{soloAdapterSequence}
935943
\optValue{-}
936-
\optLine{string: adapter sequence to anchor barcodes.}
944+
\optLine{string: adapter sequence to anchor barcodes. Only one adapter sequence is allowed.}
937945
\optName{soloAdapterMismatchesNmax}
938946
\optValue{1}
939947
\optLine{int{\textgreater}0: maximum number of mismatches allowed in adapter sequence.}
@@ -949,6 +957,7 @@
949957
\begin{optOptTable}
950958
\optOpt{1MM{\textunderscore}multi{\textunderscore}pseudocounts} \optOptLine{same as 1MM{\textunderscore}Multi, but pseudocounts of 1 are added to all whitelist barcodes.}
951959
\optOpt{1MM{\textunderscore}multi{\textunderscore}Nbase{\textunderscore}pseudocounts} \optOptLine{same as 1MM{\textunderscore}multi{\textunderscore}pseudocounts, multimatching to WL is allowed for CBs with N-bases. This option matches best with CellRanger {\textgreater}= 3.0.0}
960+
\optOpt{ParseBio{\textunderscore}ED3} \optOptLine{allow up to edit distance of 3 fpr each of the barcodes. May include one deletion + one insertion. Only works with --soloType CB{\textunderscore}UMI{\textunderscore}Complex. Matches to multiple passlist barcdoes are not allowed. Similar to ParseBio Split-seq pipeline.}
952961
\end{optOptTable}
953962
\optName{soloInputSAMattrBarcodeSeq}
954963
\optValue{-}
@@ -974,16 +983,19 @@
974983
\begin{optOptTable}
975984
\optOpt{Gene} \optOptLine{genes: reads match the gene transcript}
976985
\optOpt{SJ} \optOptLine{splice junctions: reported in SJ.out.tab}
977-
\optOpt{GeneFull} \optOptLine{full genes: count all reads overlapping genes' exons and introns}
986+
\optOpt{GeneFull} \optOptLine{full gene (pre-mRNA): count all reads overlapping genes' exons and introns}
987+
\optOpt{GeneFull{\textunderscore}ExonOverIntron} \optOptLine{full gene (pre-mRNA): count all reads overlapping genes' exons and introns: prioritize 100% overlap with exons}
988+
\optOpt{GeneFull{\textunderscore}Ex50pAS} \optOptLine{full gene (pre-RNA): count all reads overlapping genes' exons and introns: prioritize {\textgreater}50% overlap with exons. Do not count reads with 100% exonic overlap in the antisense direction.}
978989
\end{optOptTable}
979990
\optName{soloMultiMappers}
980991
\optValue{Unique}
981992
\optLine{string(s): counting method for reads mapping to multiple genes }
982993
\begin{optOptTable}
983994
\optOpt{Unique} \optOptLine{count only reads that map to unique genes}
984995
\optOpt{Uniform} \optOptLine{uniformly distribute multi-genic UMIs to all genes}
985-
\optOpt{Rescue} \optOptLine{distribute UMIs proportionally to unique+uniform counts (~ first iteartion of EM)}
996+
\optOpt{Rescue} \optOptLine{distribute UMIs proportionally to unique+uniform counts (~ first iteration of EM)}
986997
\optOpt{PropUnique} \optOptLine{distribute UMIs proportionally to unique mappers, if present, and uniformly if not.}
998+
\optOpt{EM} \optOptLine{multi-gene UMIs are distributed using Expectation Maximization algorithm}
987999
\end{optOptTable}
9881000
\optName{soloUMIdedup}
9891001
\optValue{1MM{\textunderscore}All}

source/ReadAlign_alignBAM.cpp

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -99,8 +99,10 @@ int ReadAlign::alignBAM(Transcript const &trOut, uint nTrOut, uint iTrOut, uint
9999
int32 gxgnGene = -1; //gene to output in GX/GN
100100
if (P.outSAMattrPresent.GX || P.outSAMattrPresent.GN) {
101101
auto annFeat = readAnnot.annotFeatures[P.pSolo.samAttrFeature];
102-
if (annFeat.fSet.size()==1 && annFeat.fAlign[iTrOut].size()==1)
102+
if (annFeat.fSet.size()==1 && iTrOut < annFeat.fAlign.size() && annFeat.fAlign[iTrOut].size()==1) {
103+
// 2nd condition needed when this function is called to output transcriptomic alignments, where GX/GN are not allowed, and iTrOut can be large
103104
gxgnGene = *annFeat.fAlign[iTrOut].begin();
105+
};
104106
};
105107

106108
for (uint imate=0;imate < (alignType<0 ? nMates:P.readNmates);imate++) { //not readNends: this is about alignments

source/ReadAlign_maxMappableLength2strands.cpp

Lines changed: 16 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -49,27 +49,35 @@ uint ReadAlign::maxMappableLength2strands(uint pieceStartIn, uint pieceLengthIn,
4949
};
5050

5151
// define upper bound for suffix array range search.
52+
bool iSA2good = true;
5253
if (mapGen.genomeSAindexStart[Lind-1]+ind1+1 < mapGen.genomeSAindexStart[Lind]) {//we are not at the end of the SA
53-
iSA2=((mapGen.SAi[mapGen.genomeSAindexStart[Lind-1]+ind1+1] & mapGen.SAiMarkNmask) & mapGen.SAiMarkAbsentMask) - 1;
54+
iSA2 = mapGen.SAi[mapGen.genomeSAindexStart[Lind-1]+ind1+1];
55+
if ( (iSA2 & mapGen.SAiMarkAbsentMaskC) == 0) {
56+
iSA2 = (iSA2 & mapGen.SAiMarkNmask) - 1;
57+
} else {
58+
iSA2 = mapGen.nSA-1; //safe, but can probably do better
59+
iSA2good = false;
60+
};
5461
} else {
5562
iSA2=mapGen.nSA-1;
63+
iSA2good = false;
5664
};
5765

58-
5966
//#define SA_SEARCH_FULL
6067

6168
#ifdef SA_SEARCH_FULL
6269
//full search of the array even if the index search gave maxL
6370
maxL=0;
6471
Nrep = maxMappableLength(mapGen, Read1, pieceStart, pieceLength, iSA1 & mapGen.SAiMarkNmask, iSA2, dirR, maxL, indStartEnd);
6572
#else
66-
if (Lind < P.pGe.gSAindexNbases && (iSA1 & mapGen.SAiMarkNmaskC)==0 ) {//no need for SA search
73+
bool iSA1noN = (iSA1 & mapGen.SAiMarkNmaskC)==0;
74+
if (Lind < P.pGe.gSAindexNbases && iSA1noN && iSA2good) {//no need for SA search
6775
// very short seq, already found hits in suffix array w/o having to search the genome for extensions.
6876
indStartEnd[0]=iSA1;
6977
indStartEnd[1]=iSA2;
7078
Nrep=indStartEnd[1]-indStartEnd[0]+1;
7179
maxL=Lind;
72-
} else if (iSA1==iSA2) {//unique align already, just find maxL
80+
} else if (iSA1==iSA2 && iSA1noN && iSA2good) {//unique align already, just find maxL
7381
if ((iSA1 & mapGen.SAiMarkNmaskC)!=0) {
7482
ostringstream errOut;
7583
errOut << "BUG: in ReadAlign::maxMappableLength2strands";
@@ -79,12 +87,12 @@ uint ReadAlign::maxMappableLength2strands(uint pieceStartIn, uint pieceLengthIn,
7987
Nrep=1;
8088
bool comparRes;
8189
maxL=compareSeqToGenome(mapGen, Read1, pieceStart, pieceLength, Lind, iSA1, dirR, comparRes);
82-
} else {//SA search, pieceLength>maxL
83-
if ( (iSA1 & mapGen.SAiMarkNmaskC)==0 ) {//no N in the prefix
84-
maxL=Lind;
90+
} else {//need SA search, pieceLength>maxL
91+
if (iSA2good && iSA1noN) {
92+
maxL = Lind; //Lind bases were already matched
8593
} else {
8694
maxL=0;
87-
};
95+
};
8896
Nrep = maxMappableLength(mapGen, Read1, pieceStart, pieceLength, iSA1 & mapGen.SAiMarkNmask, iSA2, dirR, maxL, indStartEnd);
8997
};
9098
#endif

source/SoloReadBarcodeStats.h

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -5,11 +5,11 @@
55
class SoloReadBarcodeStats {
66
public:
77
vector<string> names;
8-
enum { noNoAdapter, noNoUMI, nNoCB, noNinCB, noNinUMI, noUMIhomopolymer, noNoWLmatch, noTooManyMM, noTooManyWLmatches, yesWLmatchExact, yesWLmatchWithMM, nStats};
8+
enum { noNoAdapter, noNoUMI, noNoCB, noNinCB, noNinUMI, noUMIhomopolymer, noNoWLmatch, noTooManyMM, noTooManyWLmatches, yesWLmatchExact, yesOneWLmatchWithMM, yesMultWLmatchWithMM, nStats};
99
uint64 V[nStats];
1010
SoloReadBarcodeStats()
1111
{
12-
names={"noNoAdapter", "noNoUMI", "nNoCB", "noNinCB", "noNinUMI", "noUMIhomopolymer","noNoWLmatch", "noTooManyMM", "noTooManyWLmatches", "yesWLmatchExact", "yesWLmatchWithMM"};
12+
names={"noNoAdapter", "noNoUMI", "noNoCB", "noNinCB", "noNinUMI", "noUMIhomopolymer","noNoWLmatch", "noTooManyMM", "noTooManyWLmatches", "yesWLmatchExact", "yesOneWLmatchWithMM", "yesMultWLmatchWithMM"};
1313
for (uint32 ii=0; ii<nStats; ii++)
1414
V[ii]=0;
1515
};

source/SoloReadBarcode_getCBandUMI.cpp

Lines changed: 8 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -82,7 +82,6 @@ void SoloReadBarcode::matchCBtoWL(string &cbSeq1, string &cbQual1, vector<uint64
8282
} else if (cbMatch1==1) {//1 match, no need to record the quality
8383
cbMatchString1 = to_string(cbMatchInd1[0]);
8484
} else if (!pSolo.CBmatchWL.mm1_multi) {//>1 matches, but this is not allowed
85-
//stats.V[stats.noTooManyWLmatches]++;
8685
cbMatch1=-3;
8786
cbMatchInd1.clear();
8887
cbMatchString1="";
@@ -93,20 +92,18 @@ void SoloReadBarcode::addStats(const int32 cbMatch1)
9392
{
9493
if (!pSolo.cbWLyes) //no stats if no WL
9594
return;
96-
97-
if (cbMatch1>1) {
98-
stats.V[stats.noTooManyWLmatches]++;
99-
return;
100-
};
101-
95+
10296
switch (cbMatch1) {
103-
case 0:
97+
case 0://exact matches
10498
cbReadCountExact[cbMatchInd[0]]++;//note that this simply counts reads per exact CB, no checks of genes or UMIs
10599
stats.V[stats.yesWLmatchExact]++;
106100
break;
107-
case 1:
108-
stats.V[stats.yesWLmatchWithMM]++;
101+
case 1: //one WL match counted here, but they may still get rejected in SoloReadFeature_inputRecords.cpp
102+
stats.V[stats.yesOneWLmatchWithMM]++;
109103
break;
104+
default: //multiple WL matches are counted here, but they may still get rejected in SoloReadFeature_inputRecords.cpp
105+
stats.V[stats.yesMultWLmatchWithMM]++;
106+
break;
110107
case -1 :
111108
stats.V[stats.noNoWLmatch]++;
112109
break;
@@ -117,7 +114,7 @@ void SoloReadBarcode::addStats(const int32 cbMatch1)
117114
stats.V[stats.noTooManyWLmatches]++;
118115
break;
119116
case -11 :
120-
stats.V[stats.nNoCB]++;//CB sequence cannot be extracted
117+
stats.V[stats.noNoCB]++;//CB sequence cannot be extracted
121118
break;
122119
case -12 :
123120
stats.V[stats.noTooManyMM]++;

0 commit comments

Comments
 (0)