Skip to content

Commit

Permalink
Rearranged collapseUMIall().
Browse files Browse the repository at this point in the history
  • Loading branch information
alexdobin committed Oct 20, 2022
1 parent e1a0d39 commit 6f2442d
Show file tree
Hide file tree
Showing 6 changed files with 37 additions and 29 deletions.
2 changes: 1 addition & 1 deletion CHANGES.md
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
STAR 2.7.10b --- 2022/10/18 ::: Bug-fix release.
STAR 2.7.10b --- 2022/10/20 ::: Bug-fix release.
===========================================================================
* PR #1638: Increased entropy of shmKey to avoid collisions between genomes. Many thanks to Jeff Hussmann (@jeffhussmann).
* Issue #1577: Reduced RAM usage for large STARsolo runs.
Expand Down
Binary file modified doc/STARmanual.pdf
Binary file not shown.
7 changes: 5 additions & 2 deletions source/SoloFeature.h
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,8 @@ class SoloFeature {
vector<uint32> nGenePerCB, nGenePerCBmulti;//number of genes (with >0 UMIs) per CB
vector<uint32> nReadPerCB;//number of reads per CB. With multimappers: all aligns per CB
vector<uint32> nReadPerCBunique, nReadPerCBtotal; //number of unique and multiple reads per CB

uint32 nReadPerCBmax;

vector<double> nUMIperCBmulti;

vector<uint32> countCellGeneUMI;//sparsified matrix for the counts, each entry is: geneID count1 count2 ... countNcounts
Expand Down Expand Up @@ -89,7 +90,9 @@ class SoloFeature {

void collapseUMI(uint32 iCB, uint32 *umiArray);
void collapseUMI_CR(uint32 iCB, uint32 *umiArray);
void collapseUMIall(uint32 iCB, uint32 *umiArray);
void collapseUMIall();
void collapseUMIperCB(uint32 iCB, vector<uint32> &umiArray, vector<uint32> &gID, vector<uint32> &gReadS);

uint32 umiArrayCorrect_CR (const uint32 nU0, uintUMI *umiArr, const bool readInfoRec, const bool nUMIyes, unordered_map <uintUMI,uintUMI> &umiCorr);
uint32 umiArrayCorrect_Directional(const uint32 nU0, uintUMI *umiArr, const bool readInfoRec, const bool nUMIyes, unordered_map <uintUMI,uintUMI> &umiCorr, const int32 dirCountAdd);
uint32 umiArrayCorrect_Graph (const uint32 nU0, uintUMI *umiArr, const bool readInfoRec, const bool nUMIyes, unordered_map <uintUMI,uintUMI> &umiCorr);
Expand Down
38 changes: 27 additions & 11 deletions source/SoloFeature_collapseUMIall.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -8,17 +8,35 @@
inline int funCompareSolo1 (const void *a, const void *b); //defined below
inline int funCompare_uint32_1_2_0 (const void *a, const void *b);

void SoloFeature::collapseUMIall(uint32 iCB, uint32 *umiArray)
void SoloFeature::collapseUMIall()
{
vector<uint32> umiArray(nReadPerCBmax*umiArrayStride);//temp array for collapsing UMI
vector<uint32> gID(min(2*featuresNumber,nReadPerCBmax)+1); //gene IDs, 2* is needed because each gene can have unique and multi-mappers
vector<uint32> gReadS(min(2*featuresNumber,nReadPerCBmax)+1); //start of gene reads TODO: allocate this array in the 2nd half of rGU

for (uint32 icb=0; icb<nCB; icb++) {//main collapse cycle

collapseUMIperCB(icb, umiArray, gID, gReadS);

readFeatSum->stats.V[readFeatSum->stats.yesUMIs] += nUMIperCB[icb];
if (nGenePerCB[icb]>0) //nGenePerCB contains only unique
++readFeatSum->stats.V[readFeatSum->stats.yesCellBarcodes];

readFeatSum->stats.V[readFeatSum->stats.yesWLmatch] += nReadPerCBtotal[icb];
readFeatSum->stats.V[readFeatSum->stats.yessubWLmatch_UniqueFeature ] += nReadPerCBunique[icb];
};
};

void SoloFeature::collapseUMIperCB(uint32 iCB, vector<uint32> &umiArray, vector<uint32> &gID, vector<uint32> &gReadS)
{

uint32 *rGU=rCBp[iCB];
uint32 rN=nReadPerCB[iCB]; //with multimappers, this is the number of all aligns, not reads

qsort(rGU,rN,rguStride*sizeof(uint32),funCompareNumbers<uint32>); //sort by gene index

uint32 gid1 = -1;//current gID
uint32 nGenes = 0, nGenesMult = 0; //number of genes
uint32 *gID = new uint32[min(2*featuresNumber,rN)+1]; //gene IDs, 2* is needed because each gene can have unique and multi-mappers
uint32 *gReadS = new uint32[min(2*featuresNumber,rN)+1]; //start of gene reads TODO: allocate this array in the 2nd half of rGU
for (uint32 iR=0; iR<rN*rguStride; iR+=rguStride) {
if (rGU[iR+rguG]!=gid1) {//record gene boundary
gReadS[nGenes]=iR;
Expand Down Expand Up @@ -121,7 +139,7 @@ void SoloFeature::collapseUMIall(uint32 iCB, uint32 *umiArray)
umiGeneMapCount0[umiArray[iu+0]][iG]+=umiArray[iu+1];//this sums read counts over UMIs that were collapsed
};

umiArrayCorrect_CR(nU0, umiArray, readInfo.size()>0, false, umiCorrected[iG]);
umiArrayCorrect_CR(nU0, umiArray.data(), readInfo.size()>0, false, umiCorrected[iG]);

for (uint64 iu=0; iu<nU0*umiArrayStride; iu+=umiArrayStride) {//just fill the umiGeneMapCount - will calculate UMI counts later
umiGeneMapCount[umiArray[iu+2]][iG]+=umiArray[iu+1];//this sums read counts over UMIs that were collapsed
Expand All @@ -140,20 +158,20 @@ void SoloFeature::collapseUMIall(uint32 iCB, uint32 *umiArray)

if (pSolo.umiDedup.yes.CR)
countCellGeneUMI[countCellGeneUMIindex[iCB+1] + pSolo.umiDedup.countInd.CR] =
umiArrayCorrect_CR(nU0, umiArray, readInfo.size()>0 && pSolo.umiDedup.typeMain==UMIdedup::typeI::CR, true, umiCorrected[iG]);
umiArrayCorrect_CR(nU0, umiArray.data(), readInfo.size()>0 && pSolo.umiDedup.typeMain==UMIdedup::typeI::CR, true, umiCorrected[iG]);

if (pSolo.umiDedup.yes.Directional)
countCellGeneUMI[countCellGeneUMIindex[iCB+1] + pSolo.umiDedup.countInd.Directional] =
umiArrayCorrect_Directional(nU0, umiArray, readInfo.size()>0 && pSolo.umiDedup.typeMain==UMIdedup::typeI::Directional, true, umiCorrected[iG], 0);
umiArrayCorrect_Directional(nU0, umiArray.data(), readInfo.size()>0 && pSolo.umiDedup.typeMain==UMIdedup::typeI::Directional, true, umiCorrected[iG], 0);

if (pSolo.umiDedup.yes.Directional_UMItools)
countCellGeneUMI[countCellGeneUMIindex[iCB+1] + pSolo.umiDedup.countInd.Directional_UMItools] =
umiArrayCorrect_Directional(nU0, umiArray, readInfo.size()>0 && pSolo.umiDedup.typeMain==UMIdedup::typeI::Directional_UMItools, true, umiCorrected[iG], -1);
umiArrayCorrect_Directional(nU0, umiArray.data(), readInfo.size()>0 && pSolo.umiDedup.typeMain==UMIdedup::typeI::Directional_UMItools, true, umiCorrected[iG], -1);

//this changes umiArray, so it should be last call
if (pSolo.umiDedup.yes.All)
countCellGeneUMI[countCellGeneUMIindex[iCB+1] + pSolo.umiDedup.countInd.All] =
umiArrayCorrect_Graph(nU0, umiArray, readInfo.size()>0 && pSolo.umiDedup.typeMain==UMIdedup::typeI::All, true, umiCorrected[iG]);
umiArrayCorrect_Graph(nU0, umiArray.data(), readInfo.size()>0 && pSolo.umiDedup.typeMain==UMIdedup::typeI::All, true, umiCorrected[iG]);
};//if (nU0>0)

{//check any count>0 and finalize record for this gene
Expand Down Expand Up @@ -514,9 +532,7 @@ void SoloFeature::collapseUMIall(uint32 iCB, uint32 *umiArray)
};
};
};

delete[] gID;
delete[] gReadS;

};

////////////////////////////////////////////////////////////////////////////////////////////////
Expand Down
16 changes: 2 additions & 14 deletions source/SoloFeature_countCBgeneUMI.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -70,7 +70,7 @@ void SoloFeature::countCBgeneUMI()
};*/

nReadPerCB.resize(nCB);
uint32 nReadPerCBmax=0;
nReadPerCBmax=0;
for (uint32 iCB=0; iCB<nCB; iCB++) {
nReadPerCB[iCB] = (rCBpa[indCB[iCB]]-rCBp[iCB])/rguStride; //number of reads that were matched to WL, rCBpa accumulated reference to the last element+1
//for multimappers this is the number of all alignments > number of reads
Expand All @@ -87,7 +87,6 @@ void SoloFeature::countCBgeneUMI()
nUMIperCB.resize(nCB);
nGenePerCB.resize(nCB);

uint32 *umiArray = new uint32[nReadPerCBmax*umiArrayStride];//temp array for collapsing UMI
//dedup options //gene ID
countMatStride = pSolo.umiDedup.yes.N + 1;
countCellGeneUMI.resize(nReadsMapped*countMatStride/5+16); //5 is heuristic, will be resized if needed
Expand All @@ -100,24 +99,13 @@ void SoloFeature::countCBgeneUMI()
countMatMult.i.resize(nCB+1, 0);
};

for (uint32 icb=0; icb<nCB; icb++) {//main collapse cycle

collapseUMIall(icb, umiArray);

readFeatSum->stats.V[readFeatSum->stats.yesUMIs] += nUMIperCB[icb];
if (nGenePerCB[icb]>0) //nGenePerCB contains only unique
++readFeatSum->stats.V[readFeatSum->stats.yesCellBarcodes];

readFeatSum->stats.V[readFeatSum->stats.yesWLmatch] += nReadPerCBtotal[icb];
readFeatSum->stats.V[readFeatSum->stats.yessubWLmatch_UniqueFeature ] += nReadPerCBunique[icb];
};
collapseUMIall();

P.inOut->logMain << "RAM for solo feature "<< SoloFeatureTypes::Names[featureType] <<"\n"
<< linuxProcMemory() << flush;
delete[] rGeneUMI;
delete[] rCBp;
delete[] rCBpa;
delete[] umiArray;

time(&rawTime);
P.inOut->logMain << timeMonthDayTime(rawTime) << " ... Finished collapsing UMIs" <<endl;
Expand Down
3 changes: 2 additions & 1 deletion source/systemFunctions.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -18,9 +18,10 @@ std::string linuxProcMemory()
(str1.rfind("VmSize",0) == 0) ||
(str1.rfind("VmHWM",0) == 0) ||
(str1.rfind("VmRSS",0) == 0) ) {
outString += str1+'\n';
outString += str1+"; ";
};
};
outString += '\n';

return outString;
};

0 comments on commit 6f2442d

Please sign in to comment.