Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Illumina Complete Long Read presets #1069

Merged
merged 2 commits into from
Jun 4, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 5 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -139,12 +139,15 @@ parameters at the same time. The default setting is the same as `map-ont`.
```sh
minimap2 -ax map-pb ref.fa pacbio-reads.fq > aln.sam # for PacBio CLR reads
minimap2 -ax map-ont ref.fa ont-reads.fq > aln.sam # for Oxford Nanopore reads
minimap2 -ax map-iclr ref.fa iclr-reads.fq > aln.sam # for Illumina Complete Long Reads
```
The difference between `map-pb` and `map-ont` is that `map-pb` uses
homopolymer-compressed (HPC) minimizers as seeds, while `map-ont` uses ordinary
minimizers as seeds. Emperical evaluation suggests HPC minimizers improve
minimizers as seeds. Empirical evaluation suggests HPC minimizers improve
performance and sensitivity when aligning PacBio CLR reads, but hurt when aligning
Nanopore reads.
Nanopore reads. `map-iclr` uses an adjusted alignment scoring matrix that
accounts for the low overall error rate in the reads, with transversion errors
being less frequent than transitions.

#### <a name="map-long-splice"></a>Map long mRNA/cDNA reads

Expand Down
16 changes: 14 additions & 2 deletions align.c
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,17 @@ static void ksw_gen_simple_mat(int m, int8_t *mat, int8_t a, int8_t b, int8_t sc
mat[(m - 1) * m + j] = sc_ambi;
}

static void ksw_gen_ts_mat(int m, int8_t *mat, int8_t a, int8_t b, int8_t transition, int8_t sc_ambi)
{
assert(m==5);
ksw_gen_simple_mat(m,mat,a,b,sc_ambi);
transition = transition > 0? -transition : transition;
mat[0*m+2]=transition; // A->G
mat[1*m+3]=transition; // C->T
mat[2*m+0]=transition; // G->A
mat[3*m+1]=transition; // T->C
}

static inline void mm_seq_rev(uint32_t len, uint8_t *seq)
{
uint32_t i;
Expand Down Expand Up @@ -323,6 +334,7 @@ static void mm_align_pair(void *km, const mm_mapopt_t *opt, int qlen, const uint
for (i = 0; i < qlen; ++i) fputc("ACGTN"[qseq[i]], stderr);
fputc('\n', stderr);
}
if (opt->b != opt->transition) flag |= KSW_EZ_GENERIC_SC;
if (opt->max_sw_mat > 0 && (int64_t)tlen * qlen > opt->max_sw_mat) {
ksw_reset_extz(ez);
ez->zdropped = 1;
Expand Down Expand Up @@ -586,7 +598,7 @@ static void mm_align1(void *km, const mm_mapopt_t *opt, const mm_idx_t *mi, int

r2->cnt = 0;
if (r->cnt == 0) return;
ksw_gen_simple_mat(5, mat, opt->a, opt->b, opt->sc_ambi);
ksw_gen_ts_mat(5, mat, opt->a, opt->b, opt->transition, opt->sc_ambi);
bw = (int)(opt->bw * 1.5 + 1.);
bw_long = (int)(opt->bw_long * 1.5 + 1.);
if (bw_long < bw) bw_long = bw;
Expand Down Expand Up @@ -844,7 +856,7 @@ static int mm_align1_inv(void *km, const mm_mapopt_t *opt, const mm_idx_t *mi, i
if (ql < opt->min_chain_score || ql > opt->max_gap) return 0;
if (tl < opt->min_chain_score || tl > opt->max_gap) return 0;

ksw_gen_simple_mat(5, mat, opt->a, opt->b, opt->sc_ambi);
ksw_gen_ts_mat(5, mat, opt->a, opt->b, opt->transition, opt->sc_ambi);
tseq = (uint8_t*)kmalloc(km, tl);
mm_idx_getseq(mi, r1->rid, r1->re, r2->rs, tseq);
qseq = r1->rev? &qseq0[0][r2->qe] : &qseq0[1][qlen - r2->qs];
Expand Down
5 changes: 3 additions & 2 deletions main.c
Original file line number Diff line number Diff line change
Expand Up @@ -120,7 +120,7 @@ static inline void yes_or_no(mm_mapopt_t *opt, int64_t flag, int long_idx, const

int main(int argc, char *argv[])
{
const char *opt_str = "2aSDw:k:K:t:r:f:Vv:g:G:I:d:XT:s:x:Hcp:M:n:z:A:B:O:E:m:N:Qu:R:hF:LC:yYPo:e:U:J:";
const char *opt_str = "2aSDw:k:K:t:r:f:Vv:g:G:I:d:XT:s:x:Hcp:M:n:z:A:B:b:O:E:m:N:Qu:R:hF:LC:yYPo:e:U:J:";
ketopt_t o = KETOPT_INIT;
mm_mapopt_t opt;
mm_idxopt_t ipt;
Expand Down Expand Up @@ -178,6 +178,7 @@ int main(int argc, char *argv[])
else if (c == 'm') opt.min_chain_score = atoi(o.arg);
else if (c == 'A') opt.a = atoi(o.arg);
else if (c == 'B') opt.b = atoi(o.arg);
else if (c == 'b') opt.transition = atoi(o.arg);
else if (c == 's') opt.min_dp_max = atoi(o.arg);
else if (c == 'C') opt.noncan = atoi(o.arg);
else if (c == 'I') ipt.batch_size = mm_parse_num(o.arg);
Expand Down Expand Up @@ -367,7 +368,7 @@ int main(int argc, char *argv[])
fprintf(fp_help, " --version show version number\n");
fprintf(fp_help, " Preset:\n");
fprintf(fp_help, " -x STR preset (always applied before other options; see minimap2.1 for details) []\n");
fprintf(fp_help, " - map-pb/map-ont - PacBio CLR/Nanopore vs reference mapping\n");
fprintf(fp_help, " - map-pb/map-ont/map-iclr-prerender/map-iclr - PacBio/Nanopore/ICLR vs reference mapping\n");
fprintf(fp_help, " - map-hifi - PacBio HiFi reads vs reference mapping\n");
fprintf(fp_help, " - ava-pb/ava-ont - PacBio/Nanopore read overlap\n");
fprintf(fp_help, " - asm5/asm10/asm20 - asm-to-ref mapping, for ~0.1/1/5%% sequence divergence\n");
Expand Down
1 change: 1 addition & 0 deletions minimap.h
Original file line number Diff line number Diff line change
Expand Up @@ -153,6 +153,7 @@ typedef struct {
float alt_drop;

int a, b, q, e, q2, e2; // matching score, mismatch, gap-open and gap-ext penalties
int transition; // transition mismatch score (A:G, C:T)
int sc_ambi; // score when one or both bases are "N"
int noncan; // cost of non-canonical splicing sites
int junc_bonus;
Expand Down
9 changes: 9 additions & 0 deletions options.c
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,7 @@ void mm_mapopt_init(mm_mapopt_t *opt)
opt->alt_drop = 0.15f;

opt->a = 2, opt->b = 4, opt->q = 4, opt->e = 2, opt->q2 = 24, opt->e2 = 1;
opt->transition = opt->b;
opt->sc_ambi = 1;
opt->zdrop = 400, opt->zdrop_inv = 200;
opt->end_bonus = -1;
Expand Down Expand Up @@ -112,6 +113,14 @@ int mm_set_opt(const char *preset, mm_idxopt_t *io, mm_mapopt_t *mo)
mo->occ_dist = 500;
mo->min_mid_occ = 50, mo->max_mid_occ = 500;
mo->min_dp_max = 200;
} else if (strcmp(preset, "map-iclr-prerender") == 0) {
io->flag = 0, io->k = 15;
mo->b = 6, mo->transition = 1;
mo->q = 10, mo->q2 = 50;
} else if (strcmp(preset, "map-iclr") == 0) {
io->flag = 0, io->k = 19;
mo->b = 6, mo->transition = 4;
mo->q = 10, mo->q2 = 50;
} else if (strncmp(preset, "asm", 3) == 0) {
io->flag = 0, io->k = 19, io->w = 19;
mo->bw = 1000, mo->bw_long = 100000;
Expand Down