Skip to content

Commit

Permalink
r819: mappy to support cs/MD
Browse files Browse the repository at this point in the history
  • Loading branch information
lh3 committed Jul 25, 2018
1 parent 8c064a5 commit ff9917a
Show file tree
Hide file tree
Showing 11 changed files with 115 additions and 28 deletions.
41 changes: 32 additions & 9 deletions format.c
Original file line number Diff line number Diff line change
Expand Up @@ -133,10 +133,10 @@ void mm_write_sam_hdr(const mm_idx_t *idx, const char *rg, const char *ver, int
free(str.s);
}

static void write_cs_core(kstring_t *s, const uint8_t *tseq, const uint8_t *qseq, const mm_reg1_t *r, char *tmp, int no_iden)
static void write_cs_core(kstring_t *s, const uint8_t *tseq, const uint8_t *qseq, const mm_reg1_t *r, char *tmp, int no_iden, int write_tag)
{
int i, q_off, t_off;
mm_sprintf_lite(s, "\tcs:Z:");
if (write_tag) mm_sprintf_lite(s, "\tcs:Z:");
for (i = q_off = t_off = 0; i < (int)r->p->n_cigar; ++i) {
int j, op = r->p->cigar[i]&0xf, len = r->p->cigar[i]>>4;
assert(op >= 0 && op <= 3);
Expand Down Expand Up @@ -181,10 +181,10 @@ static void write_cs_core(kstring_t *s, const uint8_t *tseq, const uint8_t *qseq
assert(t_off == r->re - r->rs && q_off == r->qe - r->qs);
}

static void write_MD_core(kstring_t *s, const uint8_t *tseq, const uint8_t *qseq, const mm_reg1_t *r, char *tmp)
static void write_MD_core(kstring_t *s, const uint8_t *tseq, const uint8_t *qseq, const mm_reg1_t *r, char *tmp, int write_tag)
{
int i, q_off, t_off, l_MD = 0;
mm_sprintf_lite(s, "\tMD:Z:");
if (write_tag) mm_sprintf_lite(s, "\tMD:Z:");
for (i = q_off = t_off = 0; i < (int)r->p->n_cigar; ++i) {
int j, op = r->p->cigar[i]&0xf, len = r->p->cigar[i]>>4;
assert(op >= 0 && op <= 2); // introns (aka reference skips) are not supported
Expand All @@ -210,7 +210,7 @@ static void write_MD_core(kstring_t *s, const uint8_t *tseq, const uint8_t *qseq
assert(t_off == r->re - r->rs && q_off == r->qe - r->qs);
}

static void write_cs_or_MD(void *km, kstring_t *s, const mm_idx_t *mi, const mm_bseq1_t *t, const mm_reg1_t *r, int no_iden, int is_MD)
static void write_cs_or_MD(void *km, kstring_t *s, const mm_idx_t *mi, const mm_bseq1_t *t, const mm_reg1_t *r, int no_iden, int is_MD, int write_tag)
{
extern unsigned char seq_nt4_table[256];
int i;
Expand All @@ -230,11 +230,34 @@ static void write_cs_or_MD(void *km, kstring_t *s, const mm_idx_t *mi, const mm_
qseq[r->qe - i - 1] = c >= 4? 4 : 3 - c;
}
}
if (is_MD) write_MD_core(s, tseq, qseq, r, tmp);
else write_cs_core(s, tseq, qseq, r, tmp, no_iden);
if (is_MD) write_MD_core(s, tseq, qseq, r, tmp, write_tag);
else write_cs_core(s, tseq, qseq, r, tmp, no_iden, write_tag);
kfree(km, qseq); kfree(km, tseq); kfree(km, tmp);
}

int mm_gen_cs_or_MD(void *km, char **buf, int *max_len, const mm_idx_t *mi, const mm_reg1_t *r, const char *seq, int is_MD, int no_iden)
{
mm_bseq1_t t;
kstring_t str;
str.s = *buf, str.l = 0, str.m = *max_len;
t.l_seq = strlen(seq);
t.seq = (char*)seq;
write_cs_or_MD(km, &str, mi, &t, r, no_iden, is_MD, 0);
*max_len = str.m;
*buf = str.s;
return str.l;
}

int mm_gen_cs(void *km, char **buf, int *max_len, const mm_idx_t *mi, const mm_reg1_t *r, const char *seq, int no_iden)
{
return mm_gen_cs_or_MD(km, buf, max_len, mi, r, seq, 0, no_iden);
}

int mm_gen_MD(void *km, char **buf, int *max_len, const mm_idx_t *mi, const mm_reg1_t *r, const char *seq)
{
return mm_gen_cs_or_MD(km, buf, max_len, mi, r, seq, 1, 0);
}

static inline void write_tags(kstring_t *s, const mm_reg1_t *r)
{
int type;
Expand Down Expand Up @@ -277,7 +300,7 @@ void mm_write_paf(kstring_t *s, const mm_idx_t *mi, const mm_bseq1_t *t, const m
mm_sprintf_lite(s, "%d%c", r->p->cigar[k]>>4, "MIDNSHP=XB"[r->p->cigar[k]&0xf]);
}
if (r->p && (opt_flag & (MM_F_OUT_CS|MM_F_OUT_MD)))
write_cs_or_MD(km, s, mi, t, r, !(opt_flag&MM_F_OUT_CS_LONG), opt_flag&MM_F_OUT_MD);
write_cs_or_MD(km, s, mi, t, r, !(opt_flag&MM_F_OUT_CS_LONG), opt_flag&MM_F_OUT_MD, 1);
if ((opt_flag & MM_F_COPY_COMMENT) && t->comment)
mm_sprintf_lite(s, "\t%s", t->comment);
}
Expand Down Expand Up @@ -476,7 +499,7 @@ void mm_write_sam2(kstring_t *s, const mm_idx_t *mi, const mm_bseq1_t *t, int se
}
}
if (r->p && (opt_flag & (MM_F_OUT_CS|MM_F_OUT_MD)))
write_cs_or_MD(km, s, mi, t, r, !(opt_flag&MM_F_OUT_CS_LONG), opt_flag&MM_F_OUT_MD);
write_cs_or_MD(km, s, mi, t, r, !(opt_flag&MM_F_OUT_CS_LONG), opt_flag&MM_F_OUT_MD, 1);
if (cigar_in_tag)
write_sam_cigar(s, flag, 1, t->l_seq, r, opt_flag);
}
Expand Down
2 changes: 1 addition & 1 deletion main.c
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
#include "getopt.h"
#endif

#define MM_VERSION "2.11-r815-dirty"
#define MM_VERSION "2.11-r819-dirty"

#ifdef __linux__
#include <sys/resource.h>
Expand Down
7 changes: 6 additions & 1 deletion map.c
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,11 @@ void mm_tbuf_destroy(mm_tbuf_t *b)
free(b);
}

void *mm_tbuf_get_km(mm_tbuf_t *b)
{
return b->km;
}

static int mm_dust_minier(void *km, int n, mm128_t *a, int l_seq, const char *seq, int sdust_thres)
{
int n_dreg, j, k, u = 0;
Expand Down Expand Up @@ -682,7 +687,7 @@ int mm_split_merge(int n_segs, const char **fn, const mm_mapopt_t *opt, int n_sp
for (pl.rid_shift[0] = 0, i = 1; i < n_split_idx; ++i)
pl.rid_shift[i] += pl.rid_shift[i - 1];
if (opt->flag & MM_F_OUT_SAM)
for (i = 0; i < pl.mi->n_seq; ++i)
for (i = 0; i < (int32_t)pl.mi->n_seq; ++i)
printf("@SQ\tSN:%s\tLN:%d\n", pl.mi->seq[i].name, pl.mi->seq[i].len);

kt_pipeline(2, worker_pipeline, &pl, 3);
Expand Down
18 changes: 18 additions & 0 deletions minimap.h
Original file line number Diff line number Diff line change
Expand Up @@ -301,6 +301,8 @@ mm_tbuf_t *mm_tbuf_init(void);
*/
void mm_tbuf_destroy(mm_tbuf_t *b);

void *mm_tbuf_get_km(mm_tbuf_t *b);

/**
* Align a query sequence against an index
*
Expand Down Expand Up @@ -337,6 +339,22 @@ int mm_map_file(const mm_idx_t *idx, const char *fn, const mm_mapopt_t *opt, int

int mm_map_file_frag(const mm_idx_t *idx, int n_segs, const char **fn, const mm_mapopt_t *opt, int n_threads);

/**
* Generate the cs tag (new in 2.12)
*
* @param km memory blocks; set to NULL if unsure
* @param buf buffer to write the cs/MD tag; typicall NULL on the first call
* @param max_len max length of the buffer; typically set to 0 on the first call
* @param mi index
* @param r alignment
* @param seq query sequence
* @param no_iden true to use : instead of =
*
* @return the length of cs
*/
int mm_gen_cs(void *km, char **buf, int *max_len, const mm_idx_t *mi, const mm_reg1_t *r, const char *seq, int no_iden);
int mm_gen_MD(void *km, char **buf, int *max_len, const mm_idx_t *mi, const mm_reg1_t *r, const char *seq);

// query sequence name and sequence in the minimap2 index
int mm_idx_index_name(mm_idx_t *mi);
int mm_idx_name2id(const mm_idx_t *mi, const char *name);
Expand Down
5 changes: 5 additions & 0 deletions options.c
Original file line number Diff line number Diff line change
Expand Up @@ -132,6 +132,11 @@ int mm_set_opt(const char *preset, mm_idxopt_t *io, mm_mapopt_t *mo)

int mm_check_opt(const mm_idxopt_t *io, const mm_mapopt_t *mo)
{
if (mo->split_prefix && (mo->flag & (MM_F_OUT_CS|MM_F_OUT_MD))) {
if (mm_verbose >= 1)
fprintf(stderr, "[ERROR]\033[1;31m --cs or --MD doesn't work with --split-prefix\033[0m\n");
return -6;
}
if (io->k <= 0 || io->w <= 0) {
if (mm_verbose >= 1)
fprintf(stderr, "[ERROR]\033[1;31m -k and -w must be positive\033[0m\n");
Expand Down
13 changes: 10 additions & 3 deletions python/README.rst
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,7 @@ The following Python script demonstrates the key functionality of mappy:
APIs
----

Mappy implements two classes and one global function.
Mappy implements two classes and two global function.

Class mappy.Aligner
~~~~~~~~~~~~~~~~~~~
Expand Down Expand Up @@ -83,13 +83,15 @@ This constructor accepts the following arguments:

.. code:: python
mappy.Aligner.map(seq, seq2=None)
mappy.Aligner.map(seq, seq2=None, cs=False, MD=False)
This method aligns :code:`seq` against the index. It is a generator, *yielding*
a series of :code:`mappy.Alignment` objects. If :code:`seq2` is present, mappy
performs paired-end alignment, assuming the two ends are in the FR orientation.
Alignments of the two ends can be distinguished by the :code:`read_num` field
(see Class mappy.Alignment below).
(see Class mappy.Alignment below). Argument :code:`cs` asks mappy to generate
the :code:`cs` tag; :code:`MD` is similar. These two arguments might slightly
degrade performance and are not enabled by default.

.. code:: python
Expand Down Expand Up @@ -139,6 +141,11 @@ properties:
* **cigar**: CIGAR returned as an array of shape :code:`(n_cigar,2)`. The two
numbers give the length and the operator of each CIGAR operation.

* **MD**: the :code:`MD` tag as in the SAM format. It is an empty string unless
the :code:`MD` argument is applied when calling :code:`mappy.Aligner.map()`.

* **cs**: the :code:`cs` tag.

An :code:`Alignment` object can be converted to a string with :code:`str()` in
the following format:

Expand Down
4 changes: 2 additions & 2 deletions python/cmappy.h
Original file line number Diff line number Diff line change
Expand Up @@ -126,8 +126,8 @@ static char *mappy_fetch_seq(const mm_idx_t *mi, const char *name, int st, int e
*len = 0;
rid = mm_idx_name2id(mi, name);
if (rid < 0) return 0;
if (st >= mi->seq[rid].len || st >= en) return 0;
if (en < 0 || en > mi->seq[rid].len)
if ((uint32_t)st >= mi->seq[rid].len || st >= en) return 0;
if (en < 0 || (uint32_t)en > mi->seq[rid].len)
en = mi->seq[rid].len;
s = (char*)malloc(en - st + 1);
*len = mm_idx_getseq(mi, rid, st, en, (uint8_t*)s);
Expand Down
4 changes: 4 additions & 0 deletions python/cmappy.pxd
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,7 @@ cdef extern from "minimap.h":
int32_t mid_occ
int32_t max_occ
int mini_batch_size
const char *split_prefix

int mm_set_opt(char *preset, mm_idxopt_t *io, mm_mapopt_t *mo)
int mm_verbose
Expand Down Expand Up @@ -86,6 +87,9 @@ cdef extern from "minimap.h":

mm_tbuf_t *mm_tbuf_init()
void mm_tbuf_destroy(mm_tbuf_t *b)
void *mm_tbuf_get_km(mm_tbuf_t *b)
int mm_gen_cs(void *km, char **buf, int *max_len, const mm_idx_t *mi, const mm_reg1_t *r, const char *seq, int no_iden)
int mm_gen_MD(void *km, char **buf, int *max_len, const mm_idx_t *mi, const mm_reg1_t *r, const char *seq)

#
# Helper header (because it is hard to expose mm_reg1_t with Cython)
Expand Down
39 changes: 31 additions & 8 deletions python/mappy.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -14,9 +14,9 @@ cdef class Alignment:
cdef int8_t _strand, _trans_strand
cdef uint8_t _mapq, _is_primary
cdef int _seg_id
cdef _ctg, _cigar # these are python objects
cdef _ctg, _cigar, _cs, _MD # these are python objects

def __cinit__(self, ctg, cl, cs, ce, strand, qs, qe, mapq, cigar, is_primary, mlen, blen, NM, trans_strand, seg_id):
def __cinit__(self, ctg, cl, cs, ce, strand, qs, qe, mapq, cigar, is_primary, mlen, blen, NM, trans_strand, seg_id, cs_str, MD_str):
self._ctg = ctg if isinstance(ctg, str) else ctg.decode()
self._ctg_len, self._r_st, self._r_en = cl, cs, ce
self._strand, self._q_st, self._q_en = strand, qs, qe
Expand All @@ -26,6 +26,8 @@ cdef class Alignment:
self._is_primary = is_primary
self._trans_strand = trans_strand
self._seg_id = seg_id
self._cs = cs_str
self._MD = MD_str

@property
def ctg(self): return self._ctg
Expand Down Expand Up @@ -72,6 +74,12 @@ cdef class Alignment:
@property
def read_num(self): return self._seg_id + 1

@property
def cs(self): return self._cs

@property
def MD(self): return self._MD

@property
def cigar_str(self):
return "".join(map(lambda x: str(x[0]) + 'MIDNSH'[x[1]], self._cigar))
Expand All @@ -85,8 +93,10 @@ cdef class Alignment:
if self._trans_strand > 0: ts = 'ts:A:+'
elif self._trans_strand < 0: ts = 'ts:A:-'
else: ts = 'ts:A:.'
return "\t".join([str(self._q_st), str(self._q_en), strand, self._ctg, str(self._ctg_len), str(self._r_st), str(self._r_en),
str(self._mlen), str(self._blen), str(self._mapq), tp, ts, "cg:Z:" + self.cigar_str])
a = [str(self._q_st), str(self._q_en), strand, self._ctg, str(self._ctg_len), str(self._r_st), str(self._r_en),
str(self._mlen), str(self._blen), str(self._mapq), tp, ts, "cg:Z:" + self.cigar_str]
if self._cs != "": a.append("cs:Z:" + self._cs)
return "\t".join(a)

cdef class ThreadBuffer:
cdef cmappy.mm_tbuf_t *_b
Expand Down Expand Up @@ -135,18 +145,23 @@ cdef class Aligner:
def __bool__(self):
return (self._idx != NULL)

def map(self, seq, seq2=None, buf=None, max_frag_len=None):
def map(self, seq, seq2=None, buf=None, cs=False, MD=False, max_frag_len=None):
cdef cmappy.mm_reg1_t *regs
cdef cmappy.mm_hitpy_t h
cdef ThreadBuffer b
cdef int n_regs
cdef char *cs_str = NULL
cdef int l_cs_str, m_cs_str = 0
cdef void *km
cdef cmappy.mm_mapopt_t map_opt

map_opt = self.map_opt
if max_frag_len is not None: map_opt.max_frag_len = max_frag_len

if self._idx is NULL: return None
if buf is None: b = ThreadBuffer()
else: b = buf
km = cmappy.mm_tbuf_get_km(b._b)

_seq = seq if isinstance(seq, bytes) else seq.encode()
if seq2 is None:
Expand All @@ -157,13 +172,21 @@ cdef class Aligner:

for i in range(n_regs):
cmappy.mm_reg2hitpy(self._idx, &regs[i], &h)
cigar = []
for k in range(h.n_cigar32):
cigar, _cs, _MD = [], '', ''
for k in range(h.n_cigar32): # convert the 32-bit CIGAR encoding to Python array
c = h.cigar32[k]
cigar.append([c>>4, c&0xf])
yield Alignment(h.ctg, h.ctg_len, h.ctg_start, h.ctg_end, h.strand, h.qry_start, h.qry_end, h.mapq, cigar, h.is_primary, h.mlen, h.blen, h.NM, h.trans_strand, h.seg_id)
if cs or MD: # generate the cs and/or the MD tag, if requested
if cs:
l_cs_str = cmappy.mm_gen_cs(km, &cs_str, &m_cs_str, self._idx, &regs[i], _seq, 1)
_cs = cs_str[:l_cs_str] if isinstance(cs_str, str) else cs_str[:l_cs_str].decode()
if MD:
l_cs_str = cmappy.mm_gen_MD(km, &cs_str, &m_cs_str, self._idx, &regs[i], _seq)
_MD = cs_str[:l_cs_str] if isinstance(cs_str, str) else cs_str[:l_cs_str].decode()
yield Alignment(h.ctg, h.ctg_len, h.ctg_start, h.ctg_end, h.strand, h.qry_start, h.qry_end, h.mapq, cigar, h.is_primary, h.mlen, h.blen, h.NM, h.trans_strand, h.seg_id, _cs, _MD)
cmappy.mm_free_reg1(&regs[i])
free(regs)
free(cs_str)

def seq(self, str name, int start=0, int end=0x7fffffff):
cdef int l
Expand Down
8 changes: 5 additions & 3 deletions python/minimap2.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
import mappy as mp

def main(argv):
opts, args = getopt.getopt(argv[1:], "x:n:m:k:w:r:")
opts, args = getopt.getopt(argv[1:], "x:n:m:k:w:r:c")
if len(args) < 2:
print("Usage: minimap2.py [options] <ref.fa>|<ref.mmi> <query.fq>")
print("Options:")
Expand All @@ -14,21 +14,23 @@ def main(argv):
print(" -k INT k-mer length")
print(" -w INT minimizer window length")
print(" -r INT band width")
print(" -c output the cs tag")
sys.exit(1)

preset, min_cnt, min_sc, k, w, bw = None, None, None, None, None, None
preset, min_cnt, min_sc, k, w, bw, out_cs = None, None, None, None, None, None, False
for opt, arg in opts:
if opt == '-x': preset = arg
elif opt == '-n': min_cnt = int(arg)
elif opt == '-m': min_chain_score = int(arg)
elif opt == '-r': bw = int(arg)
elif opt == '-k': k = int(arg)
elif opt == '-w': w = int(arg)
elif opt == '-c': out_cs = True

a = mp.Aligner(args[0], preset=preset, min_cnt=min_cnt, min_chain_score=min_sc, k=k, w=w, bw=bw)
if not a: raise Exception("ERROR: failed to load/build index file '{}'".format(args[0]))
for name, seq, qual in mp.fastx_read(args[1]): # read one sequence
for h in a.map(seq): # traverse hits
for h in a.map(seq, cs=out_cs): # traverse hits
print('{}\t{}\t{}'.format(name, len(seq), h))

if __name__ == "__main__":
Expand Down
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@ def readme():
ext_modules = [Extension('mappy',
sources = [module_src, 'align.c', 'bseq.c', 'chain.c', 'format.c', 'hit.c', 'index.c', 'pe.c', 'options.c',
'ksw2_extd2_sse.c', 'ksw2_exts2_sse.c', 'ksw2_extz2_sse.c', 'ksw2_ll_sse.c',
'kalloc.c', 'kthread.c', 'map.c', 'misc.c', 'sdust.c', 'sketch.c', 'esterr.c'],
'kalloc.c', 'kthread.c', 'map.c', 'misc.c', 'sdust.c', 'sketch.c', 'esterr.c', 'splitidx.c'],
depends = ['minimap.h', 'bseq.h', 'kalloc.h', 'kdq.h', 'khash.h', 'kseq.h', 'ksort.h',
'ksw2.h', 'kthread.h', 'kvec.h', 'mmpriv.h', 'sdust.h',
'python/cmappy.h', 'python/cmappy.pxd'],
Expand Down

0 comments on commit ff9917a

Please sign in to comment.