Skip to content

Commit

Permalink
bio/biosimd: unoptimized FillFastqRecordBodyFromNibbles
Browse files Browse the repository at this point in the history
Summary:
This adds a dedicated function for unpacking 4-bit base+qual data into
a FASTQ record body, with a basic implementation.  Next diff in this
series will add a SSE4.2 backend.

Ref T20646

Test Plan: Unit test included.

Reviewers: kshashidhar, ayip

Reviewed By: kshashidhar

Subscribers: ysaito, cchang

Maniphest Tasks: T20646

Differential Revision: https://phabricator.grailbio.com/D30430

fbshipit-source-id: 30b18d2
  • Loading branch information
Christopher Chang authored and Yaz Saito committed Jun 28, 2019
1 parent 6a6d98d commit 8e30180
Show file tree
Hide file tree
Showing 3 changed files with 216 additions and 0 deletions.
48 changes: 48 additions & 0 deletions biosimd/fastq_amd64.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
// Copyright 2019 GRAIL, Inc. All rights reserved.
// Use of this source code is governed by the Apache-2.0
// license that can be found in the LICENSE file.

// +build amd64,!appengine

package biosimd

// FillFastqRecordBodyFromNibbles fills the body (defined as the last three
// lines) of a 4-line FASTQ record, given a packed 4-bit representation of the
// base+qual information and the decoding tables.
// - dst must be a byte slice of exactly the right size to fit the three lines;
// the raw read-length is inferred as (len(dst)-4) >> 1. (Windows
// line-breaks are not supported.)
// - Length of src is validated
// - This is designed for read-length >= 32. It still produces the correct
// result for smaller lengths, but there is a fairly simple faster algorithm
// (using a pair of 256-element uint16 lookup tables and encoding/binary's
// binary.LittleEndian.PutUint16() function) for that case, which is being
// omitted for now due to irrelevance for our current use cases.
func FillFastqRecordBodyFromNibbles(dst, src []byte, baseTablePtr, qualTablePtr *NibbleLookupTable) {
readLen := (len(dst) - 4) >> 1
nSrcFullByte := readLen >> 1
srcOdd := readLen & 1
if len(src) != nSrcFullByte+srcOdd {
// 2x is due to each base appearing on both the base and qual lines.
// +4 is due to three line-feeds and one '+' character.
panic("FillFastqRecordBodyFromNibbles() requires len(src) = (<# of bases> + 1) / 2, and len(dst) = 2 * <# of bases> + 4.")
}
// TODO: fast path for readLen >= 32
quals := dst[readLen+3 : 2*readLen+3]
for srcPos := 0; srcPos != nSrcFullByte; srcPos++ {
srcByte := src[srcPos]
srcLowBits := srcByte & 15
dst[2*srcPos] = baseTablePtr.Get(srcLowBits)
quals[2*srcPos] = qualTablePtr.Get(srcLowBits)
srcHighBits := srcByte >> 4
dst[2*srcPos+1] = baseTablePtr.Get(srcHighBits)
quals[2*srcPos+1] = qualTablePtr.Get(srcHighBits)
}
if srcOdd == 1 {
srcLowBits := src[nSrcFullByte] & 15
dst[2*nSrcFullByte] = baseTablePtr.Get(srcLowBits)
quals[2*nSrcFullByte] = qualTablePtr.Get(srcLowBits)
}
copy(dst[readLen:readLen+3], "\n+\n")
dst[2*readLen+3] = '\n'
}
47 changes: 47 additions & 0 deletions biosimd/fastq_generic.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
// Copyright 2019 GRAIL, Inc. All rights reserved.
// Use of this source code is governed by the Apache-2.0
// license that can be found in the LICENSE file.

// +build !amd64 appengine

package biosimd

// FillFastqRecordBodyFromNibbles fills the body (defined as the last three
// lines) of a 4-line FASTQ record, given a packed 4-bit representation of the
// base+qual information and the decoding tables.
// - dst must be a byte slice of exactly the right size to fit the three lines;
// the raw read-length is inferred as (len(dst)-4) >> 1. (Windows
// line-breaks are not supported.)
// - Length of src is validated
// - This is designed for read-length >= 32. It still produces the correct
// result for smaller lengths, but there is a fairly simple faster algorithm
// (using a pair of 256-element uint16 lookup tables and encoding/binary's
// binary.LittleEndian.PutUint16() function) for that case, which is being
// omitted for now due to irrelevance for our current use cases.
func FillFastqRecordBodyFromNibbles(dst, src []byte, baseTablePtr, qualTablePtr *NibbleLookupTable) {
readLen := (len(dst) - 4) >> 1
nSrcFullByte := readLen >> 1
srcOdd := readLen & 1
if len(src) != nSrcFullByte+srcOdd {
// 2x is due to each base appearing on both the base and qual lines.
// +4 is due to three line-feeds and one '+' character.
panic("FillFastqRecordBodyFromNibbles() requires len(src) = (<# of bases> + 1) / 2, and len(dst) = 2 * <# of bases> + 4.")
}
quals := dst[readLen+3 : 2*readLen+3]
for srcPos := 0; srcPos != nSrcFullByte; srcPos++ {
srcByte := src[srcPos]
srcLowBits := srcByte & 15
dst[2*srcPos] = baseTablePtr.Get(srcLowBits)
quals[2*srcPos] = qualTablePtr.Get(srcLowBits)
srcHighBits := srcByte >> 4
dst[2*srcPos+1] = baseTablePtr.Get(srcHighBits)
quals[2*srcPos+1] = qualTablePtr.Get(srcHighBits)
}
if srcOdd == 1 {
srcLowBits := src[nSrcFullByte] & 15
dst[2*nSrcFullByte] = baseTablePtr.Get(srcLowBits)
quals[2*nSrcFullByte] = qualTablePtr.Get(srcLowBits)
}
copy(dst[readLen:readLen+3], "\n+\n")
dst[2*readLen+3] = '\n'
}
121 changes: 121 additions & 0 deletions biosimd/fastq_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,121 @@
// Copyright 2018 GRAIL, Inc. All rights reserved.
// Use of this source code is governed by the Apache-2.0
// license that can be found in the LICENSE file.

package biosimd_test

import (
"fmt"
"testing"

"github.com/grailbio/base/simd"
"github.com/grailbio/bio/biosimd"
"github.com/grailbio/testutil/assert"
)

var baseTable = simd.MakeNibbleLookupTable([16]byte{
'N', '?', '?', '?', 'A', 'C', 'G', 'T', 'A', 'C', 'G', 'T', 'A', 'C', 'G', 'T'})
var qualTable = simd.MakeNibbleLookupTable([16]byte{
'#', '#', '#', '#', ',', ',', ',', ',', ':', ':', ':', ':', 'F', 'F', 'F', 'F'})

var asciiToBaseBitsTable = [...]byte{
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 1, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 1, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}

// Not optimized. Might want to add an optimized version of this to the main
// package later.
func packAsciiBasesAndQuals(bases, quals, qualEncoding []byte) (dst []byte, err error) {
nBase := len(bases)
if nBase != len(quals) {
err = fmt.Errorf("packAsciiBasesAndQuals: inconsistent len(bases) and len(quals)")
return
}
if len(qualEncoding) != 4 {
err = fmt.Errorf("packAsciiBasesAndQuals: unexpected len(qualEncoding)")
return
}

var asciiToBase16Qual [256]byte
for i := range asciiToBase16Qual {
// Use this value to indicate "unexpected qual".
asciiToBase16Qual[i] = 255
}
for i, q := range qualEncoding {
// left-shift 2 since qual is stored in the high 2 bits of each nibble
asciiToBase16Qual[q+33] = byte(i << 2)
}

dst = make([]byte, (nBase+1)/2)
for i := 0; i != nBase; i++ {
packedBaseAndQual := asciiToBase16Qual[quals[i]]
if packedBaseAndQual == 255 {
err = fmt.Errorf("packAsciiBasesAndQuals: unexpected qual value")
return
}
packedBaseAndQual |= asciiToBaseBitsTable[bases[i]]
if i&1 == 0 {
dst[i/2] = packedBaseAndQual
} else {
dst[i/2] |= packedBaseAndQual << 4
}
}
return
}

func TestFillFastqRecordBody(t *testing.T) {
qualEncoding := []byte{2, 11, 25, 37}
tests := []struct {
baseAscii string
qualAscii string
}{
{
baseAscii: "",
qualAscii: "",
},
{
baseAscii: "G",
qualAscii: ",",
},
{
baseAscii: "N",
qualAscii: "#",
},
{
baseAscii: "ACACNGGAGAGCTTTTTTACA",
qualAscii: ",,,F#FFFFFFFF::FFFFF,",
},
{
baseAscii: "CAAAAAAAAAAAAAAAAAAAAAAAAAAAATTTAGGATTATAGGCCCCC",
qualAscii: "FFFFFFFFFFFFFFFFFFFFF,FFFFFF:FFFFFFFFFFFFFFF,,FF",
},
{
baseAscii: "AGAGTGACTCGACTACTCACCGCGAACAAAAAAAAAAAATACATAGCATCGAGCAGCTACGACACGATCGATCGATCGACTAGTCAGTCGACTCGACTGGGGTGCTAAAAAAAAAAAAAAAAAGATGTCAGCATCGATCGCGGGGGAACCC",
qualAscii: ",,,,,,,,,,,,,,,,,,,FFFFFFFFFFFFFFFFFFFFF::::::::::::::::::F:FFF:F:FFFFFFFFFFFFFFFFFFFFF:FFFFFFFFFFFFFFFFFFFFFFFFFFF::FFFFFFFFFFF:FFFFFFFFFFF:FFFFFFFFFF",
},
}
for _, tc := range tests {
b16, err := packAsciiBasesAndQuals([]byte(tc.baseAscii), []byte(tc.qualAscii), qualEncoding)
if err != nil {
t.Fatalf("packAsciiBasesAndQuals error: %v", err)
}
readLen := len(tc.baseAscii)
dst := make([]byte, 2*readLen+4)
biosimd.FillFastqRecordBodyFromNibbles(dst, b16, &baseTable, &qualTable)
assert.EQ(t, dst[:readLen], []byte(tc.baseAscii))
assert.EQ(t, dst[readLen+3:2*readLen+3], []byte(tc.qualAscii))
}
}

0 comments on commit 8e30180

Please sign in to comment.