forked from grailbio/bio
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
bio/biosimd: unoptimized FillFastqRecordBodyFromNibbles
Summary: This adds a dedicated function for unpacking 4-bit base+qual data into a FASTQ record body, with a basic implementation. Next diff in this series will add a SSE4.2 backend. Ref T20646 Test Plan: Unit test included. Reviewers: kshashidhar, ayip Reviewed By: kshashidhar Subscribers: ysaito, cchang Maniphest Tasks: T20646 Differential Revision: https://phabricator.grailbio.com/D30430 fbshipit-source-id: 30b18d2
- Loading branch information
Christopher Chang
authored and
Yaz Saito
committed
Jun 28, 2019
1 parent
6a6d98d
commit 8e30180
Showing
3 changed files
with
216 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,48 @@ | ||
// Copyright 2019 GRAIL, Inc. All rights reserved. | ||
// Use of this source code is governed by the Apache-2.0 | ||
// license that can be found in the LICENSE file. | ||
|
||
// +build amd64,!appengine | ||
|
||
package biosimd | ||
|
||
// FillFastqRecordBodyFromNibbles fills the body (defined as the last three | ||
// lines) of a 4-line FASTQ record, given a packed 4-bit representation of the | ||
// base+qual information and the decoding tables. | ||
// - dst must be a byte slice of exactly the right size to fit the three lines; | ||
// the raw read-length is inferred as (len(dst)-4) >> 1. (Windows | ||
// line-breaks are not supported.) | ||
// - Length of src is validated | ||
// - This is designed for read-length >= 32. It still produces the correct | ||
// result for smaller lengths, but there is a fairly simple faster algorithm | ||
// (using a pair of 256-element uint16 lookup tables and encoding/binary's | ||
// binary.LittleEndian.PutUint16() function) for that case, which is being | ||
// omitted for now due to irrelevance for our current use cases. | ||
func FillFastqRecordBodyFromNibbles(dst, src []byte, baseTablePtr, qualTablePtr *NibbleLookupTable) { | ||
readLen := (len(dst) - 4) >> 1 | ||
nSrcFullByte := readLen >> 1 | ||
srcOdd := readLen & 1 | ||
if len(src) != nSrcFullByte+srcOdd { | ||
// 2x is due to each base appearing on both the base and qual lines. | ||
// +4 is due to three line-feeds and one '+' character. | ||
panic("FillFastqRecordBodyFromNibbles() requires len(src) = (<# of bases> + 1) / 2, and len(dst) = 2 * <# of bases> + 4.") | ||
} | ||
// TODO: fast path for readLen >= 32 | ||
quals := dst[readLen+3 : 2*readLen+3] | ||
for srcPos := 0; srcPos != nSrcFullByte; srcPos++ { | ||
srcByte := src[srcPos] | ||
srcLowBits := srcByte & 15 | ||
dst[2*srcPos] = baseTablePtr.Get(srcLowBits) | ||
quals[2*srcPos] = qualTablePtr.Get(srcLowBits) | ||
srcHighBits := srcByte >> 4 | ||
dst[2*srcPos+1] = baseTablePtr.Get(srcHighBits) | ||
quals[2*srcPos+1] = qualTablePtr.Get(srcHighBits) | ||
} | ||
if srcOdd == 1 { | ||
srcLowBits := src[nSrcFullByte] & 15 | ||
dst[2*nSrcFullByte] = baseTablePtr.Get(srcLowBits) | ||
quals[2*nSrcFullByte] = qualTablePtr.Get(srcLowBits) | ||
} | ||
copy(dst[readLen:readLen+3], "\n+\n") | ||
dst[2*readLen+3] = '\n' | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,47 @@ | ||
// Copyright 2019 GRAIL, Inc. All rights reserved. | ||
// Use of this source code is governed by the Apache-2.0 | ||
// license that can be found in the LICENSE file. | ||
|
||
// +build !amd64 appengine | ||
|
||
package biosimd | ||
|
||
// FillFastqRecordBodyFromNibbles fills the body (defined as the last three | ||
// lines) of a 4-line FASTQ record, given a packed 4-bit representation of the | ||
// base+qual information and the decoding tables. | ||
// - dst must be a byte slice of exactly the right size to fit the three lines; | ||
// the raw read-length is inferred as (len(dst)-4) >> 1. (Windows | ||
// line-breaks are not supported.) | ||
// - Length of src is validated | ||
// - This is designed for read-length >= 32. It still produces the correct | ||
// result for smaller lengths, but there is a fairly simple faster algorithm | ||
// (using a pair of 256-element uint16 lookup tables and encoding/binary's | ||
// binary.LittleEndian.PutUint16() function) for that case, which is being | ||
// omitted for now due to irrelevance for our current use cases. | ||
func FillFastqRecordBodyFromNibbles(dst, src []byte, baseTablePtr, qualTablePtr *NibbleLookupTable) { | ||
readLen := (len(dst) - 4) >> 1 | ||
nSrcFullByte := readLen >> 1 | ||
srcOdd := readLen & 1 | ||
if len(src) != nSrcFullByte+srcOdd { | ||
// 2x is due to each base appearing on both the base and qual lines. | ||
// +4 is due to three line-feeds and one '+' character. | ||
panic("FillFastqRecordBodyFromNibbles() requires len(src) = (<# of bases> + 1) / 2, and len(dst) = 2 * <# of bases> + 4.") | ||
} | ||
quals := dst[readLen+3 : 2*readLen+3] | ||
for srcPos := 0; srcPos != nSrcFullByte; srcPos++ { | ||
srcByte := src[srcPos] | ||
srcLowBits := srcByte & 15 | ||
dst[2*srcPos] = baseTablePtr.Get(srcLowBits) | ||
quals[2*srcPos] = qualTablePtr.Get(srcLowBits) | ||
srcHighBits := srcByte >> 4 | ||
dst[2*srcPos+1] = baseTablePtr.Get(srcHighBits) | ||
quals[2*srcPos+1] = qualTablePtr.Get(srcHighBits) | ||
} | ||
if srcOdd == 1 { | ||
srcLowBits := src[nSrcFullByte] & 15 | ||
dst[2*nSrcFullByte] = baseTablePtr.Get(srcLowBits) | ||
quals[2*nSrcFullByte] = qualTablePtr.Get(srcLowBits) | ||
} | ||
copy(dst[readLen:readLen+3], "\n+\n") | ||
dst[2*readLen+3] = '\n' | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,121 @@ | ||
// Copyright 2018 GRAIL, Inc. All rights reserved. | ||
// Use of this source code is governed by the Apache-2.0 | ||
// license that can be found in the LICENSE file. | ||
|
||
package biosimd_test | ||
|
||
import ( | ||
"fmt" | ||
"testing" | ||
|
||
"github.com/grailbio/base/simd" | ||
"github.com/grailbio/bio/biosimd" | ||
"github.com/grailbio/testutil/assert" | ||
) | ||
|
||
var baseTable = simd.MakeNibbleLookupTable([16]byte{ | ||
'N', '?', '?', '?', 'A', 'C', 'G', 'T', 'A', 'C', 'G', 'T', 'A', 'C', 'G', 'T'}) | ||
var qualTable = simd.MakeNibbleLookupTable([16]byte{ | ||
'#', '#', '#', '#', ',', ',', ',', ',', ':', ':', ':', ':', 'F', 'F', 'F', 'F'}) | ||
|
||
var asciiToBaseBitsTable = [...]byte{ | ||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, | ||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, | ||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, | ||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, | ||
0, 0, 0, 1, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, | ||
0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, | ||
0, 0, 0, 1, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, | ||
0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, | ||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, | ||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, | ||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, | ||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, | ||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, | ||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, | ||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, | ||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0} | ||
|
||
// Not optimized. Might want to add an optimized version of this to the main | ||
// package later. | ||
func packAsciiBasesAndQuals(bases, quals, qualEncoding []byte) (dst []byte, err error) { | ||
nBase := len(bases) | ||
if nBase != len(quals) { | ||
err = fmt.Errorf("packAsciiBasesAndQuals: inconsistent len(bases) and len(quals)") | ||
return | ||
} | ||
if len(qualEncoding) != 4 { | ||
err = fmt.Errorf("packAsciiBasesAndQuals: unexpected len(qualEncoding)") | ||
return | ||
} | ||
|
||
var asciiToBase16Qual [256]byte | ||
for i := range asciiToBase16Qual { | ||
// Use this value to indicate "unexpected qual". | ||
asciiToBase16Qual[i] = 255 | ||
} | ||
for i, q := range qualEncoding { | ||
// left-shift 2 since qual is stored in the high 2 bits of each nibble | ||
asciiToBase16Qual[q+33] = byte(i << 2) | ||
} | ||
|
||
dst = make([]byte, (nBase+1)/2) | ||
for i := 0; i != nBase; i++ { | ||
packedBaseAndQual := asciiToBase16Qual[quals[i]] | ||
if packedBaseAndQual == 255 { | ||
err = fmt.Errorf("packAsciiBasesAndQuals: unexpected qual value") | ||
return | ||
} | ||
packedBaseAndQual |= asciiToBaseBitsTable[bases[i]] | ||
if i&1 == 0 { | ||
dst[i/2] = packedBaseAndQual | ||
} else { | ||
dst[i/2] |= packedBaseAndQual << 4 | ||
} | ||
} | ||
return | ||
} | ||
|
||
func TestFillFastqRecordBody(t *testing.T) { | ||
qualEncoding := []byte{2, 11, 25, 37} | ||
tests := []struct { | ||
baseAscii string | ||
qualAscii string | ||
}{ | ||
{ | ||
baseAscii: "", | ||
qualAscii: "", | ||
}, | ||
{ | ||
baseAscii: "G", | ||
qualAscii: ",", | ||
}, | ||
{ | ||
baseAscii: "N", | ||
qualAscii: "#", | ||
}, | ||
{ | ||
baseAscii: "ACACNGGAGAGCTTTTTTACA", | ||
qualAscii: ",,,F#FFFFFFFF::FFFFF,", | ||
}, | ||
{ | ||
baseAscii: "CAAAAAAAAAAAAAAAAAAAAAAAAAAAATTTAGGATTATAGGCCCCC", | ||
qualAscii: "FFFFFFFFFFFFFFFFFFFFF,FFFFFF:FFFFFFFFFFFFFFF,,FF", | ||
}, | ||
{ | ||
baseAscii: "AGAGTGACTCGACTACTCACCGCGAACAAAAAAAAAAAATACATAGCATCGAGCAGCTACGACACGATCGATCGATCGACTAGTCAGTCGACTCGACTGGGGTGCTAAAAAAAAAAAAAAAAAGATGTCAGCATCGATCGCGGGGGAACCC", | ||
qualAscii: ",,,,,,,,,,,,,,,,,,,FFFFFFFFFFFFFFFFFFFFF::::::::::::::::::F:FFF:F:FFFFFFFFFFFFFFFFFFFFF:FFFFFFFFFFFFFFFFFFFFFFFFFFF::FFFFFFFFFFF:FFFFFFFFFFF:FFFFFFFFFF", | ||
}, | ||
} | ||
for _, tc := range tests { | ||
b16, err := packAsciiBasesAndQuals([]byte(tc.baseAscii), []byte(tc.qualAscii), qualEncoding) | ||
if err != nil { | ||
t.Fatalf("packAsciiBasesAndQuals error: %v", err) | ||
} | ||
readLen := len(tc.baseAscii) | ||
dst := make([]byte, 2*readLen+4) | ||
biosimd.FillFastqRecordBodyFromNibbles(dst, b16, &baseTable, &qualTable) | ||
assert.EQ(t, dst[:readLen], []byte(tc.baseAscii)) | ||
assert.EQ(t, dst[readLen+3:2*readLen+3], []byte(tc.qualAscii)) | ||
} | ||
} |