Skip to content

Commit

Permalink
Add fold_cache for cache output
Browse files Browse the repository at this point in the history
- having a cache to check combinations of i,j input is useful in the context of creating primers for PCR
  • Loading branch information
jjti committed Jan 22, 2020
1 parent 83b63f3 commit 657bf1f
Show file tree
Hide file tree
Showing 6 changed files with 85 additions and 59 deletions.
2 changes: 1 addition & 1 deletion Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ minor: test
python3 setup.py sdist bdist_wheel
python3 -m twine upload dist/* --skip-existing

examples:
examples: install
python3 ./tests/fold_examples.py

profile: install
Expand Down
8 changes: 4 additions & 4 deletions examples/dna.csv
Original file line number Diff line number Diff line change
@@ -1,9 +1,9 @@
seqfold,UNAFold,seq
-15.2,-10.94,GGGAGGTCGTTACATCTGGGTAACACCGGTACTGATCCGGTGACCTCCC
-15.2,-10.9,GGGAGGTCGTTACATCTGGGTAACACCGGTACTGATCCGGTGACCTCCC
-24.8,-23.4,GGGAGGTCGCTCCAGCTGGGAGGAGCGTTGGGGGTATATACCCCCAACACCGGTACTGATCCGGTGACCTCCC
-7.5,-3.8,CGCAGGGAUACCCGCG
-3.5,-6.85,TAGCTCAGCTGGGAGAGCGCCTGCTTTGCACGCAGGAGGT
-3.5,-6.9,TAGCTCAGCTGGGAGAGCGCCTGCTTTGCACGCAGGAGGT
-18.0,-15.5,GGGGGCATAGCTCAGCTGGGAGAGCGCCTGCTTTGCACGCAGGAGGTCTGCGGTTCGATCCCGCGCGCTCCCACCA
-19.0,-18.1,TGAGACGGAAGGGGATGATTGTCCCCTTCCGTCTCA
-3.0,-3.65,ACCCCCTCCTTCCTTGGATCAAGGGGCTCAA
-9.2,-9.35,TGTCAGAAGTTTCCAAATGGCCAGCAATCAACCCATTCCATTGGGGATACAATGGTACAGTTTCGCATATTGTCGGTGAAAATGGTTCCATTAAACTCC
-3.0,-3.7,ACCCCCTCCTTCCTTGGATCAAGGGGCTCAA
-9.2,-9.4,TGTCAGAAGTTTCCAAATGGCCAGCAATCAACCCATTCCATTGGGGATACAATGGTACAGTTTCGCATATTGTCGGTGAAAATGGTTCCATTAAACTCC
2 changes: 1 addition & 1 deletion seqfold/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,4 +10,4 @@
finally:
del get_distribution, DistributionNotFound

from .fold import calc_dg, fold, Cache, Struct
from .fold import fold, fold_cache, calc_dg, traceback, Cache, Struct
96 changes: 59 additions & 37 deletions seqfold/fold.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,21 +44,50 @@ def with_ij(self, ij: List[Tuple[int, int]]):
"""A map from i, j tuple to a value."""


def calc_dg(seq: str, temp: float = 37.0) -> float:
"""Fold the sequence and return just the delta G of the structure
def fold_cache(seq: str, temp: float = 37.0) -> Tuple[Cache, Cache]:
"""Fold a nucleic acid sequence and return the w_cache.
The Cache is useful for gathering many possible energies
between a series of (i,j) combinations.
Args:
seq: The sequence to fold
Keyword Args:
temp: The temperature to fold at
Returns:
float: The minimum free energy of the folded sequence
(Cache, Cache): The w_cache and the v_cache for traversal later
"""

structs = fold(seq, temp)
return round(sum(s.e for s in structs), 2)
seq = seq.upper()
temp = temp + 273.15 # kelvin

# figure out whether it's DNA or RNA, choose energy map
dna = True
bps = set(seq)
if "U" in bps and "T" in bps:
raise RuntimeError(
"Both T and U in sequence. Provide one or the other for DNA OR RNA."
)
if all(bp in "AUCG" for bp in bps):
dna = False
elif any(bp not in "ATGC" for bp in bps):
diff = [bp for bp in bps if bp not in "ATUGC"]
raise RuntimeError(f"Unknown bp: {diff}. Only DNA/RNA foldable")
emap = DNA_ENERGIES if dna else RNA_ENERGIES

n = len(seq)
v_cache: Cache = []
w_cache: Cache = []
for _ in range(n):
v_cache.append([STRUCT_DEFAULT] * n)
w_cache.append([STRUCT_DEFAULT] * n)

# fill the cache
_w(seq, 0, n - 1, temp, v_cache, w_cache, emap)

return v_cache, w_cache


def fold(seq: str, temp: float = 37.0) -> List[Struct]:
Expand All @@ -83,35 +112,28 @@ def fold(seq: str, temp: float = 37.0) -> List[Struct]:
List[Struct]: A list of structures. Stacks, bulges, hairpins, etc.
"""

seq = seq.upper()
temp = temp + 273.15 # kelvin
v_cache, w_cache = fold_cache(seq, temp)
n = len(seq)

# figure out whether it's DNA or RNA, choose energy map
dna = True
bps = set(seq)
if "U" in bps and "T" in bps:
raise RuntimeError(
"Both T and U in sequence. Provide one or the other for DNA OR RNA."
)
if all(bp in "AUCG" for bp in bps):
dna = False
elif any(bp not in "ATGC" for bp in bps):
diff = [bp for bp in bps if bp not in "ATUGC"]
raise RuntimeError(f"Unknown bp: {diff}. Only DNA/RNA foldable")
emap = DNA_ENERGIES if dna else RNA_ENERGIES
# get the minimum free energy structure out of the cache
return traceback(0, n - 1, v_cache, w_cache)

n = len(seq)
v_cache: Cache = []
w_cache: Cache = []
for _ in range(n):
v_cache.append([STRUCT_DEFAULT] * n)
w_cache.append([STRUCT_DEFAULT] * n)

# fill the cache
_w(seq, 0, n - 1, temp, v_cache, w_cache, emap)
def calc_dg(seq: str, temp: float = 37.0) -> float:
"""Fold the sequence and return just the delta G of the structure
Args:
seq: The sequence to fold
Keyword Args:
temp: The temperature to fold at
# get the structure out of the cache
return _traceback(0, n - 1, v_cache, w_cache)
Returns:
float: The minimum free energy of the folded sequence
"""

structs = fold(seq, temp)
return round(sum(s.e for s in structs), 2)


def _w(
Expand Down Expand Up @@ -738,7 +760,7 @@ def add_branch(s: Struct):
return Struct(e, f"BIFURCATION:{str(unpaired)}n/{str(branches_count)}h", branches)


def _traceback(i: int, j: int, v_cache: Cache, w_cache: Cache) -> List[Struct]:
def traceback(i: int, j: int, v_cache: Cache, w_cache: Cache) -> List[Struct]:
"""Traceback thru the V(i,j) and W(i,j) caches to find the structure
For each step, get to the lowest energy W(i,j) within that block
Expand All @@ -750,7 +772,7 @@ def _traceback(i: int, j: int, v_cache: Cache, w_cache: Cache) -> List[Struct]:
Args:
i: The leftmost index to start searching in
j: The rightmost index to start searching in
v_cache: Energies/structures where i and j bond
v_cache: Energies where i and j bond
w_cache: Energies/sub-structures between or with i and j
Returns:
Expand Down Expand Up @@ -786,11 +808,11 @@ def _traceback(i: int, j: int, v_cache: Cache, w_cache: Cache) -> List[Struct]:
structs = _trackback_energy(structs)
branches: List[Struct] = []
for i1, j1 in s.ij:
traceback = _traceback(i1, j1, v_cache, w_cache)
if traceback and traceback[0].ij:
i2, j2 = traceback[0].ij[0]
tb = traceback(i1, j1, v_cache, w_cache)
if tb and tb[0].ij:
i2, j2 = tb[0].ij[0]
e_sum += w_cache[i2][j2].e
branches += traceback
branches += tb

last = structs[-1]
structs[-1] = Struct(round(last.e - e_sum, 1), last.desc, list(last.ij))
Expand Down
12 changes: 6 additions & 6 deletions tests/fold_examples.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,14 +9,14 @@
DIR = path.dirname(path.realpath(__file__))

DNA_EXAMPLES = {
"GGGAGGTCGTTACATCTGGGTAACACCGGTACTGATCCGGTGACCTCCC": -10.94, # three branched structure
"GGGAGGTCGTTACATCTGGGTAACACCGGTACTGATCCGGTGACCTCCC": -10.9, # three branched structure
"GGGAGGTCGCTCCAGCTGGGAGGAGCGTTGGGGGTATATACCCCCAACACCGGTACTGATCCGGTGACCTCCC": -23.4, # four branched structure
"CGCAGGGAUACCCGCG": -3.8,
"TAGCTCAGCTGGGAGAGCGCCTGCTTTGCACGCAGGAGGT": -6.85,
"GGGGGCATAGCTCAGCTGGGAGAGCGCCTGCTTTGCACGCAGGAGGTCTGCGGTTCGATCCCGCGCGCTCCCACCA": -15.50,
"TGAGACGGAAGGGGATGATTGTCCCCTTCCGTCTCA": -18.10,
"ACCCCCTCCTTCCTTGGATCAAGGGGCTCAA": -3.65,
"TGTCAGAAGTTTCCAAATGGCCAGCAATCAACCCATTCCATTGGGGATACAATGGTACAGTTTCGCATATTGTCGGTGAAAATGGTTCCATTAAACTCC": -9.35,
"TAGCTCAGCTGGGAGAGCGCCTGCTTTGCACGCAGGAGGT": -6.9,
"GGGGGCATAGCTCAGCTGGGAGAGCGCCTGCTTTGCACGCAGGAGGTCTGCGGTTCGATCCCGCGCGCTCCCACCA": -15.5,
"TGAGACGGAAGGGGATGATTGTCCCCTTCCGTCTCA": -18.1,
"ACCCCCTCCTTCCTTGGATCAAGGGGCTCAA": -3.7,
"TGTCAGAAGTTTCCAAATGGCCAGCAATCAACCCATTCCATTGGGGATACAATGGTACAGTTTCGCATATTGTCGGTGAAAATGGTTCCATTAAACTCC": -9.4,
}

# writing results to examples for comparison
Expand Down
24 changes: 14 additions & 10 deletions tests/fold_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,30 +4,27 @@
from time import time
import unittest

from seqfold import calc_dg
from seqfold import calc_dg, fold, fold_cache, traceback, Cache, Struct
from seqfold.dna import DNA_ENERGIES
from seqfold.rna import RNA_ENERGIES
from seqfold.fold import (
fold,
STRUCT_DEFAULT,
Cache,
_bulge,
_stack,
_hairpin,
_internal_loop,
_pair,
_w,
_traceback,
)


class TestFold(unittest.TestCase):
"""Test folding functions"""

def test_fold(self):
"""Test fold function."""
"""Fold function."""

# it should throw if a non-sense sequence is provided
# it should throw if a nonsense sequence is provided
with self.assertRaises(RuntimeError):
calc_dg("EASFEASFAST", 37.0)

Expand All @@ -38,6 +35,17 @@ def test_fold(self):
# should not throw
calc_dg("ATGGATTTAGATAGAT")

def test_fold_cache(self):
"""Gather a cache of the folded structure."""

seq = "ATGGATTTAGATAGAT"
v_cache, w_cache = fold_cache(seq)
seq_dg = calc_dg(seq)

self.assertEqual(
seq_dg, sum(s.e for s in traceback(0, len(seq) - 1, v_cache, w_cache))
)

def test_fold_dna(self):
"""DNA folding to find min energy secondary structure."""

Expand Down Expand Up @@ -88,7 +96,6 @@ def test_multibranch(self):
seq = "GGGAGGTCGTTACATCTGGGTAACACCGGTACTGATCCGGTGACCTCCC" # three branch

structs = fold(seq)

self.assertTrue(
any("BIFURCATION" in s.desc and (7, 41) in s.ij for s in structs)
)
Expand All @@ -97,7 +104,6 @@ def test_pair(self):
"""Create a pair for stack checking."""

seq = "ATGGAATAGTG"

self.assertEqual(_pair(seq, 0, 1, 9, 10), "AT/TG")

def test_stack(self):
Expand Down Expand Up @@ -169,7 +175,6 @@ def test_w(self):
v_cache.append([STRUCT_DEFAULT] * len(seq))
w_cache.append([STRUCT_DEFAULT] * len(seq))
struct = _w(seq, i, j, temp, v_cache, w_cache, RNA_ENERGIES)

self.assertAlmostEqual(struct.e, -3.8, delta=0.2)

seq = "CCUGCUUUGCACGCAGG"
Expand All @@ -182,7 +187,6 @@ def test_w(self):
v_cache.append([STRUCT_DEFAULT] * len(seq))
w_cache.append([STRUCT_DEFAULT] * len(seq))
struct = _w(seq, i, j, temp, v_cache, w_cache, RNA_ENERGIES)

self.assertAlmostEqual(struct.e, -6.4, delta=0.2)

seq = "GCGGUUCGAUCCCGC"
Expand Down

0 comments on commit 657bf1f

Please sign in to comment.