From 657bf1f6c3fdb65bb9ad68d3aa0e59ce8a61ce75 Mon Sep 17 00:00:00 2001 From: JJTimmons Date: Wed, 22 Jan 2020 14:07:32 -0500 Subject: [PATCH] Add fold_cache for cache output - having a cache to check combinations of i,j input is useful in the context of creating primers for PCR --- Makefile | 2 +- examples/dna.csv | 8 ++-- seqfold/__init__.py | 2 +- seqfold/fold.py | 96 ++++++++++++++++++++++++++---------------- tests/fold_examples.py | 12 +++--- tests/fold_test.py | 24 ++++++----- 6 files changed, 85 insertions(+), 59 deletions(-) diff --git a/Makefile b/Makefile index ffbc082..1215c68 100644 --- a/Makefile +++ b/Makefile @@ -20,7 +20,7 @@ minor: test python3 setup.py sdist bdist_wheel python3 -m twine upload dist/* --skip-existing -examples: +examples: install python3 ./tests/fold_examples.py profile: install diff --git a/examples/dna.csv b/examples/dna.csv index 773c7f6..aec233d 100644 --- a/examples/dna.csv +++ b/examples/dna.csv @@ -1,9 +1,9 @@ seqfold,UNAFold,seq --15.2,-10.94,GGGAGGTCGTTACATCTGGGTAACACCGGTACTGATCCGGTGACCTCCC +-15.2,-10.9,GGGAGGTCGTTACATCTGGGTAACACCGGTACTGATCCGGTGACCTCCC -24.8,-23.4,GGGAGGTCGCTCCAGCTGGGAGGAGCGTTGGGGGTATATACCCCCAACACCGGTACTGATCCGGTGACCTCCC -7.5,-3.8,CGCAGGGAUACCCGCG --3.5,-6.85,TAGCTCAGCTGGGAGAGCGCCTGCTTTGCACGCAGGAGGT +-3.5,-6.9,TAGCTCAGCTGGGAGAGCGCCTGCTTTGCACGCAGGAGGT -18.0,-15.5,GGGGGCATAGCTCAGCTGGGAGAGCGCCTGCTTTGCACGCAGGAGGTCTGCGGTTCGATCCCGCGCGCTCCCACCA -19.0,-18.1,TGAGACGGAAGGGGATGATTGTCCCCTTCCGTCTCA --3.0,-3.65,ACCCCCTCCTTCCTTGGATCAAGGGGCTCAA --9.2,-9.35,TGTCAGAAGTTTCCAAATGGCCAGCAATCAACCCATTCCATTGGGGATACAATGGTACAGTTTCGCATATTGTCGGTGAAAATGGTTCCATTAAACTCC +-3.0,-3.7,ACCCCCTCCTTCCTTGGATCAAGGGGCTCAA +-9.2,-9.4,TGTCAGAAGTTTCCAAATGGCCAGCAATCAACCCATTCCATTGGGGATACAATGGTACAGTTTCGCATATTGTCGGTGAAAATGGTTCCATTAAACTCC diff --git a/seqfold/__init__.py b/seqfold/__init__.py index 36059cd..510b9a8 100644 --- a/seqfold/__init__.py +++ b/seqfold/__init__.py @@ -10,4 +10,4 @@ finally: del get_distribution, DistributionNotFound -from .fold import calc_dg, fold, Cache, Struct +from .fold import fold, fold_cache, calc_dg, traceback, Cache, Struct diff --git a/seqfold/fold.py b/seqfold/fold.py index ac6ea7d..e7e61d8 100644 --- a/seqfold/fold.py +++ b/seqfold/fold.py @@ -44,21 +44,50 @@ def with_ij(self, ij: List[Tuple[int, int]]): """A map from i, j tuple to a value.""" -def calc_dg(seq: str, temp: float = 37.0) -> float: - """Fold the sequence and return just the delta G of the structure +def fold_cache(seq: str, temp: float = 37.0) -> Tuple[Cache, Cache]: + """Fold a nucleic acid sequence and return the w_cache. + + The Cache is useful for gathering many possible energies + between a series of (i,j) combinations. Args: seq: The sequence to fold Keyword Args: temp: The temperature to fold at - + Returns: - float: The minimum free energy of the folded sequence + (Cache, Cache): The w_cache and the v_cache for traversal later """ - structs = fold(seq, temp) - return round(sum(s.e for s in structs), 2) + seq = seq.upper() + temp = temp + 273.15 # kelvin + + # figure out whether it's DNA or RNA, choose energy map + dna = True + bps = set(seq) + if "U" in bps and "T" in bps: + raise RuntimeError( + "Both T and U in sequence. Provide one or the other for DNA OR RNA." + ) + if all(bp in "AUCG" for bp in bps): + dna = False + elif any(bp not in "ATGC" for bp in bps): + diff = [bp for bp in bps if bp not in "ATUGC"] + raise RuntimeError(f"Unknown bp: {diff}. Only DNA/RNA foldable") + emap = DNA_ENERGIES if dna else RNA_ENERGIES + + n = len(seq) + v_cache: Cache = [] + w_cache: Cache = [] + for _ in range(n): + v_cache.append([STRUCT_DEFAULT] * n) + w_cache.append([STRUCT_DEFAULT] * n) + + # fill the cache + _w(seq, 0, n - 1, temp, v_cache, w_cache, emap) + + return v_cache, w_cache def fold(seq: str, temp: float = 37.0) -> List[Struct]: @@ -83,35 +112,28 @@ def fold(seq: str, temp: float = 37.0) -> List[Struct]: List[Struct]: A list of structures. Stacks, bulges, hairpins, etc. """ - seq = seq.upper() - temp = temp + 273.15 # kelvin + v_cache, w_cache = fold_cache(seq, temp) + n = len(seq) - # figure out whether it's DNA or RNA, choose energy map - dna = True - bps = set(seq) - if "U" in bps and "T" in bps: - raise RuntimeError( - "Both T and U in sequence. Provide one or the other for DNA OR RNA." - ) - if all(bp in "AUCG" for bp in bps): - dna = False - elif any(bp not in "ATGC" for bp in bps): - diff = [bp for bp in bps if bp not in "ATUGC"] - raise RuntimeError(f"Unknown bp: {diff}. Only DNA/RNA foldable") - emap = DNA_ENERGIES if dna else RNA_ENERGIES + # get the minimum free energy structure out of the cache + return traceback(0, n - 1, v_cache, w_cache) - n = len(seq) - v_cache: Cache = [] - w_cache: Cache = [] - for _ in range(n): - v_cache.append([STRUCT_DEFAULT] * n) - w_cache.append([STRUCT_DEFAULT] * n) - # fill the cache - _w(seq, 0, n - 1, temp, v_cache, w_cache, emap) +def calc_dg(seq: str, temp: float = 37.0) -> float: + """Fold the sequence and return just the delta G of the structure + + Args: + seq: The sequence to fold + + Keyword Args: + temp: The temperature to fold at - # get the structure out of the cache - return _traceback(0, n - 1, v_cache, w_cache) + Returns: + float: The minimum free energy of the folded sequence + """ + + structs = fold(seq, temp) + return round(sum(s.e for s in structs), 2) def _w( @@ -738,7 +760,7 @@ def add_branch(s: Struct): return Struct(e, f"BIFURCATION:{str(unpaired)}n/{str(branches_count)}h", branches) -def _traceback(i: int, j: int, v_cache: Cache, w_cache: Cache) -> List[Struct]: +def traceback(i: int, j: int, v_cache: Cache, w_cache: Cache) -> List[Struct]: """Traceback thru the V(i,j) and W(i,j) caches to find the structure For each step, get to the lowest energy W(i,j) within that block @@ -750,7 +772,7 @@ def _traceback(i: int, j: int, v_cache: Cache, w_cache: Cache) -> List[Struct]: Args: i: The leftmost index to start searching in j: The rightmost index to start searching in - v_cache: Energies/structures where i and j bond + v_cache: Energies where i and j bond w_cache: Energies/sub-structures between or with i and j Returns: @@ -786,11 +808,11 @@ def _traceback(i: int, j: int, v_cache: Cache, w_cache: Cache) -> List[Struct]: structs = _trackback_energy(structs) branches: List[Struct] = [] for i1, j1 in s.ij: - traceback = _traceback(i1, j1, v_cache, w_cache) - if traceback and traceback[0].ij: - i2, j2 = traceback[0].ij[0] + tb = traceback(i1, j1, v_cache, w_cache) + if tb and tb[0].ij: + i2, j2 = tb[0].ij[0] e_sum += w_cache[i2][j2].e - branches += traceback + branches += tb last = structs[-1] structs[-1] = Struct(round(last.e - e_sum, 1), last.desc, list(last.ij)) diff --git a/tests/fold_examples.py b/tests/fold_examples.py index 8daea44..49c2027 100644 --- a/tests/fold_examples.py +++ b/tests/fold_examples.py @@ -9,14 +9,14 @@ DIR = path.dirname(path.realpath(__file__)) DNA_EXAMPLES = { - "GGGAGGTCGTTACATCTGGGTAACACCGGTACTGATCCGGTGACCTCCC": -10.94, # three branched structure + "GGGAGGTCGTTACATCTGGGTAACACCGGTACTGATCCGGTGACCTCCC": -10.9, # three branched structure "GGGAGGTCGCTCCAGCTGGGAGGAGCGTTGGGGGTATATACCCCCAACACCGGTACTGATCCGGTGACCTCCC": -23.4, # four branched structure "CGCAGGGAUACCCGCG": -3.8, - "TAGCTCAGCTGGGAGAGCGCCTGCTTTGCACGCAGGAGGT": -6.85, - "GGGGGCATAGCTCAGCTGGGAGAGCGCCTGCTTTGCACGCAGGAGGTCTGCGGTTCGATCCCGCGCGCTCCCACCA": -15.50, - "TGAGACGGAAGGGGATGATTGTCCCCTTCCGTCTCA": -18.10, - "ACCCCCTCCTTCCTTGGATCAAGGGGCTCAA": -3.65, - "TGTCAGAAGTTTCCAAATGGCCAGCAATCAACCCATTCCATTGGGGATACAATGGTACAGTTTCGCATATTGTCGGTGAAAATGGTTCCATTAAACTCC": -9.35, + "TAGCTCAGCTGGGAGAGCGCCTGCTTTGCACGCAGGAGGT": -6.9, + "GGGGGCATAGCTCAGCTGGGAGAGCGCCTGCTTTGCACGCAGGAGGTCTGCGGTTCGATCCCGCGCGCTCCCACCA": -15.5, + "TGAGACGGAAGGGGATGATTGTCCCCTTCCGTCTCA": -18.1, + "ACCCCCTCCTTCCTTGGATCAAGGGGCTCAA": -3.7, + "TGTCAGAAGTTTCCAAATGGCCAGCAATCAACCCATTCCATTGGGGATACAATGGTACAGTTTCGCATATTGTCGGTGAAAATGGTTCCATTAAACTCC": -9.4, } # writing results to examples for comparison diff --git a/tests/fold_test.py b/tests/fold_test.py index a05d828..27f15e0 100644 --- a/tests/fold_test.py +++ b/tests/fold_test.py @@ -4,20 +4,17 @@ from time import time import unittest -from seqfold import calc_dg +from seqfold import calc_dg, fold, fold_cache, traceback, Cache, Struct from seqfold.dna import DNA_ENERGIES from seqfold.rna import RNA_ENERGIES from seqfold.fold import ( - fold, STRUCT_DEFAULT, - Cache, _bulge, _stack, _hairpin, _internal_loop, _pair, _w, - _traceback, ) @@ -25,9 +22,9 @@ class TestFold(unittest.TestCase): """Test folding functions""" def test_fold(self): - """Test fold function.""" + """Fold function.""" - # it should throw if a non-sense sequence is provided + # it should throw if a nonsense sequence is provided with self.assertRaises(RuntimeError): calc_dg("EASFEASFAST", 37.0) @@ -38,6 +35,17 @@ def test_fold(self): # should not throw calc_dg("ATGGATTTAGATAGAT") + def test_fold_cache(self): + """Gather a cache of the folded structure.""" + + seq = "ATGGATTTAGATAGAT" + v_cache, w_cache = fold_cache(seq) + seq_dg = calc_dg(seq) + + self.assertEqual( + seq_dg, sum(s.e for s in traceback(0, len(seq) - 1, v_cache, w_cache)) + ) + def test_fold_dna(self): """DNA folding to find min energy secondary structure.""" @@ -88,7 +96,6 @@ def test_multibranch(self): seq = "GGGAGGTCGTTACATCTGGGTAACACCGGTACTGATCCGGTGACCTCCC" # three branch structs = fold(seq) - self.assertTrue( any("BIFURCATION" in s.desc and (7, 41) in s.ij for s in structs) ) @@ -97,7 +104,6 @@ def test_pair(self): """Create a pair for stack checking.""" seq = "ATGGAATAGTG" - self.assertEqual(_pair(seq, 0, 1, 9, 10), "AT/TG") def test_stack(self): @@ -169,7 +175,6 @@ def test_w(self): v_cache.append([STRUCT_DEFAULT] * len(seq)) w_cache.append([STRUCT_DEFAULT] * len(seq)) struct = _w(seq, i, j, temp, v_cache, w_cache, RNA_ENERGIES) - self.assertAlmostEqual(struct.e, -3.8, delta=0.2) seq = "CCUGCUUUGCACGCAGG" @@ -182,7 +187,6 @@ def test_w(self): v_cache.append([STRUCT_DEFAULT] * len(seq)) w_cache.append([STRUCT_DEFAULT] * len(seq)) struct = _w(seq, i, j, temp, v_cache, w_cache, RNA_ENERGIES) - self.assertAlmostEqual(struct.e, -6.4, delta=0.2) seq = "GCGGUUCGAUCCCGC"