Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add several functions to work with refGene exon sequences #13

Merged
merged 10 commits into from
Aug 29, 2018
Prev Previous commit
Next Next commit
Replace varity.util/revcomp-bases with cljam.util.sequence/revcomp
  • Loading branch information
alumi committed Aug 24, 2018
commit 70113dd88a7f82a27d5c2443ad36139c157930e4
16 changes: 8 additions & 8 deletions src/varity/hgvs_to_vcf/cdna.clj
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
(ns varity.hgvs-to-vcf.cdna
(:require clj-hgvs.mutation
[cljam.io.sequence :as cseq]
[varity.ref-gene :as rg]
[varity.util :refer [revcomp-bases]]))
[cljam.util.sequence :as util-seq]
[varity.ref-gene :as rg]))

;; AGT => ["AGT"]
;; AGTAGT => ["AGT" "AGTAGT"]
Expand All @@ -23,9 +23,9 @@
{:chr chr
:pos pos
:ref (cond-> (:ref mut*)
(= strand "-") (revcomp-bases))
(= strand "-") (util-seq/revcomp))
:alt (cond-> (:alt mut*)
(= strand "-") (revcomp-bases))}))
(= strand "-") (util-seq/revcomp))}))

(defmethod vcf-variant clj_hgvs.mutation.DNADeletion
[mut* seq-rdr {:keys [chr strand] :as rg}]
Expand Down Expand Up @@ -79,7 +79,7 @@
:pos start
:ref ref
:alt (str ref (cond-> (:alt mut*)
(= strand "-") (revcomp-bases)))})))
(= strand "-") (util-seq/revcomp)))})))

(defmethod vcf-variant clj_hgvs.mutation.DNAInversion
[mut* seq-rdr {:keys [chr strand] :as rg}]
Expand All @@ -96,7 +96,7 @@
{:chr chr
:pos (dec start)
:ref ref
:alt (str (first ref) (revcomp-bases (subs ref 1)))}))))
:alt (str (first ref) (util-seq/revcomp (subs ref 1)))}))))

(defmethod vcf-variant clj_hgvs.mutation.DNAIndel
[mut* seq-rdr {:keys [chr strand] :as rg}]
Expand All @@ -113,12 +113,12 @@
(let [ref (cseq/read-sequence seq-rdr {:chr chr, :start (dec start), :end end})]
(if (or (nil? (:ref mut*))
(= (subs ref 1) (cond-> (:ref mut*)
(= strand "-") (revcomp-bases))))
(= strand "-") (util-seq/revcomp))))
{:chr chr
:pos (dec start)
:ref ref
:alt (str (first ref) (cond-> (:alt mut*)
(= strand "-") (revcomp-bases)))})))))
(= strand "-") (util-seq/revcomp)))})))))

(defmethod vcf-variant clj_hgvs.mutation.DNARepeatedSeqs
[mut* seq-rdr {:keys [chr strand] :as rg}]
Expand Down
16 changes: 8 additions & 8 deletions src/varity/hgvs_to_vcf/protein.clj
Original file line number Diff line number Diff line change
Expand Up @@ -2,9 +2,9 @@
(:require [clj-hgvs.core :as hgvs]
[clj-hgvs.mutation :as mut]
[cljam.io.sequence :as cseq]
[cljam.util.sequence :as util-seq]
[varity.codon :as codon]
[varity.ref-gene :as rg]
[varity.util :refer [revcomp-bases]])
[varity.ref-gene :as rg])
(:import [clj_hgvs.mutation ProteinSubstitution]))

(defn- pos-candidates
Expand All @@ -28,7 +28,7 @@
(let [ref-codon1 (->> pos-cands
(map #(cseq/read-sequence seq-rdr {:chr chr, :start %, :end %}))
(apply str))
ref-codon (cond-> ref-codon1 (= strand "-") revcomp-bases)
ref-codon (cond-> ref-codon1 (= strand "-") util-seq/revcomp)
palt (mut/->short-amino-acid (:alt mut*))
codon-cands (codon/amino-acid->codons palt)]
(->> codon-cands
Expand All @@ -40,9 +40,9 @@
{:chr chr
:pos pos
:ref (cond-> (str (nth ref-codon idx))
(= strand "-") revcomp-bases)
(= strand "-") util-seq/revcomp)
:alt (cond-> (str (nth codon* idx))
(= strand "-") revcomp-bases)})))
(= strand "-") util-seq/revcomp)})))
(remove nil?))))
(flatten)))))
(throw (IllegalArgumentException. "Unsupported mutation"))))
Expand All @@ -59,7 +59,7 @@
(let [ref-codon1 (->> pos-cands
(map #(cseq/read-sequence seq-rdr {:chr chr, :start %, :end %}))
(apply str))
ref-codon (cond-> ref-codon1 (= strand "-") revcomp-bases)
ref-codon (cond-> ref-codon1 (= strand "-") util-seq/revcomp)
palt (mut/->short-amino-acid (:alt mut*))
codon-cands (codon/amino-acid->codons palt)
pos-cands (cond-> pos-cands (= strand "-") reverse)]
Expand All @@ -73,8 +73,8 @@
alt (str (nth codon* idx))]
{:vcf {:chr chr
:pos pos
:ref (cond-> ref (= strand "-") revcomp-bases)
:alt (cond-> alt (= strand "-") revcomp-bases)}
:ref (cond-> ref (= strand "-") util-seq/revcomp)
:alt (cond-> alt (= strand "-") util-seq/revcomp)}
:cdna (hgvs/hgvs (:name rg) :cdna (mut/dna-substitution (rg/cds-coord pos rg)
ref
(if (= ref alt) "=" ">")
Expand Down
3 changes: 2 additions & 1 deletion src/varity/ref_gene.clj
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
[clj-hgvs.coordinate :as coord]
[cljam.io.sequence :as cseq]
[cljam.util.chromosome :refer [normalize-chromosome-key]]
[cljam.util.sequence :as util-seq]
[proton.core :refer [as-long]]
[varity.util :as util]))

Expand Down Expand Up @@ -186,7 +187,7 @@
"Reads a base sequence of an `exon` from `seq-rdr`."
[seq-rdr {:keys [reverse?] :as exon}]
(cond-> (cseq/read-sequence seq-rdr exon)
reverse? util/revcomp-bases))
reverse? util-seq/revcomp))

(defn ^String read-gene-sequence
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The gene word contains more widely meanings. Thus I think you should use transcript here.

Additionally, could you add docstrings to clarify the function, such as the sequence contains 5'-UTR, CDS, and 3'-UTR.

"Reads a DNA base sequence of a ref-gene record `gene` from `seq-rdr`."
Expand Down
9 changes: 1 addition & 8 deletions src/varity/util.clj
Original file line number Diff line number Diff line change
@@ -1,17 +1,10 @@
(ns varity.util
"Utilities."
(:require [clojure.java.io :as io]
[cljam.util.sequence :as util-seq])
(:require [clojure.java.io :as io])
(:import [java.io InputStream]
[org.apache.commons.compress.compressors
CompressorStreamFactory CompressorException]))

(defn revcomp-bases
"Returns reverse complementary bases string."
[s]
{:pre [(string? s)]}
(util-seq/revcomp s))

(defn ^InputStream compressor-input-stream
"Returns an compressor input stream from f, autodetecting the compressor type
from the first few bytes of f. Returns java.io.BufferedInputStream if the
Expand Down
22 changes: 11 additions & 11 deletions src/varity/vcf_to_hgvs/cdna.clj
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
(ns varity.vcf-to-hgvs.cdna
(:require [clj-hgvs.core :as hgvs]
[clj-hgvs.mutation :as mut]
[cljam.util.sequence :as util-seq]
[varity.ref-gene :as rg]
[varity.util :refer [revcomp-bases]]
[varity.vcf-to-hgvs.common :refer [diff-bases] :as common]))

(defn- repeat-info-forward
Expand Down Expand Up @@ -33,9 +33,9 @@
100)
(map (fn [seq*]
(let [nseq* (count seq*)]
(if-let [[unit ref-repeat :as ri] (common/repeat-info (revcomp-bases seq*)
(if-let [[unit ref-repeat :as ri] (common/repeat-info (util-seq/revcomp seq*)
(inc nseq*)
(revcomp-bases ins))]
(util-seq/revcomp ins))]
(let [nunit (count unit)]
(if (> (* nunit ref-repeat) (- nseq* nunit))
false
Expand All @@ -58,7 +58,7 @@
[unit ref-repeat ins-repeat] (repeat-info* seq-rdr rg (+ pos offset) alt-only)]
(cond
(and (= nrefo 1) (= nalto 1)) :substitution
(= ref-only (revcomp-bases alt-only)) :inversion
(= ref-only (util-seq/revcomp alt-only)) :inversion
(and (pos? nrefo) (zero? nalto)) :deletion
(and (pos? nrefo) (pos? nalto)) :indel
(some? unit) (cond
Expand All @@ -72,9 +72,9 @@
[rg pos ref alt]
(let [{:keys [strand]} rg]
(mut/dna-substitution (rg/cds-coord pos rg)
(cond-> ref (= strand "-") revcomp-bases)
(cond-> ref (= strand "-") util-seq/revcomp)
(if (= ref alt) "=" ">")
(cond-> alt (= strand "-") revcomp-bases))))
(cond-> alt (= strand "-") util-seq/revcomp))))

(defn- dna-deletion
[rg pos ref alt]
Expand All @@ -92,7 +92,7 @@
"+" right
"-" left)
rg))
(cond-> (subs ref offset) (= strand "-") revcomp-bases))))
(cond-> (subs ref offset) (= strand "-") util-seq/revcomp))))

(defn- dna-duplication
[rg pos ref alt]
Expand All @@ -106,7 +106,7 @@
"-" (inc (- start (count ins))))]
(mut/dna-duplication (rg/cds-coord start rg)
(rg/cds-coord end rg)
(cond-> ins (= strand "-") revcomp-bases))))
(cond-> ins (= strand "-") util-seq/revcomp))))

(defn- dna-insertion
[rg pos ref alt]
Expand All @@ -116,7 +116,7 @@
end (cond-> (+ pos offset) (= strand "-") dec)]
(mut/dna-insertion (rg/cds-coord start rg)
(rg/cds-coord end rg)
(cond-> ins (= strand "-") revcomp-bases))))
(cond-> ins (= strand "-") util-seq/revcomp))))

(defn- dna-inversion
[rg pos ref alt]
Expand Down Expand Up @@ -153,8 +153,8 @@
"+" right
"-" left)
rg))
(cond-> del (= strand "-") revcomp-bases)
(cond-> ins (= strand "-") revcomp-bases))))
(cond-> del (= strand "-") util-seq/revcomp)
(cond-> ins (= strand "-") util-seq/revcomp))))

(defn- dna-repeated-seqs
[seq-rdr rg pos ref alt]
Expand Down
8 changes: 4 additions & 4 deletions src/varity/vcf_to_hgvs/protein.clj
Original file line number Diff line number Diff line change
Expand Up @@ -5,9 +5,9 @@
[clj-hgvs.core :as hgvs]
[clj-hgvs.mutation :as mut]
[cljam.io.sequence :as cseq]
[cljam.util.sequence :as util-seq]
[varity.codon :as codon]
[varity.ref-gene :as rg]
[varity.util :refer [revcomp-bases]]
[varity.vcf-to-hgvs.common :refer [diff-bases] :as common]))

(defn- split-string-at [s x]
Expand Down Expand Up @@ -107,13 +107,13 @@
alt-exon-seq1 (exon-sequence alt-seq cds-start alt-exon-ranges*)]
{:ref-exon-seq ref-exon-seq1
:ref-prot-seq (codon/amino-acid-sequence (cond-> ref-exon-seq1
(= strand "-") revcomp-bases))
(= strand "-") util-seq/revcomp))
:alt-exon-seq alt-exon-seq1
:alt-prot-seq (codon/amino-acid-sequence (cond-> alt-exon-seq1
(= strand "-") revcomp-bases))
(= strand "-") util-seq/revcomp))
:alt-tx-prot-seq (codon/amino-acid-sequence
(cond-> (str ref-up-exon-seq1 alt-exon-seq1 ref-down-exon-seq1)
(= strand "-") revcomp-bases))
(= strand "-") util-seq/revcomp))
:ini-offset (quot (:position (rg/cds-coord (case strand
"+" tx-start
"-" tx-end) rg))
Expand Down
12 changes: 0 additions & 12 deletions test/varity/util_test.clj

This file was deleted.