forked from bm2-lab/neoFusion
-
Notifications
You must be signed in to change notification settings - Fork 0
/
ICB2_SNVindel.py
182 lines (154 loc) · 6.42 KB
/
ICB2_SNVindel.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
import os,sys,re
import numpy as np
import subprocess
from multiprocessing import Process,Pool
import pickle
from Bio.Blast import NCBIXML
from Bio import pairwise2
from Bio.SubsMat import MatrixInfo as matlist
from Bio.SeqIO.FastaIO import SimpleFastaParser
import math
from math import log, exp
from getNatureA import Neoantigen
import glob
import pickle
import xgboost as xgb
from xgboost.sklearn import XGBClassifier
from collections import defaultdict
import tempfile
hydro_score = dict(R=-4.5,K=-3.9,N=-3.5,D=-3.5,Q=-3.5,E=-3.5,H=-3.2,P=-1.6,Y=-1.3,W=-0.9,S=-.8,
T=-0.7,G=-0.4,A=1.8,M=1.9,C=2.5,F=2.8,L=3.8,V=4.2,I=4.5)
def getR(neo_seq,iedb_seq): ### nature文章的相似性
align_score = []
a = 26
k = 4.86936
for seq in iedb_seq:
aln_score = aligner(neo_seq,seq)
if aln_score:
localds_core = max([line[2] for line in aln_score])
align_score.append(localds_core)
bindingEnergies = list(map(lambda x: -k * (a - x), align_score))
lZk = logSum(bindingEnergies + [0])
lGb = logSum(bindingEnergies)
R=exp(lGb-lZk)
return R
def getH(pep,mymodel):
hydro_vector = do_hydro_vector(pep)
if len(hydro_vector) == 8:
H = .5
return H
else:
H = round(mymodel[len(hydro_vector)].predict_proba(hydro_vector)[0][1],3)
return H
def do_hydro_vector(pep):
hydro_vector = []
for i in pep:
hydro_vector.append(hydro_score[i.upper()])
return hydro_vector
def getmodel():
mymodel = {}
path = '/home/wzt/project/GeneFusion/dbGAP/results/'
model_9 = path + 'hg_xgb_9.dat'
model_10 = path + 'hg_xgb_10.dat'
model_11 = path + 'hg_xgb_11.dat'
with open(model_9, 'rb') as fin1, open(model_10, 'rb') as fin2, open(model_11, 'rb') as fin3:
mymodel[9] = pickle.load(fin1)
mymodel[10] = pickle.load(fin2)
mymodel[11] = pickle.load(fin3)
return mymodel
def getiedbseq():
iedb_seq = []
with open('/home/wzt/project/GeneFusion/iedb.fasta', 'r') as fin:
for t, seq in SimpleFastaParser(fin):
iedb_seq.append(seq)
return iedb_seq
def logSum(v):
ma = max(v)
return log(sum(map(lambda x: exp(x-ma),v))) + ma
def aligner(seq1,seq2):
matrix = matlist.blosum62
gap_open = -11
gap_extend = -1
aln = pairwise2.align.localds(seq1.upper(), seq2.upper(), matrix, gap_open, gap_extend)
return aln
def cal_similarity(mut_pep,wild_pep): ### 序列相似性
score_pair = aligner(mut_pep,wild_pep)[0][2]
score_self = aligner(mut_pep, mut_pep)[0][2]
score = score_pair / score_self
return round(score,3)
def cal_netctlpan(hla,pep):
with open('temp.fa', 'w') as fout:
fout.write('>pep\n{}\n'.format(pep))
cmd = 'python2 /home/zhouchi/software/netchop/predict.py --method netctlpan --length {} --allele {} ' \
'--epitope_threshold -99.9 --threshold -99.9 --noplot temp.fa > temp.tsv'.format(len(pep),hla)
subprocess.call(cmd,shell=True)
while True:
lines = subprocess.getoutput('grep {} temp.tsv'.format(pep)).strip().split()
if len(lines) == 0:
os.system(cmd)
else:
return lines[3],lines[4],lines[5]
def isfloat(x):
try:
return float(x)
except:
return 0
def main(Pt):
print (Pt)
filein = '/home/wzt/project/GeneFusion/Data2/{}/{}.mupexi'.format(Pt, Pt)
fileout = '/home/wzt/project/GeneFusion/Data2/{}/{}.filter.mupexi'.format(Pt, Pt)
with open(filein,'r') as fin,open(fileout,'w') as fout:
fout.write('HLA\tSample\tmismatch\t')
fout.write('MTpep\tMTpep_score\tMTpep_aff\tMTpep_rank\tMTpep_comb\t')
fout.write('WTpep\tWTpep_score\tWTpep_aff\tWTpep_rank\tWTpep_comb\t')
fout.write('Hydro_Model\tSelf_similar\tR\tA\n')
for line in fin:
if line.startswith('#') or line.startswith('HLA_allele'):
continue
lines = line.strip().split('\t')
if isfloat(lines[11]) <= 2 and isfloat(lines[26]) * isfloat(lines[16]) >= 1 and lines[25] == 'No':
hla = lines[0]
mtpep,mtpep_score, mtpep_aff, mtpep_rank = lines[7],lines[12],lines[10],lines[11]
wtpep,wtpep_score, wtpep_aff, wtpep_rank = lines[1], lines[6], lines[4], lines[5]
mtpep_tap, mtpep_cleavage, mtpep_comb = cal_netctlpan(hla, mtpep)
wtpep_tap, wtpep_cleavage, wtpep_comb = cal_netctlpan(hla, wtpep)
A = Neoantigen(mtpep, wtpep, mtpep_aff, wtpep_aff).getA()
R = getR(mtpep, iedb_seq)
H = getH(mtpep, mymodel)
self_similar = cal_similarity(mut_pep=mtpep, wild_pep=wtpep)
mismatch = lines[17]
out = '\t'.join((hla, Pt, str(mismatch), mtpep, mtpep_score, mtpep_aff, mtpep_rank, mtpep_comb,
wtpep, wtpep_score,wtpep_aff, wtpep_rank, wtpep_comb,str(H),str(self_similar),str(R),str(A)))
fout.write('{}\n'.format(out))
def main_check(Pt):
a = 0
filein = '/home/wzt/project/GeneFusion/Data2/{}/{}_9len.mupexi'.format(Pt, Pt)
with open(filein,'r') as fin:
for line in fin:
if line.startswith('#') or line.startswith('HLA_allele'):
continue
lines = line.strip().split('\t')
if isfloat(lines[11]) <= 2 and isfloat(lines[26]) * isfloat(lines[16]) >= 1 and lines[25] == 'No':
a += 1
print (Pt,a)
def domain(Pt):
try:
main(Pt)
except:
print (Pt)
SRRs = ['SRR3184279', 'SRR3184280', 'SRR3184281', 'SRR3184282', 'SRR3184283', 'SRR3184284', 'SRR3184285', 'SRR3184286',
'SRR3184287', 'SRR3184288', 'SRR3184289', 'SRR3184290', 'SRR3184291', 'SRR3184292', 'SRR3184293', 'SRR3184294',
'SRR3184295', 'SRR3184296', 'SRR3184297', 'SRR3184300', 'SRR3184301', 'SRR3184302', 'SRR3184303', 'SRR3184304',
'SRR3184305', 'SRR3184306']
Pts = ['Pt1', 'Pt2', 'Pt4', 'Pt5', 'Pt6', 'Pt7', 'Pt8', 'Pt9', 'Pt10', 'Pt12', 'Pt13', 'Pt14', 'Pt15', 'Pt16',
'Pt19', 'Pt20', 'Pt22', 'Pt23', 'Pt25', 'Pt28', 'Pt29', 'Pt31', 'Pt32', 'Pt35', 'Pt37', 'Pt38']
# Samples = []
# for Pt in Pts:
# filein = '/home/wzt/project/GeneFusion/Data2/{}/{}.mupexi'.format(Pt,Pt)
# fileout = '/home/wzt/project/GeneFusion/Data2/{}/{}.filter.mupexi'.format(Pt,Pt)
# if os.path.isfile(filein) and not os.path.isfile(fileout):
# Samples.append(Pt)
mymodel = getmodel()
iedb_seq = getiedbseq()
for Pt in Pts:
main(Pt)