|
import numpy as np |
|
import scipy |
|
import scipy.spatial |
|
import string |
|
import os,re |
|
import random |
|
import util |
|
import gzip |
|
|
|
# Lookup from three-letter residue name to one-letter amino-acid code
# for the 20 residue types listed below.
to1letter = {
    "ALA": 'A',
    "ARG": 'R',
    "ASN": 'N',
    "ASP": 'D',
    "CYS": 'C',
    "GLN": 'Q',
    "GLU": 'E',
    "GLY": 'G',
    "HIS": 'H',
    "ILE": 'I',
    "LEU": 'L',
    "LYS": 'K',
    "MET": 'M',
    "PHE": 'F',
    "PRO": 'P',
    "SER": 'S',
    "THR": 'T',
    "TRP": 'W',
    "TYR": 'Y',
    "VAL": 'V',
}
|
|
|
|
|
|
|
|
|
def parse_a3m(filename):
    """Read an A3M alignment into integer-coded sequence and insertion arrays.

    Header lines (starting with '>') and blank lines are skipped.  ASCII
    lowercase letters are removed from every sequence line, and for each
    remaining column the number of removed characters immediately preceding
    it is recorded.  Reading stops after 10000 sequences.

    Args:
        filename: path to the alignment; a name whose last '.'-separated
            component is 'gz' is opened with gzip in text mode.

    Returns:
        (msa, ins) where
        msa is a uint8 array with one row per sequence, holding indices into
            "ARNDCQEGHILKMFPSTWYV-"; any other character is mapped to 20;
        ins is a uint8 array with one row per sequence holding the insertion
            counts per column.

    Raises:
        IndexError: when insertion characters follow the last retained
            column of a line (the count would belong to index L of a
            length-L row).
    """
    msa = []
    ins = []

    # translation table that deletes every ASCII lowercase letter
    table = str.maketrans(dict.fromkeys(string.ascii_lowercase))

    if filename.split('.')[-1] == 'gz':
        handle = gzip.open(filename, 'rt')
    else:
        handle = open(filename, 'r')

    # the context manager guarantees the file is closed, also on errors
    with handle as fp:
        for line in fp:

            # skip labels
            if line[0] == '>':
                continue

            # remove trailing whitespace (including the newline)
            line = line.rstrip()
            if len(line) == 0:
                continue

            # drop lowercase letters and keep the cleaned sequence
            msa.append(line.translate(table))
            L = len(msa[-1])

            # 0 - uppercase character or '-'; 1 - anything else (insertion)
            flags = np.array([0 if c.isupper() or c == '-' else 1 for c in line])
            counts = np.zeros(L)

            if np.sum(flags) > 0:
                # positions of insertion characters in the raw line
                pos = np.where(flags == 1)[0]

                # subtracting the running insertion count converts each raw
                # position into the index of the cleaned column that follows
                shifted = pos - np.arange(pos.shape[0])

                # one entry per affected column plus its insertion length
                cols, num = np.unique(shifted, return_counts=True)
                counts[cols] = num

            ins.append(counts)
            if len(msa) == 10000:
                break

    # convert letters into numbers: byte value -> index in the alphabet
    alphabet = np.array(list("ARNDCQEGHILKMFPSTWYV-"), dtype='|S1').view(np.uint8)
    msa = np.array([list(s) for s in msa], dtype='|S1').view(np.uint8)
    for code in range(alphabet.shape[0]):
        msa[msa == alphabet[code]] = code

    # every character outside the alphabet shares index 20 with '-'
    msa[msa > 20] = 20

    ins = np.array(ins, dtype=np.uint8)

    return msa, ins
|
|
|
|
|
|
|
|
|
def parse_pdb(filename):
    """Read a PDB file and parse it with parse_pdb_lines.

    Args:
        filename: path to the PDB-format text file.

    Returns:
        Whatever parse_pdb_lines returns for the lines of the file.
    """
    # read everything up front so the handle is closed before parsing
    with open(filename, 'r') as fp:
        lines = fp.readlines()
    return parse_pdb_lines(lines)
|
|
|
|
|
def parse_pdb_lines(lines):
    """Extract per-residue atom coordinates from PDB-format text lines.

    Only records whose first four characters are "ATOM" are used.  One row
    is created per ATOM record whose atom name (columns 12:16, stripped) is
    "CA"; every ATOM record is then stored in the row of its residue number
    at the slot given by the atom's position in
    util.aa2long[util.aa2num[residue_name]].

    Args:
        lines: iterable of strings in fixed-column PDB format.

    Returns:
        (xyz, mask, idx) where
        xyz is a float32 array of shape (n_res, 14, 3); slots without a
            parsed atom are set to 0.0;
        mask is a bool array of shape (n_res, 14), True where an atom was
            parsed;
        idx is an array of the residue numbers, one per row.

    Raises:
        ValueError: if an ATOM record belongs to a residue number that has
            no CA record.
        KeyError: if a residue name is missing from util.aa2num.
    """
    # residue numbers of residues observed in the structure (those with CA)
    idx_s = [int(l[22:26]) for l in lines if l[:4]=="ATOM" and l[12:16].strip()=="CA"]

    # residue number -> row; setdefault keeps the FIRST row for a repeated
    # number, which is what list.index() would return, without the linear
    # scan per ATOM record
    row_of = {}
    for row, res_no in enumerate(idx_s):
        row_of.setdefault(res_no, row)

    # 14 atom slots per residue, NaN marks "not seen yet"
    xyz = np.full((len(idx_s), 14, 3), np.nan, dtype=np.float32)
    for l in lines:
        if l[:4] != "ATOM":
            continue
        resNo, atom, aa = int(l[22:26]), l[12:16], l[17:20]
        if resNo not in row_of:
            # same exception type list.index() raised for an unknown residue
            raise ValueError("%d is not in list" % resNo)
        idx = row_of[resNo]
        # atom is the raw 4-character name column, compared unstripped
        for i_atm, tgtatm in enumerate(util.aa2long[util.aa2num[aa]]):
            if tgtatm == atom:
                xyz[idx,i_atm,:] = [float(l[30:38]), float(l[38:46]), float(l[46:54])]
                break

    # save atom mask, then zero out the slots that were never filled
    missing = np.isnan(xyz[...,0])
    mask = np.logical_not(missing)
    xyz[missing] = 0.0

    return xyz,mask,np.array(idx_s)
|
|
|
|
|
''' |
|
def parse_pdb_lines(lines): |
|
|
|
# indices of residues observed in the structure |
|
#idx_s = [int(l[22:26]) for l in lines if l[:4]=="ATOM" and l[12:16].strip()=="CA"] |
|
res = [(l[22:26],l[17:20]) for l in lines if l[:4]=="ATOM" and l[12:16].strip()=="CA"] |
|
idx_s = [int(r[0]) for r in res] |
|
seq = [util.aa2num[r[1]] if r[1] in util.aa2num.keys() else 20 for r in res] |
|
|
|
# 4 BB + up to 10 SC atoms |
|
xyz = np.full((len(idx_s), 14, 3), np.nan, dtype=np.float32) |
|
for l in lines: |
|
if l[:4] != "ATOM": |
|
continue |
|
resNo, atom, aa = int(l[22:26]), l[12:16], l[17:20] |
|
idx = idx_s.index(resNo) |
|
for i_atm, tgtatm in enumerate(util.aa2long[util.aa2num[aa]]): |
|
if tgtatm == atom: |
|
xyz[idx,i_atm,:] = [float(l[30:38]), float(l[38:46]), float(l[46:54])] |
|
break |
|
|
|
# save atom mask |
|
mask = np.logical_not(np.isnan(xyz[...,0])) |
|
xyz[np.isnan(xyz[...,0])] = 0.0 |
|
|
|
return xyz,mask,np.array(idx_s), np.array(seq) |
|
''' |
|
|
|
|
|
def parse_templates(item, params):
    """Collect template structures and alignment features for one target.

    Reads <DIR>/hhr/<last two chars of item>/<item>.atab and the matching
    .hhr file, looks every hit up in the FFindex structure database at
    params['FFDB'], and stacks the aligned template data.

    Args:
        item: target identifier; its last two characters select the
            sub-directory under params['DIR'] + '/hhr/'.
        params: mapping with at least 'FFDB' (database path prefix, the
            '_pdb.ffindex' / '_pdb.ffdata' suffixes are appended) and 'DIR'.

    Returns:
        (xyz, mask, qmap, f0d, f1d, ids) where, with the rows of all kept
        templates concatenated,
        xyz  - float32 coordinates of the aligned template residues;
        mask - bool atom mask for the same rows;
        qmap - int64 rows of [value from the first .atab column minus 1,
               running template counter]
               (NOTE(review): first column presumably is the 1-based query
               position - confirm against the .atab format);
        f0d  - float32, one row of per-hit values parsed from the .hhr file;
        f1d  - float32, the three per-position float columns of the .atab;
        ids  - list of hit keys, one per kept template.

    Raises:
        ValueError: from np.vstack when no hit survives the filters (found
            in the database and at least 10 aligned residues present in the
            structure).
    """
    # template structure database
    ffdb = FFindexDB(read_index(params['FFDB']+'_pdb.ffindex'),
                     read_data(params['FFDB']+'_pdb.ffdata'))

    # process tabulated alignments: each hit becomes [key, index pairs, scores]
    infile = params['DIR']+'/hhr/'+item[-2:]+'/'+item+'.atab'
    hits = []
    with open(infile, "r") as fp:
        for l in fp:
            if l[0]=='>':
                key = l[1:].split()[0]
                hits.append([key,[],[]])
            elif "score" in l or "dssp" in l:
                # column-header lines carry no data
                continue
            else:
                # the appended zeros fill in for float columns that are
                # missing when a line has fewer than five fields
                hi = l.split()[:5]+[0.0,0.0,0.0]
                hits[-1][1].append([int(hi[0]),int(hi[1])])
                hits[-1][2].append([float(hi[2]),float(hi[3]),float(hi[4])])

    # per-hit values from the .hhr file: the line following each '>' line,
    # with '=' and '%' blanked so that every second token is a number
    with open(infile[:-4]+'hhr', "r") as fp:
        lines = fp.readlines()
    pos = [i+1 for i,l in enumerate(lines) if l[0]=='>']
    for i,posi in enumerate(pos):
        hits[i].append([float(s) for s in re.sub('[=%]',' ',lines[posi]).split()[1::2]])

    # attach parsed structure data (xyz, mask, residue numbers) to each hit
    # that is present in the database; missing hits stay short and are
    # dropped by the length check below
    for hi in hits:
        entry = get_entry_by_name(hi[0], ffdb.index)
        if entry is None:
            continue
        data = read_entry_lines(entry, ffdb.data)
        hi += list(parse_pdb_lines(data))

    # keep only aligned positions whose template residue exists in the structure
    counter = 0
    xyz,qmap,mask,f0d,f1d,ids = [],[],[],[],[],[]
    for data in hits:
        if len(data)<7:
            continue

        qi,ti = np.array(data[1]).T
        # sel1 indexes the alignment columns, sel2 the structure rows
        _,sel1,sel2 = np.intersect1d(ti, data[6], return_indices=True)
        ncol = sel1.shape[0]
        if ncol < 10:
            continue

        ids.append(data[0])
        f0d.append(data[3])
        f1d.append(np.array(data[2])[sel1])
        xyz.append(data[4][sel2])
        mask.append(data[5][sel2])
        qmap.append(np.stack([qi[sel1]-1,[counter]*ncol],axis=-1))
        counter += 1

    xyz = np.vstack(xyz).astype(np.float32)
    # np.bool / np.long were removed from NumPy 1.24; use explicit dtypes
    mask = np.vstack(mask).astype(bool)
    qmap = np.vstack(qmap).astype(np.int64)
    f0d = np.vstack(f0d).astype(np.float32)
    f1d = np.vstack(f1d).astype(np.float32)

    return xyz,mask,qmap,f0d,f1d,ids
|
|