|
import numpy as np |
|
import scipy |
|
import scipy.spatial |
|
import string |
|
import os,re |
|
import random |
|
import util |
|
|
|
|
|
# Three-letter residue name -> one-letter amino-acid code for the 20
# standard amino acids.
to1letter = {
    "ALA": "A",
    "ARG": "R",
    "ASN": "N",
    "ASP": "D",
    "CYS": "C",
    "GLN": "Q",
    "GLU": "E",
    "GLY": "G",
    "HIS": "H",
    "ILE": "I",
    "LEU": "L",
    "LYS": "K",
    "MET": "M",
    "PHE": "F",
    "PRO": "P",
    "SER": "S",
    "THR": "T",
    "TRP": "W",
    "TYR": "Y",
    "VAL": "V",
}
|
|
|
|
|
def parse_a3m(filename):
    '''Read an A3M alignment and convert letters into integers in the 0..20
    range, also keeping track of insertions.

    Lowercase letters are treated as insertions: they are removed from the
    aligned sequence, and the number of removed characters is recorded at the
    aligned column that immediately follows them.

    Args:
        filename: path to the A3M file. Every record starts with a '>' header
            line; sequence text may be wrapped over several lines.

    Returns:
        dict with
          "msa":        uint8 array (n_seq, n_col); index of each residue in
                        "ARNDCQEGHILKMFPSTWYV-", any other character -> 20.
          "labels":     list of str, first whitespace-delimited token of each
                        header line without the leading '>'.
          "insertions": uint8 array (n_seq, ncol) of insertion counts, where
                        ncol is the raw length of the first sequence.
                        Insertions located after the last column are dropped.

    Raises:
        IndexError: if the file has no '>' header before sequence text, or
            contains no records at all.
    '''
    # Read the file record by record; wrapped sequence lines are concatenated.
    lab, seq = [], []
    with open(filename, "r") as handle:
        for line in handle:
            if line[0] == '>':
                lab.append(line.split()[0][1:])
                seq.append("")
            else:
                seq[-1] += line.rstrip()

    # Translation table that deletes every lowercase (insertion) letter.
    table = str.maketrans(dict.fromkeys(string.ascii_lowercase))
    # NOTE(review): column count comes from the raw first sequence, so this
    # presumably expects the first (query) record to have no lowercase letters.
    ncol = len(seq[0])

    msa, ins = [], []
    for seqi in seq:
        # Aligned sequence = original with the insertions removed.
        msa.append(seqi.translate(table))

        # 0 - match or gap; 1 - insertion
        is_insertion = np.array(
            [0 if c.isupper() or c == '-' else 1 for c in seqi])
        ins_row = np.zeros(ncol)

        if np.sum(is_insertion) > 0:
            # Positions of inserted characters in the raw sequence.
            raw_pos = np.where(is_insertion == 1)[0]

            # Subtracting the number of preceding insertions maps every
            # inserted character onto the aligned column that follows it.
            aligned_pos = raw_pos - np.arange(raw_pos.shape[0])

            # Distinct columns and the length of the insertion before each.
            cols, counts = np.unique(aligned_pos, return_counts=True)
            in_range = cols < ncol
            # Store the counts in this sequence's insertion row (the original
            # statement had lost its assignment target and wrote into `pos`).
            ins_row[cols[in_range]] = counts[in_range]

        ins.append(ins_row)

    # Convert letters into numbers via their ASCII codes.
    alphabet = np.array(list("ARNDCQEGHILKMFPSTWYV-"), dtype='|S1').view(np.uint8)
    msa = np.array([list(s) for s in msa], dtype='|S1').view(np.uint8)
    for code, letter in enumerate(alphabet):
        msa[msa == letter] = code

    # Anything still above 20 is an unrecognised character: same code as a gap.
    msa[msa > 20] = 20

    ins = np.array(ins, dtype=np.uint8)

    return {"msa": msa, "labels": lab, "insertions": ins}
|
|
|
|
|
def parse_pdb(filename, **kwargs):
    '''Read a PDB file and extract xyz coords for all heavy atoms.

    Args:
        filename: path to the PDB file.
        **kwargs: forwarded unchanged to parse_pdb_lines.

    Returns:
        Whatever parse_pdb_lines returns for the lines of the file.
    '''
    # Context manager so the handle is closed even if reading fails.
    with open(filename, 'r') as handle:
        lines = handle.readlines()
    return parse_pdb_lines(lines, **kwargs)
|
|
|
def parse_pdb_lines(lines, parse_hetatom=False, ignore_het_h=True):
    '''Parse PDB-format text lines into coordinate, mask and sequence arrays.

    Residues are defined by ATOM records whose atom name is "CA"; one row is
    produced per distinct (chain, residue number) pair, in order of first
    appearance. Atoms are placed in the slot given by their position in
    util.aa2long[util.aa2num[residue_name]].

    Args:
        lines: iterable of PDB record strings.
        parse_hetatom: if True, also collect HETATM records.
        ignore_het_h: when collecting HETATM records, skip those whose
            character at index 77 is 'H'.

    Returns:
        dict with
          'xyz':     float32 array (n_res, 27, 3); atoms that were not found
                     are set to 0.0.
          'mask':    bool array (n_res, 27); True where an atom was found.
          'idx':     array of residue numbers.
          'seq':     array of util.aa2num codes (20 for residue names that are
                     not in util.aa2num).
          'pdb_idx': list of (chain, residue number) tuples.
        and, if parse_hetatom is True,
          'xyz_het':  array of HETATM coordinates.
          'info_het': list of dicts with keys idx, atom_id, atom_type, name.

    Raises:
        KeyError: if an ATOM record that belongs to a parsed residue has a
            residue name missing from util.aa2num.
    '''
    # Residues observed in the structure: one entry per CA atom record.
    ca_lines = [l for l in lines
                if l[:4] == "ATOM" and l[12:16].strip() == "CA"]
    seq = [util.aa2num[l[17:20]] if l[17:20] in util.aa2num else 20
           for l in ca_lines]
    pdb_idx = [(l[21:22].strip(), int(l[22:26].strip())) for l in ca_lines]

    # (chain, resNo) -> row of its first CA record. Replaces a linear
    # list.index() search per atom; setdefault keeps the first occurrence,
    # which is what list.index() returned.
    row_of = {}
    for row, key in enumerate(pdb_idx):
        row_of.setdefault(key, row)

    # NaN marks "atom not seen yet"; 27 atom slots per residue.
    xyz = np.full((len(pdb_idx), 27, 3), np.nan, dtype=np.float32)
    for l in lines:
        if l[:4] != "ATOM":
            continue
        # The chain ID must be stripped exactly as in pdb_idx above, otherwise
        # records with a blank chain column never match their residue.
        chain, resNo = l[21:22].strip(), int(l[22:26])
        atom, aa = l[12:16].strip(), l[17:20]
        idx = row_of.get((chain, resNo))
        if idx is None:
            # The residue has no CA record, so it has no row to store into.
            continue
        for i_atm, tgtatm in enumerate(util.aa2long[util.aa2num[aa]]):
            # Atom names are compared ignoring surrounding whitespace.
            if tgtatm is not None and tgtatm.strip() == atom:
                xyz[idx, i_atm, :] = [float(l[30:38]), float(l[38:46]), float(l[46:54])]
                break

    # Save the atom mask, then replace the NaN placeholders with zeros.
    mask = np.logical_not(np.isnan(xyz[..., 0]))
    xyz[np.isnan(xyz[..., 0])] = 0.0

    # Remove duplicated (chain, resNo) entries, keeping the first of each.
    # row_of preserves insertion order, so its keys/values are exactly the
    # de-duplicated index list and the rows to keep.
    i_unique = list(row_of.values())
    pdb_idx = list(row_of)

    xyz = xyz[i_unique]
    mask = mask[i_unique]
    seq = np.array(seq)[i_unique]

    out = {'xyz': xyz,
           'mask': mask,
           'idx': np.array([i[1] for i in pdb_idx]),
           'seq': np.array(seq),
           'pdb_idx': pdb_idx,
           }

    # Heteroatoms (ligands, etc.)
    if parse_hetatom:
        xyz_het, info_het = [], []
        for l in lines:
            if l[:6] == 'HETATM' and not (ignore_het_h and l[77] == 'H'):
                # NOTE(review): the PDB format places the atom serial in
                # columns 7-11 (l[6:11]) and the element symbol in columns
                # 77-78; the slices below are kept as they were - confirm
                # they are intentional before changing them.
                info_het.append(dict(
                    idx=int(l[7:11]),
                    atom_id=l[12:16],
                    atom_type=l[77],
                    name=l[16:20]
                ))
                xyz_het.append([float(l[30:38]), float(l[38:46]), float(l[46:54])])

        out['xyz_het'] = np.array(xyz_het)
        out['info_het'] = info_het

    return out
|
|
|
def parse_fasta(filename):
    '''
    Return dict of name: seq

    Every line starting with '>' opens a new record whose name is the rest of
    that line (surrounding whitespace stripped). All following lines up to the
    next header are concatenated into the sequence, so line-wrapped sequences
    are supported. Blank lines are skipped, and text before the first header
    is ignored. If a name occurs more than once, the last record wins.
    '''
    out = {}
    name = None   # header of the record currently being read
    chunks = []   # sequence pieces collected for that record

    with open(filename, 'r') as f_in:
        for raw in f_in:
            line = raw.strip()
            if not line:
                continue
            if line.startswith('>'):
                # A new header closes the previous record, if any.
                if name is not None:
                    out[name] = ''.join(chunks)
                name = line[1:]
                chunks = []
            elif name is not None:
                chunks.append(line)

    # Flush the final record.
    if name is not None:
        out[name] = ''.join(chunks)

    return out
|
|