# Copyright (c) 2014-2017 Matteo Degiacomi
#
# BiobOx is free software ;
# you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation ;
# either version 2 of the License, or (at your option) any later version.
# BiobOx is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY ;
# without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
# See the GNU General Public License for more details.
# You should have received a copy of the GNU General Public License along with BiobOx ;
# if not, write to the Free Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA.
#
# Author : Matteo Degiacomi, matteothomas.degiacomi@gmail.com
import numpy as np
import pandas as pd
from biobox.classes.polyhedron import Polyhedron
from biobox.classes.molecule import Molecule
[docs]class Multimer(Polyhedron):
'''
Construct and manipulate a protein assembly composed of several :func:`Molecule <molecule.Molecule>` instances. Subclass of :func:`Polyhedron <polyhedron.Polyhedron>`.
'''
[docs] def query(self, query_text, get_index=False):
'''
## select specific atoms in a multimer un the basis of a text query.
:param query_text: string selecting atoms of interest. Uses the pandas query syntax, can access all columns in the dataframe self.data.
:param get_index: if set to True, returns the indices of selected atoms in self.points array (and self.data)
:returns: coordinates of the selected points (in a unique array) and, if get_index is set to true, a list of their indices in subunits' self.points array.
'''
idx = self.data.query(query_text).index.values
res = self.data.iloc[idx] #this is a new sliced dataframe
targets = np.array(res.loc[:, ["unit", "unit_index"]].values)
# append the coordinates of every unit within the query
pts = np.empty([0, 3])
for u in np.unique(targets[:, 0]):
pos = targets[targets[:, 0] == u, 1].astype(int)
this_unit = self.unit_labels[u]
pts = np.concatenate((pts, self.unit[this_unit].points[pos]))
if get_index:
return [pts, idx]
else:
return pts
[docs] def atomselect(self, u, chain, resid, atom, get_index=False, use_resname=False):
'''
## select specific atoms in a multimer providing unit, chain, residue ID and atom name.
:param u: number of desired unit to select in the multimer
:param chain: selection of a specific chain name (accepts '*' as wildcard). Can also be a list or numpy array of strings.
:param resid: residue ID of desired atoms (accepts '*' as wildcard). Can also be a list or numpy array of of int.
:param atom: name of desired atom (accepts '*' as wildcard). Can also be a list or numpy array of strings.
:param get_index: if set to True, returns the indices of selected atoms in self.points array (and self.data)
:param use_resname: if set to True, consider information in "res" variable as resnames, and not resids
:returns: coordinates of the selected points (in a unique array) and, if get_index is set to true, a list of their indices in subunits' self.points array.
'''
# extract id of units of interest
if u == '*':
unit_id = list(self.unit_labels.values())
else:
if isinstance(u, str) or isinstance(u, int):
try:
unit_id = [self.unit_labels[str(u)]]
except Exception as ex:
raise Exception("ERROR: unit %s not found!" % u)
elif isinstance(u, list) or type(u).__module__ == 'numpy':
unit_id = []
for c in range(0, len(u), 1):
try:
unit_id.append(self.unit_labels[str(u[c])])
except Exception as ex:
raise Exception("ERROR: unit %s not found!" % u[c])
else:
raise Exception("ERROR: wrong type for unit selection. Should be str, int, list, or numpy")
# initialize storage for indices and coordinates
indices = []
pts = np.empty([0, 3])
for i in range(0, len(self.unit), 1):
if i in unit_id:
[pts_tmp, index_tmp] = self.unit[i].atomselect(chain, resid, atom, True, use_resname=use_resname)
pts = np.concatenate((pts, pts_tmp))
indices.append(index_tmp)
else:
# indices of all units must be stored. If unit is not
# requested, return an empty array for it
indices.append([])
if get_index:
return [pts, indices]
else:
return pts
[docs] def make_molecule(self):
'''
Return a :func:`Molecule <molecule.Molecule>` object containing all the points of the assembly. Chain will indicate different units, original chain value is pushed in segment entry.
:returns: :func:`Molecule <molecule.Molecule>` object
'''
# create new data entry (renumber indices, reassign chain name)
data = np.empty([0, 9])
atom_ccs = {}
r = []
c = []
skipcharge = False
for i in range(0, len(self.unit), 1):
data_tmp = self.unit[i].data[[
"atom", "index", "name", "resname", "chain",
"resid", "beta", "occupancy", "atomtype"]].values
data_tmp[:, 4] = self.chain_names[i]
data = np.concatenate((data, data_tmp))
# merge knowledge about CCS acquired by different molecules
atom_ccs = {}
for k in self.unit[i].knowledge['atom_ccs'].keys():
atom_ccs[k] = self.unit[i].knowledge['atom_ccs'][k]
if len(r) == 0:
r = self.unit[i].data['radius']
else:
r = np.concatenate((r, self.unit[i].data['radius']))
try:
if len(c) == 0:
c = self.unit[i].data['charge']
else:
c = np.concatenate((c, self.unit[i].data['charge']))
except Exception as ex:
skipcharge = True
continue
data[:, 1] = np.linspace(1, len(data), len(data)).astype(int)
cols = ["atom", "index", "name", "resname", "chain", "resid", "beta", "occupancy", "atomtype"]
idx = np.arange(len(data))
# create molecule, and push created data information
M = Molecule()
M.add_xyz(self.get_all_xyz())
M.data = pd.DataFrame(data, index=idx, columns=cols)
M.properties['center'] = M.get_center()
M.knowledge['atom_ccs'] = atom_ccs
M.data['radius'] = r
if not skipcharge:
M.data['charge'] = c
return M
#def rmsd(self, ref_index, u="*", chain="*", resid="*", atom="*", align=False):
# '''
# Calculate the RMSD between atoms of interest in all structure with respect of a reference structure.
# supposes that all multimer subunits contain the same amount of alternative coordiantes.
# These are considered as representations of monomers conformations in a possible multimer.
# :param u: number of desired unit to select in the multimer
# :param chain: selection of a specific chain name (accepts '*' as wildcard). Can also be a list or numpy array of strings.
# :param resid: residue ID of desired atoms (accepts '*' as wildcard). Can also be a list or numpy array of of int.
# :param atom: name of desired atom (accepts '*' as wildcard). Can also be a list or numpy array of strings.
# :param ref_index: index of reference structure in conformations database
# :param align: if True, structures will all be aligned (cannot be undone)
# :returns: RMSD of all structures with respect of reference structure (in a numpy array)
# '''
#
# if ref_index >= self.unit[0].coordinates.shape[0]:
# raise Exception("ERROR: requested frame %s as reference, but only %s frames are available!" %(ref_index, self.unit[0].coordinates.shape[0]))
# # select indices of atoms of interest and call overloaded method
# indices = self.atomselect(u, chain, resid, atom, get_index=True)[1]
# return super(Multimer, self).rmsd(ref_index, points_indices=indices, align=align)
[docs] def get_data(self, indices, columns):
'''
Return information about atom of interest (i.e., slice the data DataFrame)
:param indices: list of indices
:param columns: list of columns (e.g. ["resname", "resid", "chain"])
:returns: slice of molecule's data DataFrame
'''
return self.data.loc[indices, columns].values
[docs] def write_pdb(self, outname):
'''
Write a pdb of the multimeric assembly.
:param outname: name of PDB file to generate
'''
f_out = open(outname, "w")
for f in range(len(self.unit[0].coordinates)):
cnt = 1
# set current state to new frame
for j in range(0, len(self.unit), 1):
self.unit[j].set_current(f)
for j in range(0, len(self.unit), 1):
# get data about points and their properties from the desired
# protein structure
d = self.unit[j].get_pdb_data()
for i in range(0, len(self.unit[j].points), 1):
# create and write PDB lin
if d[i][2][0].isdigit():
L = '%-6s%5i %-5s%-4s%1s%4i %8.3f%8.3f%8.3f%6.2f%6.2f %2s\n' % (d[i][0], cnt, d[i][2], d[i][3], self.chain_names[j], int(d[i][5]), float(d[i][6]), float(d[i][7]), float(d[i][8]), float(d[i][9]), float(d[i][10]), d[i][11])
else:
L = '%-6s%5i %-4s%-4s%1s%4i %8.3f%8.3f%8.3f%6.2f%6.2f %2s\n' % (d[i][0], cnt, d[i][2], d[i][3], self.chain_names[j], int(d[i][5]), float(d[i][6]), float(d[i][7]), float(d[i][8]), float(d[i][9]), float(d[i][10]), d[i][11])
#L='%-6s%5i %-4s%-4s%1s%4i %8.3f%8.3f%8.3f%6.2f%6.2f %2s\n'%(d[i][0], cnt, d[i][2], d[i][3], self.chain_names[j], d[i][5], d[i][6], d[i][7], d[i][8], d[i][9], d[i][10], d[i][11])
f_out.write(L)
cnt += 1
f_out.write("TER\n")
f_out.write("END\n")
f_out.close()