Source code for multimer

# Copyright (c) 2014-2017 Matteo Degiacomi
#
# BiobOx is free software ;
# you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation ;
# either version 2 of the License, or (at your option) any later version.
# BiobOx is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY ;
# without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
# See the GNU General Public License for more details.
# You should have received a copy of the GNU General Public License along with BiobOx ;
# if not, write to the Free Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA.
#
# Author : Matteo Degiacomi, matteothomas.degiacomi@gmail.com

import numpy as np
import pandas as pd

from biobox.classes.polyhedron import Polyhedron
from biobox.classes.molecule import Molecule


[docs]class Multimer(Polyhedron): ''' Construct and manipulate a protein assembly composed of several :func:`Molecule <molecule.Molecule>` instances. Subclass of :func:`Polyhedron <polyhedron.Polyhedron>`. '''
[docs] def query(self, query_text, get_index=False): ''' ## select specific atoms in a multimer un the basis of a text query. :param query_text: string selecting atoms of interest. Uses the pandas query syntax, can access all columns in the dataframe self.data. :param get_index: if set to True, returns the indices of selected atoms in self.points array (and self.data) :returns: coordinates of the selected points (in a unique array) and, if get_index is set to true, a list of their indices in subunits' self.points array. ''' idx = self.data.query(query_text).index.values res = self.data.iloc[idx] #this is a new sliced dataframe targets = np.array(res.loc[:, ["unit", "unit_index"]].values) # append the coordinates of every unit within the query pts = np.empty([0, 3]) for u in np.unique(targets[:, 0]): pos = targets[targets[:, 0] == u, 1].astype(int) this_unit = self.unit_labels[u] pts = np.concatenate((pts, self.unit[this_unit].points[pos])) if get_index: return [pts, idx] else: return pts
[docs] def atomselect(self, u, chain, resid, atom, get_index=False, use_resname=False): ''' ## select specific atoms in a multimer providing unit, chain, residue ID and atom name. :param u: number of desired unit to select in the multimer :param chain: selection of a specific chain name (accepts '*' as wildcard). Can also be a list or numpy array of strings. :param resid: residue ID of desired atoms (accepts '*' as wildcard). Can also be a list or numpy array of of int. :param atom: name of desired atom (accepts '*' as wildcard). Can also be a list or numpy array of strings. :param get_index: if set to True, returns the indices of selected atoms in self.points array (and self.data) :param use_resname: if set to True, consider information in "res" variable as resnames, and not resids :returns: coordinates of the selected points (in a unique array) and, if get_index is set to true, a list of their indices in subunits' self.points array. ''' # extract id of units of interest if u == '*': unit_id = list(self.unit_labels.values()) else: if isinstance(u, str) or isinstance(u, int): try: unit_id = [self.unit_labels[str(u)]] except Exception as ex: raise Exception("ERROR: unit %s not found!" % u) elif isinstance(u, list) or type(u).__module__ == 'numpy': unit_id = [] for c in range(0, len(u), 1): try: unit_id.append(self.unit_labels[str(u[c])]) except Exception as ex: raise Exception("ERROR: unit %s not found!" % u[c]) else: raise Exception("ERROR: wrong type for unit selection. Should be str, int, list, or numpy") # initialize storage for indices and coordinates indices = [] pts = np.empty([0, 3]) for i in range(0, len(self.unit), 1): if i in unit_id: [pts_tmp, index_tmp] = self.unit[i].atomselect(chain, resid, atom, True, use_resname=use_resname) pts = np.concatenate((pts, pts_tmp)) indices.append(index_tmp) else: # indices of all units must be stored. If unit is not # requested, return an empty array for it indices.append([]) if get_index: return [pts, indices] else: return pts
[docs] def make_molecule(self): ''' Return a :func:`Molecule <molecule.Molecule>` object containing all the points of the assembly. Chain will indicate different units, original chain value is pushed in segment entry. :returns: :func:`Molecule <molecule.Molecule>` object ''' # create new data entry (renumber indices, reassign chain name) data = np.empty([0, 9]) atom_ccs = {} r = [] c = [] skipcharge = False for i in range(0, len(self.unit), 1): data_tmp = self.unit[i].data[[ "atom", "index", "name", "resname", "chain", "resid", "beta", "occupancy", "atomtype"]].values data_tmp[:, 4] = self.chain_names[i] data = np.concatenate((data, data_tmp)) # merge knowledge about CCS acquired by different molecules atom_ccs = {} for k in self.unit[i].knowledge['atom_ccs'].keys(): atom_ccs[k] = self.unit[i].knowledge['atom_ccs'][k] if len(r) == 0: r = self.unit[i].data['radius'] else: r = np.concatenate((r, self.unit[i].data['radius'])) try: if len(c) == 0: c = self.unit[i].data['charge'] else: c = np.concatenate((c, self.unit[i].data['charge'])) except Exception as ex: skipcharge = True continue data[:, 1] = np.linspace(1, len(data), len(data)).astype(int) cols = ["atom", "index", "name", "resname", "chain", "resid", "beta", "occupancy", "atomtype"] idx = np.arange(len(data)) # create molecule, and push created data information M = Molecule() M.add_xyz(self.get_all_xyz()) M.data = pd.DataFrame(data, index=idx, columns=cols) M.properties['center'] = M.get_center() M.knowledge['atom_ccs'] = atom_ccs M.data['radius'] = r if not skipcharge: M.data['charge'] = c return M
#def rmsd(self, ref_index, u="*", chain="*", resid="*", atom="*", align=False): # ''' # Calculate the RMSD between atoms of interest in all structure with respect of a reference structure. # supposes that all multimer subunits contain the same amount of alternative coordiantes. # These are considered as representations of monomers conformations in a possible multimer. # :param u: number of desired unit to select in the multimer # :param chain: selection of a specific chain name (accepts '*' as wildcard). Can also be a list or numpy array of strings. # :param resid: residue ID of desired atoms (accepts '*' as wildcard). Can also be a list or numpy array of of int. # :param atom: name of desired atom (accepts '*' as wildcard). Can also be a list or numpy array of strings. # :param ref_index: index of reference structure in conformations database # :param align: if True, structures will all be aligned (cannot be undone) # :returns: RMSD of all structures with respect of reference structure (in a numpy array) # ''' # # if ref_index >= self.unit[0].coordinates.shape[0]: # raise Exception("ERROR: requested frame %s as reference, but only %s frames are available!" %(ref_index, self.unit[0].coordinates.shape[0])) # # select indices of atoms of interest and call overloaded method # indices = self.atomselect(u, chain, resid, atom, get_index=True)[1] # return super(Multimer, self).rmsd(ref_index, points_indices=indices, align=align)
[docs] def get_data(self, indices, columns): ''' Return information about atom of interest (i.e., slice the data DataFrame) :param indices: list of indices :param columns: list of columns (e.g. ["resname", "resid", "chain"]) :returns: slice of molecule's data DataFrame ''' return self.data.loc[indices, columns].values
[docs] def write_pdb(self, outname): ''' Write a pdb of the multimeric assembly. :param outname: name of PDB file to generate ''' f_out = open(outname, "w") for f in range(len(self.unit[0].coordinates)): cnt = 1 # set current state to new frame for j in range(0, len(self.unit), 1): self.unit[j].set_current(f) for j in range(0, len(self.unit), 1): # get data about points and their properties from the desired # protein structure d = self.unit[j].get_pdb_data() for i in range(0, len(self.unit[j].points), 1): # create and write PDB lin if d[i][2][0].isdigit(): L = '%-6s%5i %-5s%-4s%1s%4i %8.3f%8.3f%8.3f%6.2f%6.2f %2s\n' % (d[i][0], cnt, d[i][2], d[i][3], self.chain_names[j], int(d[i][5]), float(d[i][6]), float(d[i][7]), float(d[i][8]), float(d[i][9]), float(d[i][10]), d[i][11]) else: L = '%-6s%5i %-4s%-4s%1s%4i %8.3f%8.3f%8.3f%6.2f%6.2f %2s\n' % (d[i][0], cnt, d[i][2], d[i][3], self.chain_names[j], int(d[i][5]), float(d[i][6]), float(d[i][7]), float(d[i][8]), float(d[i][9]), float(d[i][10]), d[i][11]) #L='%-6s%5i %-4s%-4s%1s%4i %8.3f%8.3f%8.3f%6.2f%6.2f %2s\n'%(d[i][0], cnt, d[i][2], d[i][3], self.chain_names[j], d[i][5], d[i][6], d[i][7], d[i][8], d[i][9], d[i][10], d[i][11]) f_out.write(L) cnt += 1 f_out.write("TER\n") f_out.write("END\n") f_out.close()