Source code for molecule

# Copyright (c) 2014-2017 Matteo Degiacomi
#
# BiobOx is free software;
# you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation;
# either version 2 of the License, or (at your option) any later version.
# BiobOx is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY;
# without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
# See the GNU General Public License for more details.
# You should have received a copy of the GNU General Public License along with BiobOx;
# if not, write to the Free Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA.
#
# Author: Matteo Degiacomi, matteothomas.degiacomi@gmail.com

import os
from copy import deepcopy
import numpy as np
import scipy.signal
import pandas as pd

# Definition of constants for later calculations
epsilon0 = 8.8542 * 10**(-12) # m**-3 kg**-1 s**4 A**2, permittivity of free space
kB = 1.3806 * 10**(-23) # m**2 kg s**-2 K**-1, Boltzmann constant
e = 1.602 * 10**(-19) # A s, elementary (electronic) charge
m = 1 * 10**(-9) # conversion factor from nm to m (1 nm = 10**-9 m)
c = 3.336 * 10**(-30) # conversion factor from Debye to C m
Na = 6.022 * 10**(23) # Avogadro's number

from biobox.classes.structure import Structure
from biobox.lib import e_density

[docs]class Molecule(Structure): ''' Subclass of :func:`Structure <structure.Structure>`, allows reading, manipulating and analyzing molecular structures. ''' chain_names = ('A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', '1', '2', '3', '4', '5', '6', '7', '8', '9', '0', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', '1', '2', '3', '4', '5', '6', '7', '8', '9', '0') def __init__(self): ''' At instantiation, properties associated to every individual atoms are stored in a pandas Dataframe self.data. The columns of the self.data have the following names: atom, index, name, resname, chain, resid, beta, occupancy, atomtype, radius, charge. self.knowledge contains a knowledge base about atoms and residues properties. Default values are: * 'residue_mass' property stores the average mass for most common aminoacids (values from Expasy website) * 'atom_vdw' vdw radius of common atoms * 'atom_mass' mass of common atoms The knowledge base can be edited. For instance, to add information about residue "TST" mass in molecule M type: M.knowledge['mass_residue']["TST"]=142.42 ''' super(Molecule, self).__init__(r=np.array([])) # knowledge base about atoms and residues properties (entry keys: # 'residue_mass', 'atom_vdw', 'atom_, mass' can be edited) self.knowledge = {} self.knowledge['residue_mass'] = {"ALA": 71.0788, "ARG": 156.1875, "ASN": 114.1038, "ASP": 115.0886, "CYS": 103.1388, "CYX": 103.1388, "GLU": 129.1155, "GLN": 128.1307, "GLY": 57.0519, "HIS": 137.1411, "HSE": 137.1411, "HSD": 137.1411, "HSP": 137.1411, "HIE": 137.1411, "HID": 137.1411, "HIP": 137.1411, "ILE": 113.1594, "LEU": 113.1594, "LYS": 128.1741, "MET": 131.1926, "MSE": 131.1926, "PHE": 147.1766, "PRO": 97.1167, "SER": 87.0782, "THR": 101.1051, "TRP": 186.2132, "TYR": 163.1760, "VAL": 99.1326} self.knowledge['atom_vdw'] = {'H': 1.20, 'N': 1.55, 'NA': 2.27, 'CU': 1.40, 'CL': 1.75, 'C': 1.70, 'O': 1.52, 'I': 1.98, 'P': 1.80, 'B': 1.85, 'BR': 1.85, 'S': 1.80, 'SE': 1.90, 'F': 1.47, 'FE': 1.80, 'K': 2.75, 'MN': 1.73, 'MG': 1.73, 'ZN': 1.39, 'HG': 1.8, 'XE': 1.8, 'AU': 1.8, 'LI': 1.8, '.': 1.8} self.knowledge['atom_ccs'] = {'H': 1.2, 'C': 1.91, 'N': 1.91, 'O': 1.91, 'P': 1.91, 'S': 1.91, '.': 1.91} self.knowledge['atom_mass'] = {"H": 1.00794, "D": 2.01410178, "HE": 4.00, "LI": 6.941, "BE": 9.01, "B": 10.811, "C": 12.0107, "N": 14.0067, "O": 15.9994, "F": 18.998403, "NE": 20.18, "NA": 22.989769, "MG": 24.305, "AL": 26.98, "SI": 28.09, "P": 30.973762, "S": 32.065, "CL": 35.453, "AR": 39.95, "K": 39.0983, "CA": 40.078, "SC": 44.96, "TI": 47.87, "V": 50.94, "CR": 51.9961, "MN": 54.938045, "FE": 55.845, "CO": 58.93, "NI": 58.6934, "CU": 63.546, "ZN": 65.409, "GA": 69.72, "GE": 72.64, "AS": 74.9216, "SE": 78.96, "BR": 79.90, "KR": 83.80, "RB": 85.47, "SR": 87.62, "Y": 88.91, "ZR": 91.22, "NB": 92.91, "MO": 95.94, "TC": 98.0, "RU": 101.07, "RH": 102.91, "PD": 106.42, "AG": 107.8682, "CD": 112.411, "IN": 114.82, "SN": 118.71, "SB": 121.76, "TE": 127.60, "I": 126.90447, "XE": 131.29, "CS": 132.91, "BA": 137.33, "PR": 140.91, "EU": 151.96, "GD": 157.25, "TB": 158.93, "W": 183.84, "IR": 192.22, "PT": 195.084, "AU": 196.96657, "HG": 200.59, "PB": 207.2, "U": 238.03} self.knowledge['atomtype'] = {"C": "C", "CA": "C", "CB": "C", "CG": "C", "CG1": "C", "CG2": "C", "CZ": "C", "CD1": "C", "CD2": "C", "CD": "C", "CE": "C", "CE1": "C", "CE2": "C", "CE3": "C", "CZ2": "C", "CZ3": "C", "CH2": "C", 
"N": "N", "NH1": "N", "NH2": "N", "NZ": "N", "NE": "N", "NE1": "N", "NE2": "N", "ND1": "N", "ND2": "N", "O": "O", "OG": "O", "OG1": "O", "OG2": "O", "OD1": "O", "OD2": "O", "OE1": "O", "OE2": "O", "OH": "O", "OXT": "O", "SD": "S", "SG": "S", "H": "H", "HA": "H", "HB1": "H", "HB2": "H", "HE1": "H", "HE2": "H", "HD1": "H", "HD2": "H", "H1": "H", "H2": "H", "H3": "H", "HH11": "H", "HH12": "H", "HH21": "H", "HH22": "H", "HG1": "H", "HG2": "H", "HE21": "H", "HE22": "H", "HD11": "H", "HD12": "H", "HD13": "H", "HD21": "H", "HD22": "H", "HG11": "H", "HG12": "H", "HG13": "H", "HG21": "H", "HG22": "H", "HG23": "H", "HZ2": "H", "HZ3": "H", "HZ": "H", "HA1": "H", "HA2": "H", "HB": "H", "HD3": "H", "HG": "H", "HZ1": "H", "HE3": "H", "HB3": "H", "HH1": "H", "HH2": "H", "HD23": "H", "HD13": "H", "HE": "H", "HH": "H", "OC1": "O", "OC2": "O", "OW": "O", "HW1": "H", "HW2": "H", "CH3" : "C", "HH31" : "H", "HH32" : "H", "HH33" : "H", "C00" : "C", "C01" : "C", "C02" : "C", "C04" : "C", "C06" : "C", "C08" : "C", "H03" : "H", "H05" : "H", "H07" : "H", "H09" : "H", "H0A" : "H", "H0B" : "H", } def __add__(self, other): from biobox.classes.multimer import Multimer M = Multimer() M.load_list([self, other], ["A", "B"]) M2 = M.make_molecule() return M2
[docs]    def know(self, prop):
        '''
        return information from the knowledge base

        :param prop: desired property to extract from knowledge base
        :returns: value associated to the requested property; an exception is raised if the property is unknown
        '''
        if str(prop) in self.knowledge:
            return self.knowledge[str(prop)]
        else:
            raise Exception("entry %s not found in knowledge base!" % prop)
[docs] def import_pdb(self, pdb, include_hetatm=False): ''' read a pdb (possibly containing containing multiple models). Models are split according to ENDMDL and END statement. All alternative coordinates are expected to have the same atoms. After loading, the first model (M.current_model=0) will be set as active. :param pdb: PDB filename :param include_hetatm: if True, HETATM will be included (they get skipped if False) ''' try: f_in = open(pdb, "r") except Exception as ex: raise Exception('ERROR: file %s not found!' % pdb) # store filename self.properties["filename"] = pdb data_in = [] p = [] r = [] e = [] alternative = [] biomt = [] symm = [] for line in f_in: record = line[0:6].strip() # load biomatrix, if any is present if "REMARK 350 BIOMT" in line: try: biomt.append(line.split()[4:8]) except Exception as ex: raise Exception("ERROR: biomatrix format seems corrupted") # load symmetry matrix, if any is present if "REMARK 290 SMTRY" in line: try: symm.append(line.split()[4:8]) except Exception as ex: raise Exception("ERROR: symmetry matrix format seems corrupted") # if a complete model was parsed store all the saved data into # self.data entries (if needed) and temporary alternative # coordinates list if record == "ENDMDL" or record == "END": if len(alternative) == 0: # load all the parsed data in superclass data (Dataframe) # and points data structures try: #building dataframe data = np.array(data_in).astype(str) cols = ["atom", "index", "name", "resname", "chain", "resid", "beta", "occupancy", "atomtype"] idx = np.arange(len(data)) self.data = pd.DataFrame(data, index=idx, columns=cols) # Set the index numbers to the idx values to avoid hexadecimal counts self.data["index"] = idx except Exception as ex: raise Exception('ERROR: something went wrong when loading the structure %s!\nERROR: are all the columns separated?' %pdb) # saving vdw radii try: self.data['radius'] = np.array(r) except Exception as ex: raise Exception('ERROR: something went wrong when loading the structure %s!\nERROR: are all the columns separated?' %pdb) # save default charge state self.data['charge'] = np.array(e) # save 3D coordinates of every atom and restart the accumulator try: if len(p) > 0: alternative.append(np.array(p)) p = [] except Exception as ex: raise Exception('ERROR: something went wrong when loading the structure %s!\nERROR: are all the columns separated?' % pdb) if record == 'ATOM' or (include_hetatm and record == 'HETATM'): # extract xyz coordinates (save in list of point coordinates) p.append([float(line[30:38]), float(line[38:46]), float(line[46:54])]) # if no complete model has been yet parsed, load also # information about atoms(resid, resname, ...) 
if len(alternative) == 0: w = [] # extract ATOM/HETATM statement w.append(line[0:6].strip()) w.append(line[6:12].strip()) # extract atom index w.append(line[12:17].strip()) # extract atomname w.append(line[17:21].strip()) # extract resname w.append(line[21].strip()) # extract chain name w.append(line[22:26].strip()) # extract residue id # extract occupancy try: w.append(float(line[54:60])) except Exception as ex: w.append(1.0) # extract beta factor try: # w.append("{0.2f}".format(float(line[60:66]))) w.append(float(line[60:66])) except Exception as ex: w.append(0.0) # extract atomtype try: w.append(line[76:78].strip()) except Exception as ex: w.append("") # use atomtype to extract vdw radius try: r.append(self.know('atom_vdw')[line[76:78].strip()]) except Exception as ex: r.append(self.know('atom_vdw')['.']) # assign default charge state of 0 e.append(0.0) data_in.append(w) f_in.close() # if p list is not empty, that means that the PDB file does not finish with an END statement (like the ones generated by SBT, for instance). # In this case, dump all the remaining stuff into alternate coordinates # array and (if needed) into properties dictionary. if len(p) > 0: # if no model has been yet loaded, save also information in # properties dictionary. if len(alternative) == 0: # load all the parsed data in superclass properties['data'] and # points data structures try: #building dataframe data = np.array(data_in).astype(str) cols = ["atom", "index", "name", "resname", "chain", "resid", "beta", "occupancy", "atomtype"] idx = np.arange(len(data)) self.data = pd.DataFrame(data, index=idx, columns=cols) # Set the index numbers to the idx values to avoid hexadecimal counts self.data["index"] = idx except Exception as ex: raise Exception('ERROR: something went wrong when saving data in %s!\nERROR: are all the columns separated?' %pdb) try: self.data['radius'] = np.array(r) except Exception as ex: raise Exception('ERROR: something went wrong when saving van der Waals radii in %s!\nERROR: are all the columns separated?' % pdb) # save default charge state self.properties['charge'] = np.array(e) # save 3D coordinates of every atom and restart the accumulator try: if len(p) > 0: alternative.append(np.array(p)) p = [] except Exception as ex: raise Exception('ERROR: something went wrong when saving coordinates in %s!\nERROR: are all the columns separated?' %pdb) # transform the alternative temporary list into a nice multiple # coordinates array if len(alternative) > 0: try: alternative_xyz = np.array(alternative).astype(float) except Exception as e: alternative_xyz = np.array([alternative[0]]).astype(float) print('WARNING: found %s models, but their atom count differs' % len(alternative)) print('WARNING: treating only the first model in file %s' % pdb) #raise Exception('ERROR: models appear not to have the same amount of atoms') self.add_xyz(alternative_xyz) else: raise Exception('ERROR: something went wrong when saving alternative coordinates in %s!\nERROR: no model was loaded... are ENDMDL statements there?' % pdb) # if biomatrix information is provided, creat if len(biomt) > 0: # test whether there are enough lines to create biomatrix # statements if np.mod(len(biomt), 3): raise Exception('ERROR: found %s BIOMT entries. 
A multiple of 3 is expected'%len(biomt)) b = np.array(biomt).astype(float).reshape((len(biomt) // 3, 3, 4)) self.properties["biomatrix"] = b # if symmetry information is provided, create entry in properties if len(symm) > 0: # test whether there are enough lines to create biomatrix # statements if np.mod(len(symm), 3): raise Exception('ERROR: found %s SMTRY entries. A multiple of 3 is expected'%len(symm)) b = np.array(symm).astype(float).reshape((len(symm) // 3, 3, 4)) self.properties["symmetry"] = b #correctly set types of columns requiring other than string self.data["resid"] = self.data["resid"].astype(int) self.data["index"] = self.data["index"].astype(int) self.data["occupancy"] = self.data["occupancy"].astype(float) self.data["beta"] = self.data["beta"].astype(float)
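# Usage sketch (illustrative addition, not part of the original BiobOx source):
# loading a (multi-model) PDB file and inspecting the per-atom DataFrame.
# "protein.pdb" is a hypothetical filename.
from biobox.classes.molecule import Molecule

M = Molecule()
M.import_pdb("protein.pdb", include_hetatm=False)
print(M.data.columns.tolist())    # atom, index, name, resname, chain, resid, ...
print(len(M.coordinates))         # number of models found in the file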
[docs] def import_pqr(self, pqr, include_hetatm=False): ''' Read a pqr (possibly containing containing multiple models). models are split according to ENDMDL and END statement. All alternative coordinates are expected to have the same atoms. After loading, the first model (M.current_model=0) will be set as active. :param pqr: PQR filename :param include_hetatm: if True, HETATM will be included (they get skipped if False) ''' try: f_in = open(pqr, "r") except Exception as ex: raise Exception('ERROR: file %s not found!' % pqr) # store filename self.properties["filename"] = pqr data_in = [] p = [] # collects coordinates for every model r = [] # vdW radii e = [] # electrostatics alternative = [] for line in f_in: record = line[0:6].strip() # if a complete model was parsed store all the saved data into # self.properties entries (if needed) and temporary alternative # coordinates list if record == "ENDMDL" or record == "END": if len(alternative) == 0: # load all the parsed data in superclass properties['data'] # and points data structures try: #building dataframe data = np.array(data_in).astype(str) cols = ["atom", "index", "name", "resname", "chain", "resid", "beta", "occupancy", "atomtype"] idx = np.arange(len(data)) self.data = pd.DataFrame(data, index=idx, columns=cols) self.data["index"] = idx # convert to internal numbering system except Exception as ex: raise Exception('ERROR: something went wrong when loading the structure %s!\nERROR: are all the columns separated?' %pqr) # saving vdw radii try: self.data['radius'] = np.array(r) except Exception as ex: raise Exception('ERROR: something went wrong when loading the structure %s!\nERROR: are all the columns separated?' %pqr) # saving electrostatics try: self.data['charge'] = np.array(e) except Exception as ex: raise Exception('ERROR: something went wrong when loading the structure %s!\nERROR: are all the columns separated?' % pqr) # save 3D coordinates of every atom and restart the accumulator try: if len(p) > 0: alternative.append(np.array(p)) p = [] except Exception as ex: raise Exception('ERROR: something went wrong when loading the structure %s!\nERROR: are all the columns separated?' %pqr) if record == 'ATOM' or (include_hetatm and record == 'HETATM'): # extract xyz coordinates (save in list of point coordinates) p.append([float(line[30:38]), float(line[38:46]), float(line[46:54])]) # if no complete model has been yet parsed, load also # information about atoms(resid, resname, ...) if len(alternative) == 0: # extract charge try: # 54 is separator, 55 is plus/minus e.append(float(line[54:62])) except Exception as ex: e.append(0.0) # extract vdW radius try: r.append(float(line[62:69])) except Exception as ex: r.append(self.know('atom_vdw')['.']) # initialize list w = [] # extract ATOM/HETATM statement w.append(line[0:6].strip()) w.append(line[6:11].strip()) # extract atom index w.append(line[12:17].strip()) # extract atomname w.append(line[17:20].strip()) # extract resname w.append(line[21].strip()) # extract chain name w.append(line[22:26].strip()) # extract residue id # extract occupancy w.append('1') # extract beta factor w.append('0') # extract atomtype from atomname in BMRB notation # http://www.bmrb.wisc.edu/ref_info/atom_nom.tbl w.append(line[12:17].strip()[0]) # w.append(line[76:78].strip()) data_in.append(w) f_in.close() # if p list is not empty, that means that the pqr file does not finish with an END statement (like the ones generated by SBT, for instance). 
# In this case, dump all the remaining stuff into alternate coordinates # array and (if needed) into properties dictionary. if len(p) > 0: # if no model has been yet loaded, save also information in # properties dictionary. if len(alternative) == 0: # load all the parsed data in superclass properties['data'] and # points data structures try: #building dataframe data = np.array(data_in).astype(str) cols = ["atom", "index", "name", "resname", "chain", "resid", "beta", "occupancy", "atomtype"] idx = np.arange(len(data)) self.data = pd.DataFrame(data, index=idx, columns=cols) self.data["index"] = idx # convert to internal numbering system except Exception as ex: raise Exception('ERROR: something went wrong when saving data in %s!\nERROR: are all the columns separated?' % pqr) try: self.data['radius'] = np.array(r) except Exception as ex: raise Exception('ERROR: something went wrong when saving van der Waals radii in %s!\nERROR: are all the columns separated?' %pqr) # save 3D coordinates of every atom and restart the accumulator try: if len(p) > 0: alternative.append(np.array(p)) p = [] except Exception as ex: raise Exception('ERROR: something went wrong when saving coordinates in %s!\nERROR: are all the columns separated?' %pqr) # transform the alternative temporary list into a nice multiple # coordinates array if len(alternative) > 0: try: alternative_xyz = np.array(alternative).astype(float) except Exception as ex: alternative_xyz = np.array([alternative[0]]).astype(float) print('WARNING: found %s models, but their atom count differs' % len(alternative)) print('WARNING: treating only the first model in file %s' % pqr) #raise Exception('ERROR: models appear not to have the same amount of atoms') self.add_xyz(alternative_xyz) else: raise Exception('ERROR: something went wrong when saving alternative coordinates in %s!\nERROR: no model was loaded... are ENDMDL statements there?' % pqr) #correctly set types of columns requiring other than string self.data["resid"] = self.data["resid"].astype(int) self.data["index"] = self.data["index"].astype(int) self.data["occupancy"] = self.data["occupancy"].astype(float) self.data["beta"] = self.data["beta"].astype(float)
[docs]    def assign_atomtype(self):
        '''
        guess atomtype from atom names
        '''
        a_type = []
        for i in range(0, len(self.data), 1):
            atom = self.data["name"].values[i]
            try:
                a_type.append(self.knowledge["atomtype"][atom])
            except Exception as ex:
                a_type.append("")

        self.data["atomtype"] = a_type
[docs] def get_vdw_density(self, buff=3, step=0.5, kernel_half_width=10): ''' generate density map from points based on the van der Waals readius of the atoms :param buff: Buffer used to create the boundaries of the density map :param step: Stepsize for creating the density object :param kernel_half_width: Kernel half width of the gaussian kernel, will be scaled by atom specific sigma :returns: :func:`Density <density.Density>` object, containing a density map ''' atomdata = [["C", 1.7, 1.455, 0.51], ["H", 1.2, 0.72, 0.25], ["O", 1.52, 1.15, 0.42], ["S", 1.8, 1.62, 0.54], ["N", 1.55, 1.2, 0.44]] dens = [] for d in atomdata: # select atomtype and point only to those coordinates pts = self.points[[self.data["atomtype"].values == d[0]]] # if there are atoms from a certain type if len(pts) > 0: # use the standard density calculation with a atom-type # specific sigma value D = self.get_density(buff=buff, step=step, kernel_half_width=kernel_half_width, sigma=d[2]) # export the np-array density map for summing d_tmp = D.properties["density"] # print 'atom %s has a maximum density of %s, occurs %s times in the protein and the density map has a shape of %s'%(d[0], np.max(d_tmp), len(pts), d_tmp.shape) # print 'one entry in d_tmp: \n%s'%d_tmp[35][27][20] # print np.unravel_index(d_tmp.argmax(), d_tmp.shape) # initialize the dens-list if it's empty if len(dens) == 0: dens = deepcopy(d_tmp) # sum up the densities from all atom types else: dens += d_tmp # dens is a 3d numpy array with point intensities # print 'one entry in dens is: \n%s'%dens[35][27][20] from biobox.classes.density import Density D = Density() D.properties['density'] = dens D.properties['size'] = np.array(dens.shape) D.properties['origin'] = np.mean(pts, axis=0) - step * np.array(dens.shape) / 2.0 D.properties['delta'] = np.identity(3) * step D.properties['format'] = 'dx' D.properties['filename'] = '' D.properties["sigma"] = np.std(dens) return D
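# Usage sketch (illustrative addition, not part of the original BiobOx source):
# building a van der Waals pseudo-density map. Atomtypes must be populated first;
# assign_atomtype() is called here in case the PDB lacks element information.
# "protein.pdb" is a hypothetical filename.
from biobox.classes.molecule import Molecule

M = Molecule()
M.import_pdb("protein.pdb")
M.assign_atomtype()
D = M.get_vdw_density(buff=3, step=0.5, kernel_half_width=10)
print(D.properties["density"].shape, D.properties["sigma"])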
[docs] def get_electrostatics(self, step=1.0, buff=3, threshold=0.01, vdw_kernel_half_width=5, elect_kernel_half_width=12, chain='*', clear_mass=True): ''' generate electrostatics map from points :param step: size of cubic voxels, in Angstrom :param buff: padding to add at points cloud boundaries :param threshold: Threshold used for removing mass occupied space from the electron density map :param vdw_kernel_half_width: kernel half width, in voxels :param elect_kernel_half_width: kernel half width, in voxels :param chain: select chain to use, default all chains :returns: positive :func:`Density <density.Density>` object :returns: negative :func:`Density <density.Density>` object :returns: mass density object ''' pts, idx = self.atomselect(chain, '*', '*', get_index=True) try: # numpy array of charges [c1, c2, c3, ...] charges = self.data['charge'].values[idx] except Exception as e: raise Exception('ERROR: No charges associated with %s' % self) charges = np.reshape(charges, (len(charges), 1)) # numpy 2d-array of shape (:, 4): [[x, y, z, charge], [x, y, z, # charge], ...] c_atoms = np.hstack((pts, charges)) k = 8.9875517873681764 # Coulomb's constant in nN # rectangular box boundaries bnds = np.array([[np.min(pts[:, 0]) - buff, np.max(pts[:, 0]) + buff], [np.min(pts[:, 1]) - buff, np.max(pts[:, 1]) + buff], [np.min(pts[:, 2]) - buff, np.max(pts[:, 2]) + buff]]) xax = np.arange(bnds[0, 0], bnds[0, 1] + step, step) yax = np.arange(bnds[1, 0], bnds[1, 1] + step, step) zax = np.arange(bnds[2, 0], bnds[2, 1] + step, step) # create empty box d = np.zeros((len(xax), len(yax), len(zax))) # place Kronecker deltas in mesh grid -> discretes point coordinates # into mesh grid with an intensity of charge for atom in c_atoms: xpos = np.argmin(np.abs(xax - atom[0])) ypos = np.argmin(np.abs(yax - atom[1])) zpos = np.argmin(np.abs(zax - atom[2])) d[xpos, ypos, zpos] = atom[3] # initialize 3d kernel_box # how many steps are needed to reach kernel half width in angstroms l_kernel = int(2 * elect_kernel_half_width / step) # the kernel is an empty box... kernel = np.zeros((l_kernel, l_kernel, l_kernel)) it = np.nditer(kernel, flags=['multi_index']) while not it.finished: x_index = it.multi_index[0] # indices y_index = it.multi_index[1] z_index = it.multi_index[2] x_coord = x_index * step - (l_kernel / 2) # real space coordinates y_coord = y_index * step - (l_kernel / 2) z_coord = z_index * step - (l_kernel / 2) distance = np.sqrt(x_coord *x_coord + y_coord *y_coord + z_coord *z_coord) if distance > 0.9: # ... 
where a hyperbola will be created only outside of H-vdW and inside of relevant kernel-half-width distance kernel[x_index, y_index, z_index] = k / distance it.iternext() # convolve point mesh with 3d coulomb hyperbola e = scipy.signal.fftconvolve(d, kernel, mode='same') # define mass-occupied space # mass_density is Density-object, buff=buff makes shure that the # density maps have the same dimensions mass_density = self.get_vdw_density( buff=buff, step=step, kernel_half_width=vdw_kernel_half_width) if clear_mass: d_map = mass_density.return_density_map() # initialize counter to count the amount of removed points i = 0 # initialize the iterator for said function it = np.nditer(e, flags=['multi_index']) while not it.finished: x_index = it.multi_index[0] # indices y_index = it.multi_index[1] z_index = it.multi_index[2] x_coord = xax[x_index] # real space coordinates y_coord = yax[y_index] z_coord = zax[z_index] if d_map[x_index, y_index, z_index] > threshold: i += 1 e[x_index, y_index, z_index] = 0 it.iternext() print('removed %s points due to van der Waals clashing' % i) # split the density into two maps e_pos = deepcopy(e) e_neg = deepcopy(e) e_pos[np.where(e_pos < 0)] = 0 e_neg[np.where(e_neg >= 0)] = 0 # changes sign of negative array for better visualization e_neg = np.negative(e_neg) # prepare density data structure for both positive and negative maps at # once from biobox.classes.density import Density D_pos = Density() D_neg = Density() D_pos.properties['density'] = e_pos D_neg.properties['density'] = e_neg D_pos.properties['size'] = D_neg.properties['size'] = np.array(e.shape) D_pos.properties['origin'] = D_neg.properties['origin'] = np.mean(self.points, axis=0) - step * np.array(e.shape) / 2.0 D_pos.properties['delta'] = D_neg.properties['delta'] = np.identity(3) * step D_pos.properties['format'] = D_neg.properties['format'] = 'dx' D_pos.properties['filename'] = D_neg.properties['filename'] = '' D_pos.properties["sigma"] = np.std(e_pos) D_neg.properties["sigma"] = np.std(e_neg) return D_pos, D_neg, mass_density
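# Usage sketch (illustrative addition, not part of the original BiobOx source):
# per-atom charges are required by get_electrostatics, so the structure is loaded
# from a PQR file here. "protein.pqr" is a hypothetical filename.
from biobox.classes.molecule import Molecule

M = Molecule()
M.import_pqr("protein.pqr")
D_pos, D_neg, D_mass = M.get_electrostatics(step=1.0, buff=3)
print(D_pos.properties["density"].max(), D_neg.properties["density"].max())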
def _apply_matrices(self, mats): ''' replicate molecule according to list of transformation matrices ''' xyzall = [] #data = [] # create new data entry (renumber indices, reassign chain name) data = np.empty([0, 9]) # replicate original coordinates applying transformations cnt = 0 for m in mats: xyz2 = self.points.copy() + m[:, 3] # translate xyz2 = np.dot(xyz2, m[:, 0:3]) # rotate xyzall.extend(xyz2) # merge #assign a specific name to the new chain #newdata = deepcopy(self.data) #newdata["chain"] = self.chain_names[cnt] #if cnt == 0: # data = [newdata] #else: # data.append(newdata) data_tmp = self.data[[ "atom", "index", "name", "resname", "chain", "resid", "beta", "occupancy", "atomtype"]].values data_tmp[:, 4] = self.chain_names[cnt] data = np.concatenate((data, data_tmp)) cnt += 1 # temporary vectorized hexadecimal maker, in case there are more than # 9999 atoms def dohex(number): return hex(number).split('x')[1] vhex = np.vectorize(dohex) indices = np.linspace(1, len(data), len(data)).astype(int) idx = vhex(indices) cols = ["atom", "index", "name", "resname", "chain", "resid", "beta", "occupancy", "atomtype"] M2 = Molecule() M2.coordinates = np.array([xyzall]) M2.data = pd.DataFrame(data, index=idx, columns=cols) M2.properties['center'] = M2.get_center() return M2
[docs]    def apply_biomatrix(self):
        '''
        if biomatrix information is provided, generate a new molecule with all symmetry operators applied.

        :returns: new Molecule containing several copies of the current Molecule, arranged according to BIOMT statements contained in the pdb. An exception is raised if no transformation matrix is available.
        '''
        # if no biomatrix statement is found, raise an error
        if "biomatrix" not in self.properties:
            raise Exception("ERROR: no biomatrix found in pdb %s" %self.properties["filename"])

        return self._apply_matrices(self.properties["biomatrix"])
[docs]    def apply_symmetry(self):
        '''
        if symmetry information is provided, generate a new molecule with all symmetry operators applied.

        :returns: new Molecule containing several copies of the current Molecule, arranged according to SMTRY statements contained in the pdb. An exception is raised if no transformation matrix is available.
        '''
        # if no symmetry statement is found, raise an error
        if "symmetry" not in self.properties:
            raise Exception("ERROR: no symmetry matrix found in pdb %s" %self.properties["filename"])

        return self._apply_matrices(self.properties["symmetry"])
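# Usage sketch (illustrative addition, not part of the original BiobOx source):
# generating a biological assembly from REMARK 350 BIOMT records, when present in
# the PDB. "asymmetric_unit.pdb" is a hypothetical filename.
from biobox.classes.molecule import Molecule

M = Molecule()
M.import_pdb("asymmetric_unit.pdb")
if "biomatrix" in M.properties:
    assembly = M.apply_biomatrix()
    print(assembly.coordinates.shape)   # (1, n_copies * n_atoms, 3)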
[docs] def import_gro(self, filename): ''' read a gro possibly containing multiple structures. :param filename: name of .gro file to import ''' if not os.path.isfile(filename): raise Exception("ERROR: %s not found!" % filename) self.clear() # print "\n> loading %s..."%filename fin = open(filename, "r") line = fin.readline() d_data = [] b = [] while line: cnt = 0 d = [] atoms = int(fin.readline()) d_data = [] while cnt < atoms: w = fin.readline() # Read array as defined by .gro style characters (res int, res, atomtype, int, x_coord, y_coord, z_coord) w = [w[0:5].strip(), w[5:10].strip(), w[10:15].strip(), w[15:20].strip(), w[20:28].strip(), w[28:36].strip(), w[36:44].strip()] resname = w[1]; resnumber=w[0] # read data useful for indexing (guess what is missing) d_data.append(["ATOM", w[3], w[2], resname, "A", resnumber, "0.0", "0.0", ""]) d.append([w[4], w[5], w[6]]) cnt += 1 # add one conformation (in Angstrom) and store its box size in # temporary list self.add_xyz(np.array(d).astype(float) * 10) b.append(fin.readline().split()) line = fin.readline() # attempt to get header of next frame # store data information and box size for every frame self.properties['box'] = np.array(b).astype(float) * 10 #building dataframe data = np.array(d_data).astype(str) cols = ["atom", "index", "name", "resname", "chain", "resid", "beta", "occupancy", "atomtype"] idx = np.arange(len(data)) self.data = pd.DataFrame(data, index=idx, columns=cols) #add additional information about van der waals radius and atoms charge self.data['radius'] = np.ones(len(d_data)) * self.know('atom_vdw')['.'] self.data['charge'] = np.zeros(len(d_data)) fin.close()
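# Usage sketch (illustrative addition, not part of the original BiobOx source):
# loading a (multi-frame) .gro file. Since .gro files carry no element column,
# atomtypes are guessed afterwards. "trajectory.gro" is a hypothetical filename.
from biobox.classes.molecule import Molecule

M = Molecule()
M.import_gro("trajectory.gro")
M.assign_atomtype()               # fill the empty "atomtype" column from atom names
print(len(M.coordinates), "frames loaded")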
[docs]    def get_atoms_ccs(self):
        '''
        return array with atomic CCS radii of every atom in molecule

        :returns: CCS radii, in Angstrom
        '''
        if "atom_ccs" in self.data.columns:
            return self.data["atom_ccs"]

        ccs = np.ones(len(self.points)) * self.know("atom_ccs")["."]
        for e in self.know("atom_ccs").keys():
            if e != ".":
                ccs[self.data["atomtype"].values == e] = self.knowledge["atom_ccs"][e]

        self.data["atom_ccs"] = ccs
        return ccs
[docs] def get_data(self, indices=[], columns=[]): ''' Return information about atoms of interest (i.e., slice the data DataFrame) :param indices: list of indices, if not provided all atom data is returned :param columns: list of columns (e.g. ["resname", "resid", "chain"]), if not provided all columns are returned :returns: numpy array containing a slice of molecule's data ''' if len(indices) == 0 and len(columns) == 0: return self.data.values elif len(indices) == 0 and len(columns) != 0: return self.data[columns].values elif len(indices) != 0 and len(columns) == 0: return self.data.loc[indices].values else: return self.data.loc[indices, columns].values
[docs]    def set_data(self, value, indices=[], columns=[]):
        '''
        Write information about atoms of interest (i.e., assign into a slice of the data DataFrame)

        :param value: value (or array of values) to assign
        :param indices: list of indices; if not provided, all atoms in the selected columns are updated
        :param columns: list of columns (e.g. ["resname", "resid", "chain"]); if not provided, all columns of the selected atoms are updated
        '''
        if len(indices) == 0 and len(columns) == 0:
            raise Exception("indices, columns or both should be provided")
        elif len(indices) == 0 and len(columns) != 0:
            self.data[columns] = value
        elif len(indices) != 0 and len(columns) == 0:
            self.data.loc[indices] = value
        else:
            self.data.loc[indices, columns] = value
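# Usage sketch (illustrative addition, not part of the original BiobOx source):
# reading and writing slices of the per-atom DataFrame. "protein.pdb" and the chain
# name "X" are hypothetical.
from biobox.classes.molecule import Molecule

M = Molecule()
M.import_pdb("protein.pdb")
names = M.get_data(columns=["resname", "resid"])        # all atoms, two columns
M.set_data("X", indices=[0, 1, 2], columns=["chain"])   # rename chain of three atoms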
[docs]    def query(self, query_text, get_index=False):
        '''
        Select specific atoms on the basis of a text query.

        :param query_text: string selecting atoms of interest. Uses the pandas query syntax, and can access all columns in the dataframe self.data.
        :param get_index: if set to True, returns the indices of selected atoms in self.points array (and self.data)
        :returns: coordinates of the selected points (in a unique array) and, if get_index is set to True, a list of their indices in the self.points array.
        '''
        idx = self.data.query(query_text).index.values
        if get_index:
            return [self.points[idx], idx]
        else:
            return self.points[idx]
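# Usage sketch (illustrative addition, not part of the original BiobOx source):
# pandas-style selection of all alpha-carbons in chain A with residue id below 100.
# "protein.pdb" is a hypothetical filename.
from biobox.classes.molecule import Molecule

M = Molecule()
M.import_pdb("protein.pdb")
pts, idx = M.query('chain == "A" and name == "CA" and resid < 100', get_index=True)
print(len(idx), "atoms selected")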
[docs] def atomselect(self, chain, res, atom, get_index=False, use_resname=False): ''' Select specific atoms in the protein providing chain, residue ID and atom name. :param chain: selection of a specific chain name (accepts * as wildcard). Can also be a list or numpy array of strings. :param res: residue ID of desired atoms (accepts * as wildcard). Can also be a list or numpy array of of int. :param atom: name of desired atom (accepts * as wildcard). Can also be a list or numpy array of strings. :param get_index: if set to True, returns the indices of selected atoms in self.points array (and self.data) :param use_resname: if set to True, consider information in "res" variable as resnames, and not resids :returns: coordinates of the selected points and, if get_index is set to true, their indices in self.points array. ''' # chain name boolean selector if isinstance(chain, str): if chain == '*': chain_query = np.array([True] * len(self.points)) else: chain_query = self.data["chain"].values == chain elif isinstance(chain, list) or type(chain).__module__ == 'numpy': chain_query = self.data["chain"].values == chain[0] for c in range(1, len(chain), 1): chain_query = np.logical_or(chain_query, self.data["chain"].values == chain[c]) else: raise Exception("ERROR: wrong type for chain selection. Should be str, list, or numpy") if isinstance(res, str): if res == '*': res_query = np.array([True] * len(self.points)) elif use_resname: res_query = self.data["resname"].values == res else: res_query = self.data["resid"].values == res elif isinstance(res, int): if use_resname: res_query = self.data["resname"].values == str(res) else: res_query = self.data["resid"].values == res elif isinstance(res, list) or type(res).__module__ == 'numpy': if use_resname: res_query = self.data["resname"].values == str(res[0]) else: res_query = self.data["resid"].values == res[0] for r in range(1, len(res), 1): if use_resname: res_query = np.logical_or(res_query, self.data["resname"].values == str(res[r])) else: res_query = np.logical_or(res_query, self.data["resid"].values == res[r]) else: raise Exception("ERROR: wrong type for resid selection. Should be int, list, or numpy") # atom name boolean selector if isinstance(atom, str): if atom == '*': atom_query = np.array([True] * len(self.points)) else: atom_query = self.data["name"].values == atom elif isinstance(atom, list) or type(atom).__module__ == 'numpy': atom_query = self.data["name"].values == atom[0] for a in range(1, len(atom), 1): atom_query = np.logical_or(atom_query, self.data["name"].values == atom[a]) else: raise Exception("ERROR: wrong type for atom selection. Should be str, list, or numpy") # slice data array and return result (colums 5 to 7 contain xyz coords) query = np.logical_and(np.logical_and(chain_query, res_query), atom_query) if get_index: return [self.points[query], np.where(query == True)[0]] else: return self.points[query]
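# Usage sketch (illustrative addition, not part of the original BiobOx source):
# selecting atoms by chain, residue id and atom name; wildcards and lists are
# accepted. "protein.pdb" is a hypothetical filename.
from biobox.classes.molecule import Molecule

M = Molecule()
M.import_pdb("protein.pdb")
ca_xyz = M.atomselect("A", "*", "CA")                                   # chain A alpha-carbons
xyz, idx = M.atomselect("*", [1, 2, 3], ["N", "CA", "C"], get_index=True)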
[docs] def atomignore(self, chain, res, atom, get_index=False, use_resname=False): ''' Select specific atoms that do not match a specific query (chain, residue ID and atom name). Useful to remove from a molecule atoms unwanted for further analysis, alternative conformations, etc... :param chain: chain name (accepts * as wildcard). Can also be a list or numpy array of strings. :param res: residue ID (accepts * as wildcard). Can also be a list or numpy array of of int. :param atom: atom name (accepts * as wildcard). Can also be a list or numpy array of strings. :param get_index: if set to True, returns the indices of atoms in self.points array (and self.data) :param use_resname: if set to True, consider information in "res" variable as resnames, and not resids :returns: coordinates of the selected points not matching the query, if get_index is set to true, their indices in self.points array. ''' #extract indices of atoms matching the query idxs = self.atomselect(chain, res, atom, get_index=True, use_resname=use_resname)[1] #invert the selection idxs2 = [] for i in range(len(self.points)): if i not in idxs: idxs2.append(i) if get_index: return [self.points[idxs2], np.array(idxs2)] else: return self.points[idxs2]
[docs]    def same_residue(self, index, get_index=False):
        '''
        Select atoms having the same residue and chain as a given atom (or list of atoms)

        :param index: indices of atoms of choice (integer or list of integers)
        :param get_index: if set to True, returns the indices of selected atoms in self.points array (and self.data)
        :returns: coordinates of the selected points and, if get_index is set to True, their indices in self.points array.
        '''
        D = self.data.values
        l = D[index]
        if len(l.shape) == 1:
            l = l.reshape(1, len(l))

        test = np.logical_and(D[:, 4] == l[:, 4], D[:, 5] == l[:, 5])

        idxs = np.where(test)[0]
        if len(idxs) > 0:
            pts = self.points[idxs]
        else:
            pts = []

        if get_index:
            return pts, idxs
        else:
            return pts
[docs] def same_residue_unique(self, index, get_index=False): ''' Select atoms having the same residue and chain as a given atom (or list of atoms) :param index: indices of atoms of choice (integer of list of integers) :param get_index: if set to True, returns the indices of selected atoms in self.points array (and self.data) :returns: coordinates of the selected points and, if get_index is set to true, their indices in self.points array. ''' try: test = len(index) # this should fail if index is a number idlist = index except Exception as e: idlist = [index] D = self.data.values pts = [] idxs = [] for i in idlist: done = False j = 0 # starting from same point while not done: if i - j < 0: done = True elif D[i, 4] == D[i - j, 4] and D[i, 5] == D[i - j, 5]: if len(idxs) != 0 and i - j not in idxs: pts.append(self.points[i - j]) idxs.append(i - j) elif i - j not in idxs: pts = [self.points[i - j].copy()] idxs = [i - j] j += 1 else: done = True j = 1 done = False while not done: if i + j == len(self.points): done = True elif D[i, 4] == D[i + j, 4] and D[i, 5] == D[i + j, 5]: if len(idxs) != 0 and i + j not in idxs: pts.append(self.points[i + j]) idxs.append(i + j) elif i + j not in idxs: pts = [self.points[i + j].copy()] idxs = [i + j] j += 1 else: done = True if get_index: return np.array(pts), np.array(idxs) else: return np.array(pts)
[docs]    def get_subset(self, idxs, conformations=[]):
        '''
        Return a :func:`Molecule <molecule.Molecule>` object containing only the selected atoms and frames

        :param idxs: indices of atoms to extract
        :param conformations: frames to extract (by default, all)
        :returns: :func:`Molecule <molecule.Molecule>` object
        '''
        # if a subset of all available frames is requested to be written,
        # select them first
        if len(conformations) == 0:
            frames = range(0, len(self.coordinates), 1)
        else:
            if np.max(conformations) < len(self.coordinates):
                frames = conformations
            else:
                raise Exception("ERROR: requested coordinate index %s, but only %s are available" %(np.max(conformations), len(self.coordinates)))

        idx = np.arange(len(idxs))

        # create molecule, and push created data information
        M = Molecule()
        postmp = self.coordinates[:, idxs]
        M.coordinates = postmp[frames]
        M.data = self.data.loc[idxs]
        M.data = M.data.reset_index(drop=True)
        M.data["index"] = idx
        M.current = 0
        M.points = M.coordinates[M.current]

        M.properties['center'] = M.get_center()

        return M
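# Usage sketch (illustrative addition, not part of the original BiobOx source):
# extracting a backbone-only copy of the molecule and writing it to a new PDB file.
# Filenames are hypothetical.
from biobox.classes.molecule import Molecule

M = Molecule()
M.import_pdb("protein.pdb")
idx = M.atomselect("*", "*", ["N", "CA", "C", "O"], get_index=True)[1]
backbone = M.get_subset(idx)
backbone.write_pdb("backbone.pdb")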
[docs]    def guess_chain_split(self, distance=3, use_backbone=True):
        '''
        reassign chain names, using a distance cutoff (cannot be undone).
        If two consecutive atoms are beyond the cutoff, a new chain is assigned.

        :param distance: distance cutoff, in Angstrom
        :param use_backbone: if True, splitting will be performed considering backbone atoms (N and C), all atoms in a sequence otherwise
        :returns: number of chains detected, and list of atom-index intervals delimiting them
        '''
        # wipe current chain assignment
        self.data["chain"] = ""

        # identify different chains
        intervals = [0]
        if not use_backbone:
            for i in range(len(self.coordinates[0]) - 1):
                dist = np.sqrt(np.dot(self.points[i] - self.points[i + 1], self.points[i] - self.points[i + 1]))
                if dist > distance:
                    intervals.append(i + 1)
        else:
            # aminoacids start with N. Find where a C is too far from the next N.
            posN, idxN = self.atomselect("*", "*", "N", get_index=True)
            posC = self.atomselect("*", "*", "C")
            for i in range(len(idxN)-1):
                dist = np.sqrt(np.dot(posC[i] - posN[i+1], posC[i] - posN[i+1]))
                if dist > distance:
                    intervals.append(idxN[i+1])

        intervals.append(len(self.coordinates[0]))

        # separate chains
        for i in range(len(intervals) - 1):
            thepos = i % len(self.chain_names)
            self.data.loc[intervals[i]:intervals[i + 1], "chain"] = self.chain_names[thepos]

        return len(intervals) - 1, intervals
[docs] def get_pdb_data(self, index=[]): ''' aggregate data and point coordinates, and return in a unique data structure Returned data is a list containing strings for points data and floats for point coordinates in the same order as a pdb file, i.e. ATOM/HETATM, index, name, resname, chain name, residue ID, x, y, z, occupancy, beta factor, atomtype. :returns: list aggregated data and coordinates for every point, as string. ''' if len(index) == 0: index = range(0, len(self.points), 1) # create a list containing all infos contained in pdb (point # coordinates and properties) d = [] for i in index: d.append([self.data["atom"].values[i], self.data["index"].values[i], self.data["name"].values[i], self.data["resname"].values[i], self.data["chain"].values[i], self.data["resid"].values[i], self.points[i, 0], self.points[i, 1], self.points[i, 2], self.data["beta"].values[i], self.data["occupancy"].values[i], self.data["atomtype"].values[i]]) return d
[docs] def write_pdb(self, outname, conformations=[], index=[]): ''' overload superclass method for writing (multi)pdb. :param outname: name of pdb file to be generated. :param index: indices of atoms to write to file. If empty, all atoms are returned. Index values obtaineable with a call like: index=molecule.atomselect("A", [1, 2, 3], "CA", True)[1] :param conformations: list of conformation indices to write to file. By default, a multipdb with all conformations will be produced. ''' # store current frame, so it will be reestablished after file output is # complete currentbkp = self.current # if a subset of all available frames is requested to be written, # select them first if len(conformations) == 0: frames = range(0, len(self.coordinates), 1) else: if np.max(conformations) < len(self.coordinates): frames = conformations else: raise Exception("ERROR: requested coordinate index %s, but only %s are available" %(np.max(conformations), len(self.coordinates))) f_out = open(outname, "w") for f in frames: # get all informations from PDB (for current conformation) in a list self.set_current(f) d = self.get_pdb_data(index) # Build our hexidecimal array if num. of atoms > 99999 idx_val = np.arange(1, len(d) + 1, 1) if len(idx_val) > 99999: vhex = np.vectorize(hex) idx_val = vhex(idx_val) # convert index values to hexidecimal idx_val = [num[2:] for num in idx_val] # remove 0x at start of hexidecimal number for i in range(0, len(d), 1): # create and write PDB line if d[i][2][0].isdigit(): L = '%-6s%5s %-5s%-4s%1s%4s %8.3f%8.3f%8.3f%6.2f%6.2f %2s\n' % (d[i][0], idx_val[i], d[i][2], d[i][3], d[i][4], d[i][5], float(d[i][6]), float(d[i][7]), float(d[i][8]), float(d[i][9]), float(d[i][10]), d[i][11]) else: L = '%-6s%5s %-4s %-4s%1s%4s %8.3f%8.3f%8.3f%6.2f%6.2f %2s\n' % (d[i][0], idx_val[i], d[i][2], d[i][3], d[i][4], d[i][5], float(d[i][6]), float(d[i][7]), float(d[i][8]), float(d[i][9]), float(d[i][10]), d[i][11]) f_out.write(L) f_out.write("END\n") f_out.close() self.set_current(currentbkp) return
[docs] def write_gro(self, outname, conformations=[], index=""): ''' write structure(s) in .gro format. :param outname: name of .gro file to be generated. :param index: indices of atoms to write to file. If empty, all atoms are returned. Index values obtaineable with a call like: index=molecule.atomselect("A", [1, 2, 3], "CA", True)[1] :param conformations: list of conformation indices to write to file. By default, all conformations will be returned. ''' # store current frame, so it will be reestablished after file output is # complete currentbkp = self.current # if a subset of all available frames is requested to be written, # select them first if len(conformations) == 0: frames = range(0, len(self.coordinates), 1) else: if np.max(conformations) < len(self.coordinates): frames = conformations else: raise Exception("ERROR: requested coordinate index %s, but only %s are available" %(np.max(conformations), len(self.coordinates))) f_out = open(outname, "w") for f in frames: # get all informations from PDB (for current conformation) in a # list self.set_current(f) # ATOM/HETATM, index, atom name, resname, chain name, residue ID, x, # y, z, beta factor, occupancy, atomtype d = self.get_pdb_data(index) f_out.write("%s\n" % outname.split(".")[0]) f_out.write("%s\n" % len(d)) for i in range(0, len(d), 1): # create and write .gro line L = '%5d%-5s%5s%5d%8.3f%8.3f%8.3f\n' % (int(d[i][5]), d[i][3], d[i][2], int(d[i][1]), float(d[i][6]) / 10.0, float(d[i][7]) / 10.0, float(d[i][8]) / 10.0) f_out.write(L) if "box" in self.properties: b = self.properties["box"][f] / 10.0 else: minpos = np.min(self.points, axis=0) / 10.0 b = np.max(self.points, axis=0) - minpos / 10.0 formatting = "" for item in b: formatting+="%10.5f" formatting+="\n" f_out.write(formatting%tuple(b)) f_out.close() self.set_current(currentbkp) return
[docs] def beta_factor_from_rmsf(self, indices=-1, step=1): ''' estimate atoms beta factor on the base of their RMSF. :param indices: indices of atoms of interest. If not set all atoms will be considered. :param step: timestep between two conformations (useful when using conformations extracted from molecular dynamics) ''' try: rmsf = self.rmsf(indices, step) return 8.0 * (np.pi**2) * (rmsf**2) / 3.0 except Exception as ex: raise Exception('ERROR: could not calculate RMSF!')
[docs] def rmsf_from_beta_factor(self, indices=[]): ''' calculate RMSF from atoms beta factors. :param indices: indices of atoms of interest. If not set all atoms will be considered. ''' try: if len(indices) == 0: b = self.data["beta"].values else: b = self.data["beta"].values[indices] return np.sqrt(b * 3 / (8 * np.pi * np.pi)) except Exception as ex: raise Exception('ERROR: beta factors missing?')
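# Usage sketch (illustrative addition, not part of the original BiobOx source):
# converting crystallographic beta factors of alpha-carbons into RMSF estimates.
# "protein.pdb" is a hypothetical filename.
from biobox.classes.molecule import Molecule

M = Molecule()
M.import_pdb("protein.pdb")
ca_idx = M.atomselect("*", "*", "CA", get_index=True)[1]
rmsf = M.rmsf_from_beta_factor(indices=ca_idx)
print(rmsf.mean())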
[docs] def get_mass_by_residue(self, skip_resname=[]): ''' Compute protein mass using residues (i.e. account also for atoms not present in the structure) Sum the average mass all every residue (using a knowledge base of atom masses in Dalton) The knowledge base can be expanded or edited by adding entries to the molecule's mass dictionary, e.g. to add the residue "TST" mass in molecule M type: M.knowledge['mass_residue']["TST"]=142.42 :param skip_resname: list of resnames to skip. Useful to exclude ions water or other ligands from the calculation. :returns: mass of molecule in Dalton ''' #@todo mass of N and C termini to add for every chain mass = 0 chains = np.unique(self.data["chain"].values) for chainname in chains: # for every chain, get a list of all its (unique) resids indices = self.atomselect(chainname, "*", "*", True)[1] resids = np.unique(self.data['resid'].values[indices]) for r in resids: # for every residue in the chain, get the index of its first # atom, and extract its associated resname index = self.atomselect(chainname, int(r), "*", True)[1] resname = self.data['resname'].values[index[0]] if resname not in skip_resname: try: # add mass of residue to total mass mass += self.know('residue_mass')[resname] except Exception as ex: #@todo: if residue is not known, why not summing constituent atoms masses, warning the user that it's an estimation? raise Exception("ERROR: mass for resname %s is unknown!\nInsert a key in protein\'s masses dictionary knowledge['residue_mass'] and retry!\nex.: protein.knowledge['residue_mass'][\"TST\"]=142.42" %resname) return mass
[docs] def get_mass_by_atom(self, skip_resname=[]): ''' compute protein mass using atoms in pdb sum the mass of all atoms (using a knowledge base of atom masses in Dalton) The knowledge base can be expanded or edited by adding or editing entries to the molecule's mass dictionary, e.g. to add the atom "PI" mass in molecule M type: M.knowledge['atom_mass']["PI"]=3.141592 :param skip_resname: list of resnames to skip. Useful to exclude ions water or other ligands from the calculation. :returns: mass of molecule in Dalton ''' mass = 0 for i in range(0, len(self.data), 1): resname = self.data["resname"].values[i] atomtype = self.data["atomtype"].values[i] if resname not in skip_resname: try: mass += self.know('atom_mass')[atomtype] except Exception as e: if atomtype == "": print(self.data.values[i:i+40]) raise Exception("ERROR: no atomtype found!") else: raise Exception("ERROR: mass for atom %s is unknown!\nInsert a key in protein\'s masses dictionary knowledge['atom_mass'] and retry!\nex.: protein.knowledge['atom_mass'][\"PI\"]=3.141592" %atomtype) return mass
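# Usage sketch (illustrative addition, not part of the original BiobOx source):
# two ways of estimating the protein mass, skipping water in both cases (resnames
# not in the knowledge base would otherwise raise an exception). Filename is hypothetical.
from biobox.classes.molecule import Molecule

M = Molecule()
M.import_pdb("protein.pdb")
M.assign_atomtype()                                  # atom-based mass needs atomtypes
print(M.get_mass_by_residue(skip_resname=["HOH"]))   # Dalton, from residue average masses
print(M.get_mass_by_atom(skip_resname=["HOH"]))      # Dalton, atom by atom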
[docs] def s2(self, atomname1="N", atomname2="H"): ''' compute s2, given two atoms defining the vector of interest. :param atomname1: name of the first atom :param atomname2: name of the second atom :returns: data numpy array containing information about residues for which measuring has been performed (i.e.[chain, resid]) :returns: s2 s2 of residues for which both provided input atoms have been found ''' Nidx = self.atomselect("*", "*", atomname1, get_index=True)[1] Hidx = self.atomselect("*", "*", atomname2, get_index=True)[1] if len(Nidx) == 0: raise Exception("ERROR: no atom name %s found!"%atomname1) if len(Hidx) == 0: raise Exception("ERROR: no atom name %s found!"%atomname2) Ndata = self.data.loc[Nidx, ["chain", "resid"]].values Hdata = self.data.loc[Hidx, ["chain", "resid"]].values a1 = [] a2 = [] d = [] for i in range(0, len(Ndata), 1): j = np.where( np.logical_and( Hdata[:, 0] == Ndata[i, 0], Hdata[:, 1] == Ndata[i, 1]))[0] if len(j) == 1: idx1 = Nidx[i] idx2 = Hidx[j[0]] a1.append(self.coordinates[:, idx1]) a2.append(self.coordinates[:, idx2]) d.append(Ndata[i]) atoms1 = np.array(a1) atoms2 = np.array(a2) data = np.array(d) # iterate over every residue s2_summary = [] for j in range(atoms1.shape[0]): dx = atoms2[j, :, 0] - atoms1[j, :, 0] dy = atoms2[j, :, 1] - atoms1[j, :, 1] dz = atoms2[j, :, 2] - atoms1[j, :, 2] # create list of unit vectors dnorm = np.sqrt(dx**2 + dy**2 + dz**2) dx /= dnorm dy /= dnorm dz /= dnorm d = 0 d += (np.sum(dx * dx) / atoms1.shape[1])**2 d += (np.sum(dx * dy) / atoms1.shape[1])**2 d += (np.sum(dx * dz) / atoms1.shape[1])**2 d += (np.sum(dy * dx) / atoms1.shape[1])**2 d += (np.sum(dy * dy) / atoms1.shape[1])**2 d += (np.sum(dy * dz) / atoms1.shape[1])**2 d += (np.sum(dz * dx) / atoms1.shape[1])**2 d += (np.sum(dz * dy) / atoms1.shape[1])**2 d += (np.sum(dz * dz) / atoms1.shape[1])**2 s2_summary.append(0.5 * (3 * d - 1)) s2 = np.array(s2_summary) return data, s2
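# Usage sketch (illustrative addition, not part of the original BiobOx source):
# N-H order parameters computed over all conformations of a multi-model PDB
# (e.g. an NMR ensemble containing amide hydrogens). "ensemble.pdb" is hypothetical.
from biobox.classes.molecule import Molecule

M = Molecule()
M.import_pdb("ensemble.pdb")
residues, order = M.s2(atomname1="N", atomname2="H")
print(residues[:5], order[:5])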
[docs] def get_secondary_structure(self, dssp_path=''): ''' compute the protein's secondary structure, calling DSSP :param dssp_path: DSSP executable (path and filename). If not provided, the default behaviour is to seek for this information in the environment variable DSSPPATH :returns: numpy array of characters, with one-letter-coded secondary sctructure according to DSSP. ''' #dssp="~/bin/dssp-2.0.4-linux-amd64" if dssp_path == '': try: dssp_path = os.environ['DSSPPATH'] except KeyError: raise Exception("DSSPPATH environment variable undefined") # generate temporary PDB and calculate secondary structure using DSSP self.write_pdb("tmp.pdb", conformations=[self.current]) #TMP: assign all atoms to structure #subprocess.check_call('~/bin/amber16_tmp/bin/tleap -f build > /dev/null', shell=True) try: import subprocess subprocess.check_call("%s -i tmp.pdb -o result.dssp"%dssp_path, shell=True) fin=open("result.dssp","r") except Exception as e: raise Exception("Could not calculate secondary structure! %s"%e) readit=False secstruct=[] for line in fin: if readit: try: ss = line[16] if line[16] == " ": ss = "-" secstruct.append(ss) except: continue if "#" in line: readit=True fin.close() # clean temporary files os.remove("result.dssp") os.remove("tmp.pdb") return np.array(secstruct) #(secstruct[0:210])
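# Usage sketch (illustrative addition, not part of the original BiobOx source):
# calling DSSP on the current conformation. The DSSP binary path is hypothetical;
# alternatively, set the DSSPPATH environment variable and omit the argument.
from biobox.classes.molecule import Molecule

M = Molecule()
M.import_pdb("protein.pdb")
ss = M.get_secondary_structure(dssp_path="/usr/local/bin/dssp")
print("".join(ss))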
[docs] def get_couples(self, idx, cutoff): ''' given a list of indices, compute the all-vs-all distance and return only couples below a given cutoff distance useful for the detection of disulfide bridges or linkable sites via cross-linking (approximation, supposing euclidean distances)' :param idx: indices of atoms to check. :param cutoff: minimal distance to consider a couple as linkable. :returns: nx3 numpy array containing, for every valid connection, id of first atom, id of second atom and distance between the two. ''' import biobox.measures.interaction as I points1 = self.get_xyz()[idx] dist = I.distance_matrix(points1, points1) couples = I.get_neighbors(dist, cutoff) res = [] for c in couples.transpose(): if c[0] > c[1]: res.append([idx[c[0]], idx[c[1]], dist[c[0], c[1]]]) return np.array(res)
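# Usage sketch (illustrative addition, not part of the original BiobOx source):
# detecting candidate disulfide bridges from pairs of cysteine sulphur atoms lying
# below a distance cutoff. "protein.pdb" is a hypothetical filename.
from biobox.classes.molecule import Molecule

M = Molecule()
M.import_pdb("protein.pdb")
sg_idx = M.atomselect("*", "CYS", "SG", get_index=True, use_resname=True)[1]
bridges = M.get_couples(sg_idx, cutoff=2.5)
print(bridges)   # [first atom id, second atom id, distance] per candidate pair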
[docs] def match_residue(self, M2, sec = 5): ''' Compares two bb.Molecule() peptide strands and returns the resids within both peptides when the two are homogenous beyond a certain secondary structure threashold. The default is 5 amino acids (given by sec) in a row must be identical Useful when aligning PDB structures that have been crystallised separately - so one may be missing the odd residue or have a few extra at the end. :param M2: The second bb.Molecule() to compare with :param sec: Number of consecutive amino acids in a row that must match before resid's are recorded ''' # First run the match residue using the expected inputs M1_res, M2_res = self._match_residue_maths(M2, sec = sec) # Import a check to see if we've correctly counted the residues (e.g. in homodimer case) if np.shape(np.unique(M1_res)) != np.shape(np.unique(M2_res)): M2_res, M1_res = M2._match_residue_maths(self, sec = sec) return M1_res, M2_res
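# Usage sketch (illustrative addition, not part of the original BiobOx source):
# finding the residue ids shared by two independently solved structures of the same
# protein, e.g. before superposing them. Filenames are hypothetical.
from biobox.classes.molecule import Molecule

M1 = Molecule()
M1.import_pdb("structure1.pdb")
M2 = Molecule()
M2.import_pdb("structure2.pdb")
res1, res2 = M1.match_residue(M2, sec=5)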
def _match_residue_maths(self, M2, sec): ''' Does the maths for match_residue. The reason for this additional step is that sometimes if the numbering is a bit off between the different proteins (i.e. a shift in the initial starting residues) and the protein is a homodimer, we can end up only adding the second monomer unit, and sometimes add it twice. Therefore we can compare two runs of this for an answer. :param M2: The second bb.Molecule() to compare with :param sec: Number of consecutive amino acids in a row that must match before resid's are recorded ''' # Get residue names / unique IDs M1_reslist = self.data["resname"][self.data["name"] == 'CA'].values M2_reslist = M2.data["resname"][M2.data["name"] == 'CA'].values M1_resid = self.data["resid"][self.data["name"] == 'CA'].values M2_resid = M2.data["resid"][M2.data["name"] == 'CA'].values # Remove C or N prefixes for cnt, val in enumerate(M1_reslist): if len(val) == 4: M1_reslist[cnt] = val[1:] else: continue for cnt, val in enumerate(M2_reslist): if len(val) == 4: M2_reslist[cnt] = val[1:] else: continue # Rename residues temporararily so they match better M1_reslist[np.logical_or(np.logical_or(M1_reslist == 'HIE', M1_reslist == 'HIP'), M1_reslist == 'HID')] = 'HIS' M2_reslist[np.logical_or(np.logical_or(M2_reslist == 'HIE', M2_reslist == 'HIP'), M2_reslist == 'HID')] = 'HIS' M1_reskeep = [] M2_reskeep = [] M2_cnt = 0 M1_cnt = 0 while M1_cnt < len(M1_reslist): # Initial check to see if we have a run of good matches (more than coincidence) if np.all(M1_reslist[M1_cnt:(M1_cnt + sec)] == M2_reslist[M2_cnt:(M2_cnt + sec)]): while M1_reslist[M1_cnt] == M2_reslist[M2_cnt]: M1_reskeep.append(M1_resid[M1_cnt]) M2_reskeep.append(M2_resid[M2_cnt]) M2_cnt += 1 M1_cnt += 1 # Break if we reach the maximum array length limit if M1_cnt == len(M1_reslist) or M2_cnt == len(M2_reslist): break # Check if we conicidently had the correct corresponding resnames if len(M1_reskeep) > 2: if M1_reskeep[-1] - M1_reskeep[-2] != 1 and M2_reskeep[-1] - M2_reskeep[-2] == 1: M1_reskeep = M1_reskeep[:-1] M2_reskeep = M2_reskeep[:-1] break elif M1_reskeep[-1] - M1_reskeep[-2] == 1 and M2_reskeep[-1] - M2_reskeep[-2] != 1: M1_reskeep = M1_reskeep[:-1] M2_reskeep = M2_reskeep[:-1] break else: continue # Elsewise move forward in count on second structure else: M2_cnt += 1 # Break if M1 and M2 have reached their ends, restart if only M2 has if M1_cnt == len(M1_reslist): #and M2_cnt == len(M2_reslist): break # break if the length of residues in M2 we are counting to now are longer than # the possible max number of saved residues in M2 elif len(M2_reskeep) >= len(M2_reslist): break # Need case so we don't recount a chain in the event of a homodimer elif M2_cnt >= len(M2_reslist): M1_cnt += 1 M2_cnt = len(M1_reskeep) else: continue return M1_reskeep, M2_reskeep
[docs]    def pdb2pqr(self, ff=""):
        '''
        Parses data from the PDB input into PQR format. This uses the pandas DataFrame
        self.data, which holds the information about atom indices, types, etc., and outputs
        a pandas DataFrame with the equivalent PQR information. It requires a forcefield data
        file as input; the default is the amber14sb forcefield file held within the classes/ folder.

        :param ff: name of the forcefield text file from which charges / vdw radii are read.
        '''

        _, intervals = self.guess_chain_split()

        # patch naming of C-termini
        for i in intervals[1:]:
            idxs = self.same_residue(i-1, get_index=True)[1]
            names = self.data.loc[idxs, ["name"]].values

            if np.any(names == "OC1") or np.any(names == "OXT"):
                resname = self.data.loc[idxs[0], ["resname"]].values[0]
                newresnames = np.array(["C"+resname]*len(idxs))
                self.data.loc[idxs, ["resname"]] = newresnames

        # patch naming of N-termini
        for i in intervals[0:-1]:
            idxs = self.same_residue(i, get_index=True)[1]
            names = self.data.loc[idxs, ["name"]].values

            if np.any(names == "H1") and np.any(names == "H2"):
                resname = self.data.loc[idxs[0], ["resname"]].values[0]
                newresnames = np.array(["N"+resname]*len(idxs))
                self.data.loc[idxs, ["resname"]] = newresnames

        # create numpy array structures to possibly reassign later
        HIP = np.array(["HIP"] * 18)
        HIE = np.array(["HIE"] * 17)
        HID = np.array(["HID"] * 17)
        NHIP = np.array(["NHIP"] * 20)
        NHIE = np.array(["NHIE"] * 19)
        NHID = np.array(["NHID"] * 19)
        CHIP = np.array(["CHIP"] * 20)
        CHIE = np.array(["CHIE"] * 18)
        CHID = np.array(["CHID"] * 19)

        start_chain = self.data["resid"].iloc[0]  # in case we get 1 or 2 as the first chain ID
        end_chain = self.data["resid"].iloc[-1]   # we don't know the end chain number, so we find it here
        start_res = self.data["resname"].iloc[0]
        end_res = self.data["resname"].iloc[-1]

        # check whether the first residue is actually an N-terminal residue and, if so, reassign resnames as necessary
        if (self.data["name"].iloc[0:27] == 'H1').any() and (self.data["name"].iloc[0:27] == 'H2').any() and (self.data["name"].iloc[0:27] == 'H3').any() and self.data["resname"][0][0] != 'N':
            print('Found N-Termini, reassigning first resname to match the forcefield')
            start_index = self.data.index[self.data["resid"] == start_chain]
            for N in start_index:
                self.data["resname"].iloc[N] = 'N' + start_res  # first residue needs the N-terminal resname prefix

        if len(ff) == 0:  # "amber14sb.dat"
            folder = os.path.dirname(os.path.realpath(__file__))
            ff = "%s/amber14sb.dat" % folder

        if os.path.isfile(ff) != 1:
            raise Exception("ERROR: %s not found!" % ff)

        ff = np.loadtxt(ff, usecols=(0, 1, 2, 3, 4), dtype=str)
        cols = ['resname', 'name', 'charge', 'radius', 'atomtype']  # radius is the vdw radius in the amber file
        idx = np.arange(len(ff))
        pqr_data = pd.DataFrame(ff, index=idx, columns=cols)

        charges = []
        radius = []
        atomtypes = []

        # check whether HIS should match HIE, HID or HIP, depending on which protons are present and where
        his_check = self.data["resname"] == 'HIS'    # check whether we need to do the following calculation
        nhis_check = self.data["resname"] == 'NHIS'  # check for N-terminal HIS
        chis_check = self.data["resname"] == 'CHIS'  # check for C-terminal HIS

        if np.sum(his_check) != 0 or np.sum(nhis_check) != 0 or np.sum(chis_check) != 0:
            print("WARNING: found residue with name HIS, checking to see what protonation state it is in and reassigning to HIP, HIE or HID.\nYou should check HIS in your pdb file is right to be sure!")

            for ix in range(len(self.data["resname"])):
                # 17 is more common, and also covers the basis to capture HD1 or HE2 later if necessary (C and O tend to be the last atoms)
                H_length = 17

                # N is always the first atom of a residue (use that as basis)
                if self.data["name"][ix] == 'N' and self.data["resname"][ix] == 'HIS':
                    if (self.data["name"][ix:(ix+H_length)] == 'HE2').any() and (self.data["name"][ix:(ix+H_length)] == 'HD1').any():
                        # if the residue contains both HE2 and HD1, it is a HIP residue
                        H_length = 18  # number of atoms in histidine (HIP)
                        self.data.loc[ix:(ix+H_length-1), "resname"] = HIP
                    elif (self.data["name"][ix:(ix+H_length)] == 'HE2').any():
                        self.data.loc[ix:(ix+H_length-1), "resname"] = HIE
                    elif (self.data["name"][ix:(ix+H_length)] == 'HD1').any():
                        self.data.loc[ix:(ix+H_length-1), "resname"] = HID

                elif self.data["name"][ix] == 'N' and self.data["resname"][ix] == 'NHIS':
                    H_length = 19
                    if (self.data["name"][ix:(ix+H_length)] == 'HE2').any() and (self.data["name"][ix:(ix+H_length)] == 'HD1').any():
                        # if the residue contains both HE2 and HD1, it is a HIP residue
                        H_length = 20  # number of atoms in histidine (HIP)
                        self.data.loc[ix:(ix+H_length-1), "resname"] = NHIP
                    elif (self.data["name"][ix:(ix+H_length)] == 'HE2').any():
                        self.data.loc[ix:(ix+H_length-1), "resname"] = NHIE
                    elif (self.data["name"][ix:(ix+H_length)] == 'HD1').any():
                        self.data.loc[ix:(ix+H_length-1), "resname"] = NHID

                elif self.data["name"][ix] == 'N' and self.data["resname"][ix] == 'CHIS':
                    H_length = 19
                    if (self.data["name"][ix:(ix+H_length)] == 'HE2').any() and (self.data["name"][ix:(ix+H_length)] == 'HD1').any():
                        # if the residue contains both HE2 and HD1, it is a HIP residue
                        H_length = 20  # number of atoms in histidine (HIP)
                        self.data.loc[ix:(ix+H_length-1), "resname"] = CHIP
                    elif (self.data["name"][ix:(ix+H_length)] == 'HE2').any():
                        H_length = 18
                        self.data.loc[ix:(ix+H_length-1), "resname"] = CHIE
                    elif (self.data["name"][ix:(ix+H_length)] == 'HD1').any():
                        self.data.loc[ix:(ix+H_length-1), "resname"] = CHID

        # move through each line in the pdb data and find the corresponding charge / vdw radius supplied by the forcefield
        for i, resnames in enumerate(self.data["resname"]):
            values_res = pqr_data["resname"] == resnames
            values_name = pqr_data["name"] == self.data["name"][i]
            values = np.logical_and(values_res, values_name)
            value_loc = pqr_data[values]

            if len(value_loc) == 0:
                print(value_loc, resnames, self.data["name"][i], self.data["resname"][i], self.data["resid"].iloc[i], self.data["index"].iloc[i])
                raise Exception("ERROR: The atom names in your PDB file do not match the PQR file")
            else:
                charges.append(float(value_loc.iloc[0]["charge"]))
                radius.append(float(value_loc.iloc[0]["radius"]))
                atomtypes.append(value_loc.iloc[0]["atomtype"])

        # drop the obsolete atomtype / radius / charge data, to be replaced with forcefield-derived numbers
        pqr = self.data.drop(['atomtype', 'radius', 'charge'], axis=1)
        pqr['atomtype'] = atomtypes  # replace with Amber-derived data for each atom
        pqr['radius'] = radius
        pqr['charge'] = charges

        print("Conversion Complete")

        return pqr
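    # Usage sketch for pdb2pqr (added for illustration; the input file name is hypothetical,
    # and the default amber14sb forcefield shipped in classes/ is used):
    #
    #   import biobox as bb
    #   M = bb.Molecule()
    #   M.import_pdb("protonated_protein.pdb")   # atom and residue naming must match the forcefield
    #   pqr = M.pdb2pqr()                        # pandas DataFrame with per-atom charge and radius
    #   print(pqr[["name", "resname", "charge", "radius"]].head())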
[docs]    def write_pqr(self, outname, conformations=[], index=[]):
        '''
        Overload of the superclass method for writing a (multi)PQR file.

        :param outname: name of the PQR file to be generated.
        :param index: indices of atoms to write to file. If empty, all atoms are written.
                      Index values are obtainable with a call like:
                      index = molecule.atomselect("A", [1, 2, 3], "CA", True)[1]
        :param conformations: list of conformation indices to write to file.
                              By default, a multi-PQR with all conformations is produced.
        '''

        # store the current frame, so it will be re-established after file output is complete
        currentbkp = self.current

        # if a subset of all available frames is requested to be written, select it first
        if len(conformations) == 0:
            frames = range(0, len(self.coordinates), 1)
        else:
            if np.max(conformations) < len(self.coordinates):
                frames = conformations
            else:
                raise Exception("ERROR: requested coordinate index %s, but only %s are available" % (np.max(conformations), len(self.coordinates)))

        # get our PQR database-style DataFrame
        pqr = self.pdb2pqr()

        f_out = open(outname, "w")

        for f in frames:

            # get all information from the PDB (for the current conformation) as a list
            self.set_current(f)
            d = self.get_pdb_data(index)

            # build a hexadecimal index array if the number of atoms exceeds 99999
            idx_val = np.arange(1, len(d) + 1, 1)
            if len(idx_val) > 99999:
                vhex = np.vectorize(hex)
                idx_val = vhex(idx_val)                  # convert index values to hexadecimal
                idx_val = [num[2:] for num in idx_val]   # remove the 0x prefix of each hexadecimal number

            for i in range(0, len(d), 1):

                # create and write a PQR line
                if d[i][2][0].isdigit():
                    L = '%-6s%5s %-5s%-4s%1s%4s %8.3f%8.3f%8.3f%7.4f%7.4f %2s\n' % (d[i][0], idx_val[i], d[i][2], d[i][3], d[i][4], d[i][5], float(d[i][6]), float(d[i][7]), float(d[i][8]), float(pqr.iloc[i]["charge"]), float(pqr.iloc[i]["radius"]), d[i][11])
                else:
                    L = '%-6s%5s %-4s%-4s%1s%4s %8.3f%8.3f%8.3f%7.4f%7.4f %2s\n' % (d[i][0], idx_val[i], d[i][2], d[i][3], d[i][4], d[i][5], float(d[i][6]), float(d[i][7]), float(d[i][8]), float(pqr.iloc[i]["charge"]), float(pqr.iloc[i]["radius"]), d[i][11])

                f_out.write(L)

            f_out.write("END\n")

        f_out.close()

        self.set_current(currentbkp)

        return
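    # Usage sketch for write_pqr (added for illustration; file names are hypothetical):
    #
    #   import biobox as bb
    #   M = bb.Molecule()
    #   M.import_pdb("protonated_protein.pdb")
    #   M.write_pqr("protein.pqr", conformations=[0])   # write the first conformation only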
[docs]    def get_dipole_map(self, orig, pqr, time_start=0, time_end=2, resolution=1., vox_in_window=3., write_dipole_map=True, fname="dipole_map.tcl"):
        '''
        Method for generating dipole maps to be used for electron density map generation.
        It can also write the dipole map to file, if desired. It calls Cython code in lib.

        :param orig: origin points of the voxel grid
        :param pqr: PQR data for self; can be generated by calling pdb2pqr above
        :param time_start: first frame to parse in the multipdb
        :param time_end: last frame to parse in the multipdb
        :param resolution: desired voxel resolution
        :param vox_in_window: amount of surrounding space contributing to the local dipole. vox_in_window * resolution gives the window size (in Angstrom)
        :param write_dipole_map: if True, write a dipole map in TCL format to be read in via VMD
        :param fname: name of the dipole map file to be written
        '''

        charges = pqr["charge"].values[:]

        crd = self.coordinates[time_start:time_end]  # cut out the coordinates we're interested in
        time_end -= time_start                       # shift to compensate for cutting the coordinates above
        time_start = 0

        dipole_map = e_density.c_get_dipole_map(crd=crd, orig=orig, charges=charges, time_start=time_start, time_end=time_end,
                                                resolution=resolution, vox_in_window=vox_in_window,
                                                write_dipole_map=write_dipole_map, fname=fname)

        return dipole_map
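    # Usage sketch for get_dipole_map (added for illustration). The construction of the
    # voxel-grid origins below is an assumption about the layout expected by c_get_dipole_map
    # (a regular grid spanning the structure's bounding box at the chosen resolution), and the
    # file name is hypothetical:
    #
    #   import numpy as np
    #   import biobox as bb
    #   M = bb.Molecule()
    #   M.import_pdb("trajectory_multimodel.pdb")        # multipdb containing several frames
    #   pqr = M.pdb2pqr()
    #   res = 1.0
    #   min_val = np.min(M.coordinates[0], axis=0)
    #   max_val = np.max(M.coordinates[0], axis=0)
    #   axes = [np.arange(min_val[k], max_val[k] + res, res) for k in range(3)]
    #   orig = np.array(np.meshgrid(*axes, indexing="ij")).reshape(3, -1).T
    #   dmap = M.get_dipole_map(orig, pqr, time_start=0, time_end=2, resolution=res)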
[docs]    def get_dipole_density(self, dipole_map, orig, min_val, V, outname, vox_in_window=3., eqn='gauss', T=310.15, P=101.*10**3, epsilonE=54., resolution=1.):
        '''
        Method to generate an electron density map based on a voxel grid of dipole vectors.

        :param dipole_map: the dipole map input; can be generated with get_dipole_map above
        :param orig: origin points of the voxel grid
        :param min_val: minimum coordinates of the edge points of the voxel grid (i.e. a single x, y, z point defining the start point of the grid, matching the multipdb)
        :param V: volume of a voxel (equal to resolution**3, but left as a parameter in case later versions use a sphere)
        :param outname: name of the electron density map file produced
        :param vox_in_window: amount of surrounding space contributing to the local dipole. vox_in_window * resolution gives the window size (in Angstrom)
        :param eqn: equation mode used to model the electron density
        :param T: temperature of the MD simulation
        :param P: pressure of the MD simulation
        :param epsilonE: continuum dielectric surrounding the protein
        :param resolution: desired voxel resolution
        '''

        dummy = e_density.c_get_dipole_density(dipole_map=dipole_map, orig=orig, min_val=min_val, V=V, outname=outname,
                                               vox_in_window=vox_in_window, eqn=eqn, T=T, P=P,
                                               epsilonE=epsilonE, resolution=resolution)
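    # Usage sketch for get_dipole_density (added for illustration), continuing from the
    # get_dipole_map example above; the output file name is hypothetical and V is taken as
    # resolution**3, as suggested by the docstring:
    #
    #   V = res**3
    #   M.get_dipole_density(dipole_map=dmap, orig=orig, min_val=min_val, V=V,
    #                        outname="density_map.dx", resolution=res)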