Source code for density

# Copyright (c) 2014-2017 Matteo Degiacomi
#
# BiobOx is free software ;
# you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation ;
# either version 2 of the License, or (at your option) any later version.
# BiobOx is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY ;
# without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
# See the GNU General Public License for more details.
# You should have received a copy of the GNU General Public License along with BiobOx ;
# if not, write to the Free Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA.
#
# Author : Matteo Degiacomi, matteothomas.degiacomi@gmail.com

import os

import scipy.ndimage.filters
from sklearn.cluster import DBSCAN
import numpy as np
import pandas as pd

from biobox.classes.structure import Structure

[docs]class Density(Structure): ''' Subclass of :func:`Structure <structure.Structure>`, allows importing density map, and transform them in a PDB file containing a collection of spheres placed on the map's high density regions. ''' def __init__(self): ''' A density map is fully described by the following attributes, stored in the self.properties dictionary: :param density: density map :param delta: scaling factor for voxels (default is [1, 1, 1] Angstrom) :param size: dimensions in voxels :param origin: bottom-left-front corner of the cube :param radius: radius of points composing the density map :param format: format name (only dx supported at the moment) ''' super(Density, self).__init__() self._reset_info() def _reset_info(self, r=1.9): ''' reset all properties related to a density map (clean) ''' self.properties['density'] = np.array([]) # density map self.properties['radius'] = r # scaling factor for voxels (default is [1, 1, 1] Angstrom) self.properties['delta'] = np.array([]) self.properties['size'] = np.array([]) # dimensions in voxels # bottom-left-front corner of the cube self.properties['origin'] = np.array([]) self.properties['format'] = "" # file format self.properties['scan'] = np.array([]) self.properties['filename'] = "" self.clear()
[docs] def return_density_map(self): ''' :returns: density map as 3D numpy array ''' return self.properties['density']
[docs] def import_map(self, filename, fileformat='dx'): ''' Import density map and fill up the points and properties data structures. :param filename: name of density file to load :param fileformat: at the moment supports dx, ccp4, mrc and imod ''' if not os.path.exists(filename): raise Exception("%s not found!" % filename) # call format-specific loading functions. # function should fill up all required properties in _reset_info(), and # load the map as a 3D array, containing intensity values. try: if fileformat == 'dx': self._import_dx(filename) elif fileformat == 'ccp4': self._import_mrc(filename, 'ccp4') elif fileformat == 'mrc': self._import_mrc(filename, 'mrc') elif fileformat == 'imod': self._import_mrc(filename, 'imod') else: raise Exception("sorry, format %s is not supported" % fileformat) except Exception as e: self._reset_info() Exception("ERROR: %s" % e) # if any error went undetected during loading (missing information), # data structures may be inconsistent. Call cleaning procedure! if len(self.properties['density']) == 0: print("density map could not be correctly loaded!") self._reset_info(self) elif len(self.properties['size']) == 0: print("density map information missing!") self._reset_info(self) elif len(self.properties['origin']) == 0: print("map origin information missing!") self._reset_info(self) elif len(self.properties['delta']) == 0: print("voxel size information missing!") self._reset_info(self) # if all required information is present, place points instead of # voxels else: try: self.place_points() except Exception as e: pass self.properties['format'] = format self.properties['filename'] = filename self.properties["sigma"] = np.std(self.properties['density'])
[docs] def import_numpy(self, data, origin=[0, 0, 0], delta=np.identity(3)): ''' import a numpy 3D array to allow manipulation as a density map :param data: numpy 3D array :param origin: coordinates of bottom left corner of the map :param delta: voxels' shape (default is a cubic voxel of 1 Angstrom-long sides). ''' if len(data.shape) != 3: raise Exception("ERROR: a 3D numpy array is expected") self.properties['density'] = data self.properties['origin'] = np.array(origin) self.properties['size'] = np.array(data.shape) self.properties['delta'] = delta # sphere size corresponding to the volume of one voxel voxel_volume = self.properties['delta'][0, 0] * self.properties['delta'][1, 1] * self.properties['delta'][2, 2] self.properties['radius'] = (voxel_volume * 3 / (4 * np.pi))**(1 / 3.0)
[docs] def get_oversampled_points(self, sigma=0): ''' return points obtained by oversampling the map (doule points on every axis) :param sigma: place points only on voxels having intensity greater than threshold :returns: points 3D points placed on voxels having value higher than threshold :returns: radius radius of produced points ''' thresh = self.get_thresh_from_sigma(sigma) oversampled_data = np.zeros((self.properties['size'][0] * 2, self.properties['size'][1] * 2, self.properties['size'][2] * 2)) for x in range(1, self.properties['size'][0] - 1, 1): for y in range(1, self.properties['size'][1] - 1, 1): for z in range(1, self.properties['size'][2] - 1, 1): if self.properties['density'][x, y, z] > thresh: oversampled_data[x * 2, y * 2, z * 2] = self.properties['density'][x, y, z] oversampled_data[x * 2 + 1, y * 2, z * 2] = self.properties['density'][x, y, z] oversampled_data[x * 2, y * 2 + 1, z * 2] = self.properties['density'][x, y, z] oversampled_data[x * 2, y * 2, z * 2 + 1] = self.properties['density'][x, y, z] oversampled_data[x * 2 - 1, y * 2, z * 2] = self.properties['density'][x, y, z] oversampled_data[x * 2, y * 2 - 1, z * 2] = self.properties['density'][x, y, z] oversampled_data[x * 2, y * 2, z * 2 - 1] = self.properties['density'][x, y, z] delta = self.properties['delta'] / 2 radius = self.properties['radius'] * 2 / 3 # define scaling to shrink everything by a size equal to spheres radius size = np.max(np.transpose(np.where(oversampled_data > thresh)), axis=0) - np.min(np.transpose(np.where(oversampled_data > thresh)), axis=0) scaled_size = np.max(np.transpose(np.where(oversampled_data > thresh)) * np.diag(delta), axis=0) - np.min(np.transpose(np.where(oversampled_data > thresh)) * np.diag(delta), axis=0) scaling = 8.0 / 3.0 scale_x = (delta[0, 0] - 1.0) / (scaled_size[0] - size[0]) * (scaled_size[0] - radius * scaling) scale_y = (delta[1, 1] - 1.0) / (scaled_size[1] - size[1]) * (scaled_size[1] - radius * scaling) scale_z = (delta[2, 2] - 1.0) / (scaled_size[2] - size[2]) * (scaled_size[2] - radius * scaling) # create structure data (ensemble of points and their center, and store # its geometric center) points = np.transpose(np.where(oversampled_data > thresh)) * np.array([scale_x, scale_y, scale_z]) + self.properties['origin'] + np.ones(3) * self.properties['radius'] * 2 / 3 return points, radius
[docs] def get_thresh_from_sigma(self, val): ''' convert cutoff value from sigma multiples into actual threshold :param val: sigma scaling :returns: cutoff ''' return self.properties["sigma"] * val
[docs] def get_sigma_from_thresh(self, t): ''' convert cutoff value from actual threshold to sigma multiple :param threshold value :returns sigma multiple ''' return t / float(self.properties["sigma"])
[docs] def place_points(self, sigma=0, noise_filter=0.01): ''' given density information, place points on every voxel above a given threshold. :param sigma: intensity threshold value. :param noise_filter: launch DBSCAN clustering algorithm to detect connected regions in density map. Regions representing less than noise_filter of the total will be removed. This is a ratio, value should be between 0 and 1. ''' thresh = self.get_thresh_from_sigma(sigma) if not np.any(self.properties['density'] > thresh): raise IOError("selected threshold leads to empty point ensemble") if noise_filter >= 1 or noise_filter < 0: raise IOError("noise_filter should be between 0 and 1") # define scaling to shrink everything by a size equal to spheres radius size = np.max(np.transpose(np.where(self.properties['density'] > thresh)), axis=0) - np.min(np.transpose(np.where(self.properties['density'] > thresh)), axis=0) scaled_size = np.max(np.transpose(np.where(self.properties['density'] > thresh)) * np.diag(self.properties['delta']), axis=0)-np.min(np.transpose(np.where(self.properties['density'] > thresh)) * np.diag(self.properties['delta']), axis=0) if self.properties['delta'][0, 0] != 1: scale_x = (self.properties['delta'][0, 0] - 1.0) / (scaled_size[0] - size[0]) * (scaled_size[0] - self.properties['radius']) else: scale_x = 1.0 if self.properties['delta'][1, 1] != 1: scale_y = (self.properties['delta'][1, 1] - 1.0) / (scaled_size[1] - size[1]) * (scaled_size[1] - self.properties['radius']) else: scale_y = 1.0 if self.properties['delta'][2, 2] != 1: scale_z = (self.properties['delta'][2, 2] - 1.0) / (scaled_size[2] - size[2]) * (scaled_size[2] - self.properties['radius']) else: scale_z = 1 # create structure data (ensemble of points and their center, and store # its geometric center) points = np.transpose(np.where(self.properties['density'] > thresh)) * np.array([scale_x, scale_y, scale_z]) + self.properties['origin'] + np.ones(3) * self.properties['radius'] # remove previous points arrangement, and create new one (necessary, # since the amount of points will change, and cannot therefore be # considered a new conformation) self.clear() # apply DBSCAN noise filter if noise_filter != 0: step = self.properties['delta'][0, 0] * np.sqrt(3) db = DBSCAN(eps=step, min_samples=10).fit(points) pts2 = [] for i in np.unique(db.labels_): if np.sum(db.labels_ == i) / float(len(points)) > 0.01 and i != -1: if len(pts2) == 0: pts2 = points[db.labels_ == i] else: pts2 = np.concatenate((pts2, points[db.labels_ == i])) self.add_xyz(pts2) else: self.add_xyz(points) #update radii list (useful for instance for CCS calculation) idx = np.arange(len(self.points)) self.data = pd.DataFrame(idx, index=idx, columns=["radius"])
# self.get_center()
[docs] def scan_threshold(self, mass, density=0.782878356, sampling_points=1000): ''' if mass and density of object are known, filter the map on a linear scale of threshold values, and compare the obtained mass to the experimental one. .. note:: in proteins, an average value of 1.3 g/cm^3 (0.782878356 Da/A^3) can be assumed. Alternatively, the relation density=1.410+0.145*exp(-mass(kDa)/13) can be used. .. note:: 1 Da/A^3=0.602214120 g/cm^3 :param mass: target mass in Da :param density: target density in Da/A^3 :param sampling_points: number of measures to perform between min and max intensity in density map :returns: array reporting tested values and error on mass ([threshold, model_mass-target_mass]) ''' low = self.get_sigma_from_thresh(np.min(self.properties['density'])) high = self.get_sigma_from_thresh(np.max(self.properties['density'])) result = [] for thresh in np.linspace(low, high, num=sampling_points): try: self.place_points(thresh) vol = self.get_volume() except Exception as e: vol = 0.0 print("threshold=%s, error=%s" % (thresh, vol * density - mass)) result.append([thresh, vol * density - mass]) r = np.array(result) bestthresh = r[np.argmin(np.abs(r[:, 1]) - mass), 0] try: self.place_points(bestthresh) except Exception as ex: pass return r
[docs] def find_data_from_sigma(self, sigma, exact=True, append=False, noise_filter=0.01): ''' map experimental data to given threshold :param sigma: density threshold :param noise_filter: launch DBSCAN clustering algorithm to detect connected regions in density map. Regions representing less than noise_filter of the total will be removed. This is a ratio, value should be between 0 and 1. ''' thresh = self.get_sigma_from_thresh(sigma) if exact: import biobox as bb try: self.place_points(thresh, noise_filter) vol = self.get_volume() ccs = bb.ccs(self, scale=False) except Exception as ex: vol = 0 ccs = 0 res = np.array([thresh, vol, ccs]) if append: self.properties['scan'] = np.concatenate((self.properties['scan'], res)) return res else: return self.properties['scan'][ np.argmin(np.abs(self.properties['scan'][:, 0] - sigma))]
[docs] def find_data_from_volume(self, vol): ''' map experimental data to given volume :param vol: volume ''' return self.properties['scan'][ np.argmin(np.abs(self.properties['scan'][:, 1] - vol))]
[docs] def find_data_from_ccs(self, ccs): ''' map experimental data to given ccs :param ccs: target CCS (in A^2) ''' return self.properties['scan'][ np.argmin(np.abs(self.properties['scan'][:, 2] - ccs))]
[docs] def threshold_vol_ccs(self, low="", high="", sampling_points=1000, append=False, noise_filter=0.01): ''' return the volume to threshold to CCS relationship :param sampling_points: number of measures to perform between min and max intensity in density map :returns: array reporting tested values and error on mass ([threshold, model_mass-target_mass]) ''' import biobox as bb if low == "": low = self.get_sigma_from_thresh(np.min(self.properties['density'])) if high == "": high = self.get_sigma_from_thresh(np.max(self.properties['density'])) result = [] for thresh in np.linspace(low, high, num=sampling_points): try: self.place_points(thresh, noise_filter=noise_filter) print("placed!") vol = self.get_volume() ccs = bb.ccs(self, scale=False) except Exception as ex: vol = 0 ccs = 0 print("thresh: %s, vol=%s, ccs=%s (%s points)" % (thresh, vol, ccs, len(self.points))) result.append([thresh, vol, ccs]) r = np.array(result) if append: self.properties['scan'] = np.concatenate((self.properties['scan'], r)) else: self.properties['scan'] = r return r
[docs] def best_threshold(self, mass, density=0.782878356): ''' If mass and density of object are known, try to filter the map so that the mass is best matched. search for best threshold using bissection method. .. note:: in proteins, an average value of 1.3 g/cm^3 (0.782878356 Da/A^3) can be assumed. Alternatively, the relation density=1.410+0.145*exp(-mass(kDa)/13) can be used. .. note:: 1 Da/A^3=1.660538946 g/cm^3 .. note:: 1 g/cm^3=0.602214120 Da/A^3 :param mass: target mass in Da :param density: target density in Da/A^3 :returns: array reporting tested values and error on mass ([sigma, model_mass-target_mass]) ''' high = self.get_sigma_from_thresh(np.max(self.properties['density'])) low = self.get_sigma_from_thresh(np.min(self.properties['density'])) high_val = 1000000000 low_val = -1000000000 error = high_val result = [] while True: thresh = (high + low) / 2.0 try: self.place_points(thresh) vol = self.get_volume() except Exception as ex: vol = 0 mass_model = vol * density error = mass_model - mass result.append([thresh, error]) if error == high_val or error == low_val: if np.abs(high_val) < np.abs(low_val): thresh = high else: thresh = low break if error < 0: high = thresh high_val = error elif error > 0: low = thresh low_val = error else: break r = np.array(result) bestthresh = r[-1, 0] try: self.place_points(bestthresh) except Exception as ex: pass return r
[docs] def blur(self, dimension=5, sigma=0.5): ''' blur density applying a cubic gaussian kernel of given kernel dimension (cubic grid size). .. warning:: cannot be undone :param dimension: size of the kernel grid. :param sigma: standard deviation of gaussian kernel. ''' shape = (dimension, dimension, dimension) m, n, k = [(ss - 1.) / 2. for ss in shape] x_ = np.arange(-m, m + 1, 1).astype(int) y_ = np.arange(-n, n + 1, 1).astype(int) z_ = np.arange(-k, k + 1, 1).astype(int) x, y, z = np.meshgrid(x_, y_, z_) h = np.exp(-(x * x + y * y) / (2. * sigma * sigma)) h[h < np.finfo(h.dtype).eps * h.max()] = 0 sumh = h.sum() if sumh != 0: h /= sumh dens = scipy.ndimage.filters.convolve(self.properties['density'], h, mode='constant') self.properties['density'] = dens self.properties["sigma"] = np.std(self.properties['density'])
[docs] def write_dx(self, fname='dens.dx'): ''' Write a density map in DX format :param fname: output file name ''' dens = self.properties['density'] origin = self.properties['origin'] delta = self.properties['delta'] fout = open(fname, "w") fout.write("# density generated with SBT\n#\n#\n#\n") fout.write("object 1 class gridpositions counts %s %s %s\n" % (dens.shape[0], dens.shape[1], dens.shape[2])) fout.write("origin %s %s %s\n"%(origin[0], origin[1], origin[2])) fout.write("delta %s %s %s\n"%(delta[0, 0], delta[0, 1], delta[0, 2])) fout.write("delta %s %s %s\n" % (delta[1, 0], delta[1, 1], delta[1, 2])) fout.write("delta %s %s %s\n" %(delta[2, 0], delta[2, 1], delta[2, 2])) fout.write("object 2 class gridconnections counts %s %s %s\n"%(dens.shape[0], dens.shape[1], dens.shape[2])) fout.write("object 3 class array type double rank 0 items %i data follows\n"%(dens.shape[0] * dens.shape[1] * dens.shape[2])) cnt = 0 for xpos in range(0, dens.shape[0], 1): for ypos in range(0, dens.shape[1], 1): for zpos in range(0, dens.shape[2], 1): fout.write("%s " % dens[xpos, ypos, zpos]) cnt += 1 if cnt % 3 == 0: fout.write("\n") fout.close()
[docs] def export_as_pdb(self, outname, step, threshold=0.1): ''' Write a pdb file with points where the density exceeds a threshold :param outname: output file name :param step: stepsize used to generate the density map :param threshold: density to be exceeded to generate a point in pdb ''' # @todo assign spheres beta factor to associated density value dens = self.properties['density'] origin = self.properties['origin'] fout = open(outname, 'w') cnt = 1 identifier = 'ATOM' # atom to be used to mimick density symbol = 'H' # element for atom print('exporting density greater than %s to pdb' % threshold) for xpos in range(0, dens.shape[0], 1): for ypos in range(0, dens.shape[1], 1): for zpos in range(0, dens.shape[2], 1): if dens[xpos, ypos, zpos] > threshold: x_coord = float(xpos / step) + origin[0] y_coord = float(ypos / step) + origin[1] z_coord = float(zpos / step) + origin[2] L = '%-6s%5s %-4s%-4s DIS %8.3f%8.3f%8.3f 1.00 0.00 \n'%(identifier, cnt, symbol, symbol, x_coord, y_coord, z_coord) fout.write(L) cnt += 1 fout.close()
def _import_dx(self, filename): ''' import density map and fill up the points and properties data structures. :param filename: name of dx file to load ''' try: fin = open(filename, "r") except Exception as e: raise Exception('opening of file %s failed!' % filename) d = [] dlt = [] for line in fin: w = line.split() if w[0] == "#": continue elif len(w) <= 3: #and np.array(list(w)).dtype == ('float'): try: w = np.array(w).astype(float) for i in range(len(w)): d.append(w[i]) except Exception as e: continue # get coordinates elif len(w) > 2 and w[0] == "object" and w[3] == "gridpositions": self.properties['size'] = np.array([w[-3], w[-2], w[-1]]).astype(int) # get scaling factor elif len(w) > 2 and w[0] == "delta": dlt.append(w[1:4]) # get position of origin elif len(w) > 2 and w[0] == "origin": self.properties['origin'] = np.array(w[1:4]).astype(float) # scaling factor with respect of unit cell voxels self.properties['delta'] = np.array(dlt).astype(float) try: self.properties['density'] = np.reshape( np.array(d).astype(float), (self.properties['size'][0], self.properties['size'][1], self.properties['size'][2])) except Exception as ex: raise Exception("reshaping of dx data failed! Dimensions and dataset size are inconsistent!") def _import_mrc(self, filename, fileformat): ''' import density map in MRC, CCP4 or IMOD format. MRC format here: www2.mrc-lmb.cam.ac.uk/image2000.html CCP4 format here: www.ccp4.ac.uk/html/maplib.html :param filename name of MRC or CCP4 file to load :param fileformat: can be mrc, imod or ccp4 ''' import biobox.classes.density_MRC as MRC try: [density, data] = MRC.read_density(filename, fileformat) self.properties['density'] = density self.properties['origin'] = np.array(data.origin) self.properties['size'] = np.array(density.shape) self.properties['delta'] = np.identity(3) * data.mrc_data.data_step # sphere size corresponding to the volume of one voxel voxel_volume = self.properties['delta'][0, 0] * self.properties['delta'][1, 1] * self.properties['delta'][2, 2] self.properties['radius'] = (voxel_volume * 3 / (4 * np.pi))**(1 / 3.0) except Exception as e: raise Exception("%s" % e)
[docs] def get_volume(self): ''' compute density map volume. This is done by counting the points, and multiplying that by voxels' volume. .. warning:: can be called only after :func:`place_points <density.Density.place_points>` has been called. .. warning:: supposes unskewed voxels. ''' return self.properties['delta'][0, 0] * self.properties['delta'][1, 1] * self.properties['delta'][2, 2] * len(self.points)
if __name__ == "__main__": import biobox as bb print("loading density...") D = bb.Density() D.import_map("..\\test\\EMD-1080.mrc", "mrc") print("placing points...") D.place_points(6, noise_filter=0) print("one point of CCS calculation...") D.threshold_vol_ccs(sampling_points=1, append=False, noise_filter=0)