Skip to content

Commit

Permalink
Add general analysis for frag
Browse files — browse the repository at this point in the history
  • Loading branch information
martimunicoy committed Mar 25, 2021
1 parent 862a923 commit 0c665e9
Show / hide file tree
Showing 4 changed files with 110 additions and 55 deletions.
9 changes: 7 additions & 2 deletions pele_platform/Adaptive/simulation.py
Original file line number Diff line number Diff line change
Expand Up @@ -380,7 +380,12 @@ def run_adaptive(args):
analysis_folder = os.path.join(parameters.pele_dir, "results")

analysis = Analysis.from_parameters(parameters)
analysis.generate(analysis_folder,
clustering_type=parameters.clustering_method.lower())
analysis.generate(
analysis_folder,
clustering_type=parameters.clustering_method.lower(),
bandwidth=parameters.bandwidth,
analysis_nclust=parameters.analysis_nclust,
max_top_clusters=parameters.max_top_clusters,
min_population=parameters.min_population)

return parameters
33 changes: 33 additions & 0 deletions pele_platform/Frag/simulation.py
Original file line number Diff line number Diff line change
Expand Up @@ -209,6 +209,39 @@ def _analysis(self):
atomCoords=self.parameters.analysis_to_point,
pattern=os.path.basename(self.parameters.system))

# TODO create a list of the libraries defined in the current input.yaml
from pele_platform.analysis import Analysis
from glob import glob

sim_directories = glob(os.path.splitext(self.parameters.system)[0]
+ '_processed_*' +
self.parameters.frag_core_atom + '*')

for sim_directory in sim_directories:
simulation_output = os.path.join(sim_directory, 'sampling_result')
analysis_folder = os.path.join(sim_directory, "results")

analysis = Analysis(resname='GRW',
chain=self.parameters.chain,
simulation_output=simulation_output,
be_column=self.parameters.be_column,
limit_column=self.parameters.limit_column,
traj=self.parameters.traj_name,
report=self.parameters.report_name,
skip_initial_structures=not self.parameters.test,
kde=self.parameters.kde,
kde_structs=self.parameters.kde_structs,
topology=self.parameters.topology,
cpus=self.parameters.cpus)

analysis.generate(
analysis_folder,
clustering_type=self.parameters.clustering_method.lower(),
bandwidth=self.parameters.bandwidth,
analysis_nclust=self.parameters.analysis_nclust,
max_top_clusters=self.parameters.max_top_clusters,
min_population=self.parameters.min_population)

def _clean_up(self, fragment_files):
for file in fragment_files:
if os.path.isfile(file):
Expand Down
110 changes: 58 additions & 52 deletions pele_platform/analysis/analysis.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
This module manages the analysis toolkit of the platform.
"""


__all__ = ["Analysis"]


Expand All @@ -18,9 +19,7 @@ class Analysis(object):
def __init__(self, resname, chain, simulation_output,
be_column=4, limit_column=None, traj="trajectory.pdb",
report=None, skip_initial_structures=True, kde=False,
kde_structs=1000, clustering_method="meanshift",
bandwidth=2.5, analysis_nclust=10, topology=None,
cpus=1, max_top_clusters=8, min_population=0.01):
kde_structs=1000, topology=None, cpus=1):
"""
It initializes an Analysis instance which it depends on
the general Parameters class of the PELE Platform.
Expand All @@ -37,7 +36,8 @@ def __init__(self, resname, chain, simulation_output,
be_column : int
Column with energy metric, default 4.
limit_column : int
Integer specifying the first column from which the meaningful metrics start, e.g. SASA or RMSD.
Integer specifying the first column from which the meaningful
metrics start, e.g. SASA or RMSD.
traj : str
Trajectory name defaults to "trajectory.pdb",
but you should use "trajectory.xtc" if using XTC format.
Expand All @@ -53,25 +53,11 @@ def __init__(self, resname, chain, simulation_output,
kde_structs : int
Maximum number of structures to consider for the KDE plot.
Default is 1000
clustering_method : str
Clustering method to be used. On of ['gaussianmixture',
'meanshift', 'hdbscan']. Default is 'meanshift'
bandwidth : float
Bandwidth for the mean shift and HDBSCAN clustering. Default is
2.5
analysis_nclust : int
Number of clusters to create when using the Gaussian mixture
model. Default is 10
topology : str
Path to the topology file, if using XTC trajectories. Default
is None
cpus: int
Number of CPUs to use. Default is 1
max_top_clusters : int
Maximum number of clusters to return. Default is 8
min_population : float
The minimum amount of structures in a cluster, takes a value
between 0 and 1. Default is 0.01 (i.e. 1%)
"""
from pele_platform.analysis import DataHandler

Expand All @@ -87,13 +73,8 @@ def __init__(self, resname, chain, simulation_output,
self.traj = traj
self.report = report if report else self._REPORT
self.skip_initial_structures = skip_initial_structures
self.clustering_method = clustering_method
self.bandwidth = bandwidth
self.analysis_nclust = analysis_nclust
self.topology = topology
self.cpus = cpus
self.max_top_clusters = max_top_clusters
self.min_population = min_population

self._data_handler = DataHandler(
sim_path=self.output,
Expand Down Expand Up @@ -134,13 +115,8 @@ def from_parameters(cls, parameters):
skip_initial_structures=not parameters.test,
kde=parameters.kde,
kde_structs=parameters.kde_structs,
clustering_method=parameters.clustering_method,
bandwidth=parameters.bandwidth,
analysis_nclust=parameters.analysis_nclust,
topology=parameters.topology,
cpus=parameters.cpus,
max_top_clusters=parameters.max_top_clusters,
min_population=parameters.min_population)
cpus=parameters.cpus)

return analysis

Expand Down Expand Up @@ -177,8 +153,7 @@ def get_dataframe(self, filter=False, threshold=None):
"""
if filter:
return self._data_handler.remove_outliers_from_dataframe(
self._dataframe, threshold
)
self._dataframe, threshold)
else:
return self._dataframe

Expand All @@ -204,7 +179,9 @@ def dataframe_to_csv(self, path, filter=False, threshold=None):
# Save it as a csv file
dataframe.to_csv(path, index=False)

def generate(self, path, clustering_type):
def generate(self, path, clustering_type='meanshift',
bandwidth=2.5, analysis_nclust=10,
max_top_clusters=8, min_population=0.01):
"""
It runs the full analysis workflow (plots, top poses and clusters)
and saves the results in the supplied path.
Expand All @@ -215,7 +192,19 @@ def generate(self, path, clustering_type):
The path where the analysis results will be saved
clustering_type : str
The clustering method that will be used to generate the
clusters
clusters. One of ['gaussianmixture', 'meanshift', 'hdbscan'].
Default is 'meanshift'
bandwidth : float
Bandwidth for the mean shift and HDBSCAN clustering. Default is
2.5
analysis_nclust : int
Number of clusters to create when using the Gaussian mixture
model. Default is 10
max_top_clusters : int
Maximum number of clusters to return. Default is 8
min_population : float
The minimum amount of structures in a cluster, takes a value
between 0 and 1. Default is 0.01 (i.e. 1%)
"""
import os

Expand All @@ -242,30 +231,26 @@ def generate(self, path, clustering_type):
# Generate analysis results
self.generate_plots(plots_folder)
best_metrics = self.generate_top_poses(top_poses_folder)
self.generate_clusters(clusters_folder, clustering_type)
self.generate_clusters(clusters_folder, clustering_type,
bandwidth, analysis_nclust,
max_top_clusters, min_population)

self.generate_report(plots_folder, top_poses_folder,
clusters_folder, best_metrics, report_file)

def generate_plots(self, path, existing_dataframe=None, colors=None):
def generate_plots(self, path):
"""
It generates the plots.
Parameters
----------
path : str
The path where the plots will be saved
existing_dataframe : pandas.Dataframe
Dataframe with data to plot.
colors : list
List of cluster IDs for colour mapping.
"""
from pele_platform.analysis import Plotter

# Get dataframe
if existing_dataframe is None:
dataframe = self.get_dataframe(filter=True)
else:
dataframe = existing_dataframe
# Get dataframe, filtering highest 2% energies out
dataframe = self.get_dataframe(filter=True)

# Initialize plotter
plotter = Plotter(dataframe)
Expand Down Expand Up @@ -344,7 +329,9 @@ def generate_top_poses(self, path, n_poses=100):

return best_metrics

def generate_clusters(self, path, clustering_type):
def generate_clusters(self, path, clustering_type,
bandwidth=2.5, analysis_nclust=10,
max_top_clusters=8, min_population=0.01):
"""
It generates the structural clustering of ligand poses.
Expand All @@ -355,14 +342,27 @@ def generate_clusters(self, path, clustering_type):
clustering_type : str
The clustering method that will be used to generate the
clusters
bandwidth : float
Bandwidth for the mean shift and HDBSCAN clustering. Default is
2.5
analysis_nclust : int
Number of clusters to create when using the Gaussian mixture
model. Default is 10
max_top_clusters : int
Maximum number of clusters to return. Default is 8
min_population : float
The minimum amount of structures in a cluster, takes a value
between 0 and 1. Default is 0.01 (i.e. 1%)
"""
import os
from pele_platform.Utilities.Helpers.helpers import check_output_folder

check_output_folder(path)

# Get clustering object
clustering, max_coordinates = self._get_clustering(clustering_type)
clustering, max_coordinates = self._get_clustering(clustering_type,
bandwidth,
analysis_nclust)

# Extract coordinates
coordinates, dataframe = self._extract_coordinates(max_coordinates)
Expand Down Expand Up @@ -395,8 +395,8 @@ def generate_clusters(self, path, clustering_type):
print(f"Retrieve best cluster poses")
cluster_subset, cluster_summary = self._select_top_clusters(
clusters, cluster_summary,
max_clusters_to_select=self.max_top_clusters,
min_population_to_select=self.min_population)
max_clusters_to_select=max_top_clusters,
min_population_to_select=min_population)

# Save cluster summary to file with information about selected labels
cluster_summary.to_csv(os.path.join(path, "info.csv"), index=False)
Expand Down Expand Up @@ -440,7 +440,7 @@ def generate_report(self, plots_path, poses_path, clusters_path,

print("PDF summary report successfully written to: {}".format(report))

def _get_clustering(self, clustering_type):
def _get_clustering(self, clustering_type, bandwidth, analysis_nclust):
"""
It returns the clustering object according to the supplied
clustering type.
Expand All @@ -458,19 +458,25 @@ def _get_clustering(self, clustering_type):
max_coordinates : int
The maximum number of coordinates to extract from the
residue
bandwidth : float
Bandwidth for the mean shift and HDBSCAN clustering. Default is
2.5
analysis_nclust : int
Number of clusters to create when using the Gaussian mixture
model. Default is 10
"""
from pele_platform.analysis import (GaussianMixtureClustering,
HDBSCANClustering,
MeanShiftClustering)

if clustering_type.lower() == "gaussianmixture":
clustering = GaussianMixtureClustering(self.analysis_nclust)
clustering = GaussianMixtureClustering(analysis_nclust)
max_coordinates = 10
elif clustering_type.lower() == "hdbscan":
clustering = HDBSCANClustering(self.bandwidth)
clustering = HDBSCANClustering(bandwidth)
max_coordinates = 10
elif clustering_type.lower() == "meanshift":
clustering = MeanShiftClustering(self.bandwidth)
clustering = MeanShiftClustering(bandwidth)
max_coordinates = 5
else:
raise ValueError("Invalid clustering type: " +
Expand Down
13 changes: 12 additions & 1 deletion pele_platform/analysis/data.py
Original file line number Diff line number Diff line change
Expand Up @@ -154,8 +154,19 @@ def get_reports_dataframe(self, from_scratch=False):
epochs = [os.path.basename(path) for path in epoch_dirs
if os.path.basename(path).isdigit()]

# Sort epochs by number
epochs = sorted(epochs, key=int)

# Tweak to read a directory from standard PELE (not coming
# from adaptive)
if len(epochs) == 0:
report_dirs = glob.glob(os.path.join(self._sim_path,
report_prefix + '_[0-9]*'))
if len(report_dirs) > 0:
epochs = ['']

dataframe_lists = []
for adaptive_epoch in sorted(epochs, key=int):
for adaptive_epoch in epochs:
folder = os.path.join(self._sim_path, str(adaptive_epoch))
report_dirs = glob.glob(os.path.join(folder,
report_prefix + '_[0-9]*'))
Expand Down

0 comments on commit 0c665e9

Please sign in to comment.