Source code for pytfa.analysis.sampling

"""
.. module:: pytfa
   :platform: Unix, Windows
   :synopsis: Thermodynamics-based Flux Analysis

.. moduleauthor:: pyTFA team

Sampling wrappers for pytfa models


"""

import numpy as np
from sympy.core.singleton import S
from time import  time
from  cobra.sampling import OptGPSampler, ACHRSampler, HRSampler,\
                                            shared_np_array
from optlang.interface import OPTIMAL

[docs]class GeneralizedHRSampler(HRSampler):

    def __init__(self, model, thinning,  nproj=None, seed=None):
        """
        Adapted from cobra.flux_analysis.sampling.py
        _________________________________________

        Initialize a new sampler object.
        """
        # This currently has to be done to reset the solver basis which is
        # required to get deterministic warmup point generation
        # (in turn required for a working `seed` arg)
        HRSampler.__init__(self, model, thinning, seed=seed)
        
        if model.solver.is_integer:
            raise TypeError("sampling does not work with integer problems :(")
        self.model = model.copy()

        self.thinning = thinning

        if nproj is None:
            self.nproj = int(min(len(self.model.variables)**3, 1e6))
        else:
            self.nproj = nproj
        self.n_samples = 0
        self.retries = 0
        
        # Careful, double underscore names are mangled to the class name
        self.problem = self._HRSampler__build_problem()

        # Set up a map from reaction -> forward/reverse variable
        var_idx = {v: idx for idx, v in enumerate(self.model.variables)}
        self.var_idx = np.array(
            [var_idx[v] for v in self.model.variables])
        self.warmup = None
        if seed is None:
            self._seed = int(time())
        else:
            self._seed = seed
        # Avoid overflow
        self._seed = self._seed % np.iinfo(np.int32).max

    def generate_fva_warmup(self):
        """
        Adapted from cobra.flux_analysis.sampling.py
        __________________________________________

        Generate the warmup points for the sampler.

        Generates warmup points by setting each flux as the sole objective
        and minimizing/maximizing it. Also caches the projection of the
        warmup points into the nullspace for non-homogeneous problems (only
        if necessary).
        """
        self.n_warmup = 0
        idx = np.hstack([self.var_idx])
        self.warmup = np.zeros((len(idx), len(self.model.variables)))
        self.model.objective = S.Zero
        self.model.objective.direction = "max"
        variables = self.model.variables
        for i in idx:
            # Omit fixed reactions
            if self.problem.variable_fixed[i]:
                self.model.logger.info("skipping fixed variable %s" % variables[i].name)
                continue
            self.model.objective.set_linear_coefficients({variables[i]: 1})
            self.model.slim_optimize()
            if not self.model.solver.status == OPTIMAL:
                self.model.logger.info(
                    "can not maximize variable %s, skipping it" % variables[
                        i].name)
                continue
            primals = self.model.solver.primal_values
            sol = [primals[v.name] for v in self.model.variables]
            self.warmup[self.n_warmup,] = sol
            self.n_warmup += 1
            # revert objective
            self.model.objective.set_linear_coefficients({variables[i]: 0})
        # Shrink warmup points to measure
        self.warmup = shared_np_array((self.n_warmup, len(variables)),
                                      self.warmup[0:self.n_warmup, ])


# Next, we redefine the analysis class as both inheriting from the
# GeneralizedHRSampler, and then the original classes. This will overwrite the
# inherited methods from HRSampler, while keeping the lower level ones from the
# samplers

[docs]class GeneralizedACHRSampler(GeneralizedHRSampler,ACHRSampler):
    def __init__(self, model, thinning=100, seed=None):
        """
        Adapted from cobra.flux_analysis.analysis
        __________________________________________
        Initialize a new ACHRSampler."""
        GeneralizedHRSampler.__init__(self, model, thinning, seed=seed)
        self.generate_fva_warmup()
        self.prev = self.center = self.warmup.mean(axis=0)
        np.random.seed(self._seed)

[docs]class GeneralizedOptGPSampler(GeneralizedHRSampler, OptGPSampler):
    def __init__(self, model, processes, thinning=100, seed=None):
        """
        Adapted from cobra.flux_analysis.sampling.py
        __________________________________________
        Initialize a new OptGPSampler."""
        GeneralizedHRSampler.__init__(self, model, thinning, seed=seed)
        self.generate_fva_warmup()
        self.processes = processes

        # This maps our saved center into shared memory,
        # meaning they are synchronized across processes
        self.center = shared_np_array((len(self.model.variables), ),
                                      self.warmup.mean(axis=0))


[docs]def sample(model, n, method="optgp", thinning=100, processes=1, seed=None):
    """
    Sample valid flux distributions from a thermo cobra_model.

    Function adapted from cobra.flux_analysis.sample to display all solver
    variables

    **Documentation adapted from cobra.flux_analysis.sample**

    1. 'optgp' (default) which uses the OptGPSampler that supports parallel
        analysis [1]_. Requires large numbers of samples to be performant
        (n < 1000). For smaller samples 'achr' might be better suited.

    or

    2. 'achr' which uses artificial centering hit-and-run. This is a single
       process method with good convergence [2]_.

    Parameters
    ----------
    model : pytfa.core.ThermoModel
        The cobra_model from which to sample variables.
    n : int
        The number of samples to obtain. When using 'optgp' this must be a
        multiple of `processes`, otherwise a larger number of samples will be
        returned.
    method : str, optional
        The analysis algorithm to use.
    thinning : int, optional
        The thinning factor of the generated analysis chain. A thinning of 10
        means samples are returned every 10 steps. Defaults to 100 which in
        benchmarks gives approximately uncorrelated samples. If set to one
        will return all iterates.
    processes : int, optional
        Only used for 'optgp'. The number of processes used to generate
        samples.
    seed : positive integer, optional
        The random number seed to be used. Initialized to current time stamp
        if None.

    Returns
    -------
    pandas.DataFrame
        The generated flux samples. Each row corresponds to a sample of the
        fluxes and the columns are the reactions.

    Notes
    -----
    The samplers have a correction method to ensure equality feasibility for
    long-running chains, however this will only work for homogeneous models,
    meaning models with no non-zero fixed variables or constraints (
    right-hand side of the equalities are zero).

    References
    ----------
    .. [1] Megchelenbrink W, Huynen M, Marchiori E (2014)
       optGpSampler: An Improved Tool for Uniformly Sampling the Solution-Space
       of Genome-Scale Metabolic Networks.
       PLoS ONE 9(2): e86587.
    .. [2] Direction Choice for Accelerated Convergence in Hit-and-Run Sampling
       David E. Kaufman Robert L. Smith
       Operations Research 199846:1 , 84-95
    """
    if method == "optgp":
        sampler = GeneralizedOptGPSampler(model, processes, thinning=thinning, seed=seed)
    elif method == "achr":
        sampler = GeneralizedACHRSampler(model, thinning=thinning, seed=seed)
    else:
        raise ValueError("method must be 'optgp' or 'achr'!")

    return sampler.sample(n, fluxes = False)