Source code for metquest.pathway_assembler

# -*- coding: utf-8 -*-

from __future__ import absolute_import

import math
import itertools
from time import clock
from numpy import prod
from networkx import get_node_attributes
from metquest.guided_bfs import forward_pass
from metquest.generate_partitions import generate_partitions


[docs]def find_pathways(G, seed_mets_input, path_len_cutoff, *args):
    """
    This function tries to identify pathways between a set of seed and
    target metabolites of a given size cut-off.

    Parameters
    ----------
    G : NetworkX DiGraph Object
        Bipartite graph of the metabolic network
    seed_mets_input : set
        Set of seed metabolites including the source
    path_len_cutoff : int
        Maximum size of the pathways
    *args
        Used to decide if a particular combination has to be evaluated or not.
        i.e., if the number of pathways produced for two different metabolites
        are higher, for instance, if in the reaction A + B -> C,
        A has 2000 pathways and B has 5 pathways, then C will have 10000 pathways
        at the maximum. If this pathway cutoff (maxnumpath) is 1000,
        this combination will not be evaluated, provided C has been
        already found.
        By default, it is set to 1000

    Returns
    -------
    pathway_table : dict
        Dictionary of dictionary containing the pathways of different sizes
        identified for every metabolite. This will have only the acyclic/
        branched pathways.
    cyclic_pathways : dict
        Dictionary of dictionary containing cyclic pathways of different sizes
        identified for every metabolite.
    scope : set
        Set of metabolites which can be synthesised
    """

    global succ, pred, lower_bound_metabolite, maxnumpath, seedmets, \
        pathway_table, cyclic_pathways
    pathway_table = {}
    cyclic_pathways = {}
    tic = clock()
    #  Setting the cutoff for maximum number of pathways
    if args:
        for maxnumpath_input in args:
            maxnumpath = maxnumpath_input
    else:
        maxnumpath = 1000
    # DiGraph definitions successors and predecessors
    succ = G.successors
    pred = G.predecessors
    seedmets = seed_mets_input
    # Removing reactions whose reactants are more than 5
    node_attributes = get_node_attributes(G, 'bipartite')
    node_attributes_inverted_dict = {}
    # 0 metabolites , 1 are reactions
    for keys, values in node_attributes.items():
        node_attributes_inverted_dict[values] = \
            node_attributes_inverted_dict.get(values, [])
        node_attributes_inverted_dict[values].append(keys)
    # If the number of metabolites, apart from the ones provided
    # in seed are greater than 5, such reactions are removed
    for rxnstoremove in node_attributes_inverted_dict[1]:
        if len(set(pred(rxnstoremove)) - seedmets) >= 5:
            G.remove_node(rxnstoremove)
    # Performing guided BFS on directed graph by calling forward_pass
    lower_bound_metabolite, status_dict, scope = forward_pass(G, seedmets)
    # Sorting the keys (reactions) in the status dictionary,
    # since dictionary keys are not good to iterate over.
    # There could be differences in the order of insertion of
    # dictionary keys. Although, this does not matter with the
    # algorithm implementation.
    rxns_to_visit = list(status_dict.keys())
    rxns_to_visit.sort()
    # For seed metabolites, the pathway table is initialised to 0
    for seedmetabs in list(seedmets):
        pathway_table[seedmetabs] = {0: ''}
    # Status dict consists of all the reactions that can be
    # visited from the seed metabolites
    for rxns in rxns_to_visit:
        if set(pred(rxns)).issubset(seedmets):
            # Initialisation of dictionary with the
            # metabolites produced with one rxn
            for metssucc in succ(rxns):
                if metssucc not in pathway_table:
                    pathway_table[metssucc] = {1: []}
            # Filling table with one reaction that produced metabolite
            for metssucc in succ(rxns):
                # Since we don't want pathways generating seed metabolites
                if metssucc not in seedmets:
                    pathway_table[metssucc][1].append(set([rxns]))

	# For filling values from the second column
    for currentcolumnidx in range(2, path_len_cutoff+1):
        for rxns in status_dict:  # rxns_to_visit:
            # To eliminate seed metabolites, whose column value
            # is always 0 - so that more partitions are not generated.
            mets_needed = list(set(pred(rxns)) - seedmets)
            # To only go over reactions whose inputs are not
            # seed metabolites. There could be reactions whose inputs
            # are only seed mets, eg atp + h2o
            if mets_needed:
                for val in range(currentcolumnidx-1,
                                 len(mets_needed)*(currentcolumnidx-1)+1):
                    if val <= len(mets_needed)*(currentcolumnidx-2):
                        _first_round_calculations(
                            mets_needed, currentcolumnidx, rxns, val)
                    else:
                        _second_round_calculations(
                            mets_needed, currentcolumnidx, rxns, val)
    toc = clock()
    timetaken = toc - tic
    print('Time taken', timetaken)
    return pathway_table, cyclic_pathways, scope


def _first_round_calculations(mets_needed, currentcolumnidx, rxns, val):
    """
    This function takes as input metabolites required by the reaction,
    current column index (pathway length) which we are trying to fill,
    the current reaction which we are considering and the sum that we
    need to generate. This function tries to calculate the upper limit
    on the number of inputs that can take the value of (k-1) and generate
    partitions which have not been previously generated. For more
    details, please refer main manuscript (Algorithm 1 - Lines 14-19)

    Parameters
    ----------
    mets_needed : list
        List of metabolites a reaction requires
    currentcolumnidx : int
        An integer denoting the current column which is evaluated
    rxns : str
        Current reaction which is evaluated
    val : int
        Maximum sum that is to be generated

    Returns
    -------
    None
    """

    # Line 15 in the algorithm - Upper limit on the number of
    # inputs that can take the value of (k-1)
    optimized_val = int(math.floor(val/((currentcolumnidx-1))))
    for currentval in range(1, optimized_val+1):
        # GENERATING COMBINATIONS
        # This will give combinations of  metabolites whose size is
        # defined by currentval
        for currentmetcomb in itertools.combinations(mets_needed, currentval):
            temp_rxn_list = []
            number_of_pathways_found = {}
            onemetnotfound = ''
            for metabolites in list(currentmetcomb):
                if metabolites in pathway_table:
                    # To ensure that the current iteration uses metabs
                    # generated only till the previous iteration
                    if currentcolumnidx - 1 in pathway_table[metabolites]:
                        temp_rxn_list.append([[rxns]])
                        number_of_pathways_found[metabolites] = \
                            len(pathway_table[metabolites][currentcolumnidx-1])
                    else:
                        # Because one of the metabolites is not found,
                        # hence breaks out of the second loop
                        onemetnotfound = 'Y'
                        break
            if onemetnotfound != 'Y':
                other_mets_not_in_comb = list(set(mets_needed) - set(currentmetcomb))
                if set(currentmetcomb).issubset(pathway_table):
                    for mets in currentmetcomb:
                        # Assigning the value of columnindex -1
                        # to the metabolites in current metabolite combination
                        temp_rxn_list.append(pathway_table[mets][currentcolumnidx-1])

                    first_discovery_step = []
                    # This will give values of the lower bound of
                    # metabolites which are not involved in combination
                    for varmet in list(other_mets_not_in_comb):
                        first_discovery_step.append(min(lower_bound_metabolite[varmet]))
                    all_partitions = generate_partitions(val-((currentcolumnidx-1)*currentval),
                                                         first_discovery_step, currentcolumnidx - 1)
                    for partitions in all_partitions:
                        _find_all_rxn_combination_firstround(rxns, partitions, other_mets_not_in_comb,
                                                             temp_rxn_list, number_of_pathways_found,
                                                             currentcolumnidx)


def _find_all_rxn_combination_firstround(rxns, partitions, other_mets_not_in_comb,
                                        temp_rxn_list, number_of_pathways_found, currentcolumnidx):
    """
    This function fetches the pathways from the table corresponding to
    the entries in partition generated.

    Parameters
    ----------
    rxns : str
        Current reaction which is evaluated
    paritions : tuple
        Combinations of numbers that would geenrate the required sum
    other_mets_not_in_comb : list
        other metabolites participating in the reaction for which a
        number from the partition has not been assigned yet
    temp_rxn_list : list
        a list consisting of all pathways that generated the metabolite
        for which the value of (k-1) is assigned
    number_of_pathways_found : dict
        number of pathways found for the metabolite evaluated
    currentcolumnidx : int
         value of the current column index (pathway length)

    Returns
    -------
    None
    """
    # To check if all the other metabolites required by that reaction
    # can be generated using the partitions.
    counter = 0
    more_pathways_found = ''
    for varmetidx in range(len(other_mets_not_in_comb)):
        if other_mets_not_in_comb[varmetidx] in pathway_table:
            # checking if the pathway table consists of values from the
            # current partition for the input metabolite
            if partitions[varmetidx] in pathway_table[other_mets_not_in_comb[varmetidx]]:
                counter += 1
                number_of_pathways_found[other_mets_not_in_comb[varmetidx]] = \
                    len(pathway_table[other_mets_not_in_comb[varmetidx]][partitions[varmetidx]])
    if counter == len(other_mets_not_in_comb):
        if all([prod(list(number_of_pathways_found.values())) > maxnumpath,
                set(succ(rxns)).issubset(set(pathway_table))]):
            more_pathways_found = 'Y'
        else:
            # Deep copy of the reaction list, because temp_rxn_list_current
            # varies with every iteration to evaluate other partitions
            temp_rxn_list_current = temp_rxn_list[:]
            for varmetidx in range(len(other_mets_not_in_comb)):
                temp_rxn_list_current.append(
                    pathway_table[other_mets_not_in_comb[varmetidx]][partitions[varmetidx]])
            _populate_table(rxns, temp_rxn_list_current, currentcolumnidx)


def _populate_table(rxns, temp_rxn_list_current, currentcolumnidx):
    """
    This function fills in the entry in the main pathway table. It also
    evaluates the pathways and identifies if its cyclic. If the
    pathway is a cyclic pathway, it is stored separately, and is not
    used in the subsequent calculations.

    Parameters
    ----------
    rxns : str
        Current reaction which is evaluated
    paritions : tuple
        Combinations of numbers that would geenrate the required sum
    temp_rxn_list_current : list of lists
        a list of lists consisting of all the alternate pathways
        producing the metabolites required by the reaction
    currentcolumnidx : int
         value of the current column index (pathway length)

    Returns
    -------
    None
    """
    #  Temprxnlist consists of all combinations of pathways
    #  producing all the input metabolites
    for rxnunion in itertools.product(*temp_rxn_list_current):
        reaction_combntn = set([])
        for rxnentry in rxnunion:
            for individualele in rxnentry:
                reaction_combntn.add(individualele)
        metabs_in_cycle = set([])

        for currentrxn in reaction_combntn:
            for inputmetab in pred(currentrxn):
                if inputmetab not in seedmets:
                    metabs_in_cycle.add(inputmetab)
        for succmets in succ(rxns):
            if succmets not in seedmets:
                if len(reaction_combntn) >= currentcolumnidx:
                    if succmets in pathway_table:
                        if succmets in metabs_in_cycle:
                            if succmets in cyclic_pathways:
                                cyclic_pathways[succmets].update(
                                    {len(reaction_combntn): [list(reaction_combntn)]})
                            else:
                                cyclic_pathways[succmets] = \
                                    {len(reaction_combntn): [list(reaction_combntn)]}
                        else:
                            if len(reaction_combntn) in pathway_table[succmets]:
                                try:
                                    if pathway_table[succmets][len(reaction_combntn)]. \
                                                                index(reaction_combntn):
                                        pass  # Because this entry is already in the pathway_table
                                except ValueError:
                                    pathway_table[succmets][len(reaction_combntn)].append(
                                        reaction_combntn)
                            else:
                                pathway_table[succmets].update(
                                    {len(reaction_combntn): [reaction_combntn]})

                    else:
                        pathway_table[succmets] = {len(reaction_combntn): [reaction_combntn]}

def _second_round_calculations(mets_needed, currentcolumnidx, rxns, val):
    """
    This function takes into account all the metabolites required by the
    reaction and based on the partition of numbers, fetches the values
    from the pathway table. Since there can be multiple alternate
    routes to produce the same metabolite, this function makes a list
    of all possible pathways (of the given size) that produces this
    metabolite of interest.

    Parameters
    ----------
    mets_needed : list
        List of metabolites a reaction requires
    currentcolumnidx : int
        An integer denoting the current column which is evaluated
    rxns : str
        Current reaction which is evaluated
    val : int
        Maximum sum that is to be generated

    Returns
    -------
    None
    """
    first_discovery_step = []
    for predmets in mets_needed:
        first_discovery_step.append(min(lower_bound_metabolite[predmets]))
    all_partitions = generate_partitions(val, first_discovery_step, currentcolumnidx-1)
    for partitions in all_partitions:
        temp_rxn_list = []
        number_of_pathways_found = {}
        more_pathways_found = ''
        counter_new = 0
        temp_rxn_list.append([[rxns]])
        for item in range(len(mets_needed)):
            if mets_needed[item] in pathway_table:
                if partitions[item] in pathway_table[mets_needed[item]]:
                    number_of_pathways_found[mets_needed[item]] = \
                        len(pathway_table[mets_needed[item]][partitions[item]])
                    counter_new += 1
        if counter_new == len(mets_needed):
            if all([prod(list(number_of_pathways_found.values())) > maxnumpath,
                    set(succ(rxns)).issubset(set(pathway_table))]):
                more_pathways_found = 'NA'
            else:
                for item in range(len(mets_needed)):
                    temp_rxn_list.append(pathway_table[mets_needed[item]][partitions[item]])
                _populate_table(rxns, temp_rxn_list, currentcolumnidx)