Source code for metquest.execute_metquest

# -*- coding: utf-8 -*-
from __future__ import absolute_import
import os
import sys
from collections import Counter
from itertools import combinations
from metquest.pathway_assembler import find_pathways
from metquest.construct_graph import create_graph


[docs]def write_output_to_file(pathway_table, currenttarmet, cutoff, cyclic_pathways,
                         folder_to_create, namemap, source_metabolites, G):
    """
    This function writes the pathways of sizes less than or equal to the
    cutoff from source to the target and seed metabolites to target.
    This function also writes cyclic pathways of sizes less than or equal to
    cutoff from the source to target.

    Parameters
    ----------
    pathway_table : dict
        Dictionary of dictionary containing the pathways of different sizes
        identified for every metabolite. This will have only the acyclic/
        branched pathways.
    currenttarmet : str
        Current target metabolite
    cutoff : int
        Maximum pathway length cutoff
    cyclic_pathways : dict
        Dictionary of dictionary containing cyclic pathways of different sizes
        identified for every metabolite.
    folder_to_create : str
        Name of the folder where results have to be written
    namemap : dict
        Dictionary mapping the adhoc reaction names to reaction names in
        the model
    source_metabolites : list
        List of source metabolites
    G : NetworkX DiGraph Object
        Bipartite graph of the metabolic network


    Returns
    -------
    None
    """
    pred = G.predecessors
    succ = G.successors
    all_pathways_count = []
    path_count = []
    cyclic_pathway_count = []
    if currenttarmet in pathway_table:
        for plen in pathway_table[currenttarmet]:
            all_pathways_count.append(
                len(pathway_table[currenttarmet][plen]))
            if plen <= int(cutoff):
                path_count.append(
                    len(pathway_table[currenttarmet][plen]))
        if currenttarmet in cyclic_pathways:
            for plen in cyclic_pathways[currenttarmet]:
                if plen <= int(cutoff):
                    cyclic_pathway_count.append(
                        len(cyclic_pathways[currenttarmet][plen]))
            cycfname = folder_to_create + 'cyclic_pathways_' + currenttarmet.replace(' ', '') + \
                '_' + 'leq_plen_' + str(cutoff) + '.txt'
            pathnumcount = 0
            with open(cycfname, 'w') as filetowrite:
                print('\nWriting cyclic pathways to a file')
                for idx in cyclic_pathways[currenttarmet]:
                    if idx <= int(cutoff):
                        filetowrite.write('Path length ' +
                                          str(idx) + '\n')
                    for items in cyclic_pathways[currenttarmet][idx]:
                        pathnumcount += 1
                        filetowrite.write(
                            str(pathnumcount) + '\n')
                        for entities in list(items):
                            filetowrite.write(
                                namemap[entities] + '\t' + ' + '.join(pred(entities)) +
                                '->' + ' + '.join(succ(entities)))
                            filetowrite.write('\n')
                        filetowrite.write('--------------------\n')

        if int(cutoff) in pathway_table[currenttarmet]:
            for sourcemets in source_metabolites:
                seedfname = folder_to_create + 'branched_pathways_from_seed_' + \
                        currenttarmet.replace(' ', '') + \
                        '_' + 'leq_plen_' + str(cutoff) + '.txt'
                pathnumcount = 0
                with open(seedfname, 'w') as filetowrite:
                    print('Writing branched pathways (from seed) to a file')
                    for idx in pathway_table[currenttarmet]:
                        if idx <= int(cutoff):
                            filetowrite.write('Path length ' +
                                              str(idx) + '\n')
                            for items in pathway_table[currenttarmet][idx]:
                                pathnumcount += 1
                                filetowrite.write(
                                    str(pathnumcount) + '\n')
                                for entities in list(items):
                                    filetowrite.write(
                                        namemap[entities] + '\t' +
                                        ' + '.join(pred(entities))
                                        + '->' + ' + '.join(succ(entities)))
                                    filetowrite.write('\n')
                                filetowrite.write(
                                    '--------------------\n')
                            filetowrite.write('--------------------\n')
                only_source_to_target = []

                for sourcemets in source_metabolites:
                    for idx in pathway_table[currenttarmet]:
                        if idx <= int(cutoff):
                            for items in pathway_table[currenttarmet][idx]:
                                if set(succ(sourcemets)).intersection(items):
                                    only_source_to_target.append(
                                        list(items))
                    if only_source_to_target:
                        sourcefname = folder_to_create + 'branched_pathways_from_source_' + \
                                    currenttarmet.replace(' ', '') + \
                                    '_' + 'leq_plen_' + str(cutoff) + '.txt'

                        with open(sourcefname, 'w') as filetowrite:
                            print('Writing branched pathways (from source) to a file \n')
                            for currentidx, listentries in enumerate(only_source_to_target):
                                filetowrite.write(str(currentidx+1) + '\n')
                                filetowrite.write('Path length ' +
                                                  str(len(listentries)) + '\n')
                                for entities in listentries:
                                    filetowrite.write(
                                        namemap[entities] + '\t' +
                                        ' + '.join(pred(entities))
                                        + '->' + ' + '.join(succ(entities)))
                                    filetowrite.write('\n')
                                filetowrite.write('--------------------\n')
    else:
        print(currenttarmet, ': Target could not be found.')
        print('Consider changing the cut-off or the seed metabolite set')


[docs]def find_pathways_starting_from_source(source_metabolites, pathway_table, currenttarmet, cutoff, G):
    """
    This function finds all pathways starting from the source metabolites

    Parameters
    ----------
    source_metabolites : list
        List of source metabolites
    pathway_table : dict
        Dictionary of dictionary containing the pathways of different sizes
        identified for every metabolite. This will have only the acyclic/
        branched pathways.
    currenttarmet : str
        Current target metabolite
    cutoff : int
        Maximum pathway length cutoff
    G : NetworkX DiGraph Object
        Bipartite graph of the metabolic network

    Returns
    -------
    most_different_paths : dict
        For the given source metabolite, a combination of two most different pathways
        based on minimum Jaccard value is returned.
    only_source_to_target : list
        list of list containing all pathways starting from source metabolite

    """

    succ = G.successors
    most_different_paths = {}
    only_source_to_target = []
    if currenttarmet in pathway_table:
        for sourcemets in source_metabolites:
            for idx in pathway_table[currenttarmet]:
                if idx <= int(cutoff):
                    for items in pathway_table[currenttarmet][idx]:
                        if set(succ(sourcemets)).intersection(items):
                            only_source_to_target.append(
                                list(items))
            if len(only_source_to_target) > 1:
                # Sometimes there can be only one pathway producing target
                # To find most different paths from source
                j_value, rxn_comb = find_jaccard_between_paths(
                    only_source_to_target)
                min_j_index = j_value.index(min(j_value))
                most_different_paths[sourcemets] = rxn_comb[min_j_index]
    else:
        print(currenttarmet, ': Target could not be found.')
        print('Consider changing the cut-off or the seed metabolite set')
    return most_different_paths, only_source_to_target



[docs]def print_summary(scope, currenttarmet, pathway_table, cutoff, cyclic_pathways, namemap,
                  source_metabolites, seed_metabolites, number_of_xml, G):
    """
    This function prints the results summary obtained from the pathways, i.e.,
    1. Number of metabolites in scope
    2. Target metabolite
    3. Pathway size cutoff
    4. Number of all branched pathways found from seed
    5. Number of all branched pathways from seed whose size <= Pathway size cutoff
    6. Minimum number of steps to produce target metabolite
    7. Number of branched pathways from source whose size <= Pathway size cutoff
    8. Target metabolite can be produced using cyclic pathway
    9. Number of cyclic pathways whose size <= Pathway size cutoff
    10. One of the combination of most different pathways producing target metabolite
    11. Important reactions based on the frequency of occurrences

    Parameters
    ----------
    scope : set
        Set of metabolites that can be produced from the given set of
        seed metabolites
    currenttarmet : str
        Current target metabolite
    pathway_table : dict
        Dictionary of dictionary containing the pathways of different sizes
        identified for every metabolite. This will have only the acyclic/
        branched pathways.
    cutoff : int
        Maximum pathway length cutoff
    cyclic_pathways : dict
        Dictionary of dictionary containing cyclic pathways of different sizes
        identified for every metabolite.
    namemap : dict
        Dictionary mapping the adhoc reaction names to reaction names in
        the model
    source_metabolites : list
        List of source metabolites
    seed_metabolites : set
        Set of seed metabolites including the source
    number_of_xml : int
        Number of xml files in the folder
    G : NetworkX DiGraph Object
        Bipartite graph of the metabolic network

    Returns
    -------
    None

    """
    pred = G.predecessors
    succ = G.successors
    all_pathways_count = []
    path_count = []
    cyclic_pathway_count = []
    print('\n')
    print('---------------')
    print('Summary')
    print('---------------')
    print('Number of metabolites in scope : ', len(scope))
    print('Target metabolite : ', currenttarmet)
    print('Pathway size cutoff : ', str(cutoff))
    if currenttarmet in pathway_table:
        for plen in pathway_table[currenttarmet]:
            all_pathways_count.append(
                len(pathway_table[currenttarmet][plen]))
            if plen <= int(cutoff):
                path_count.append(
                    len(pathway_table[currenttarmet][plen]))
        print('Number of all branched pathways found from seed', ':', sum(all_pathways_count))
        print('Number of all branched pathways from seed whose size <=', cutoff, ':', sum(path_count))
        minsteps = min(pathway_table[currenttarmet])
        print('Minimum number of steps to produce ',
              currenttarmet, ' : ', int(minsteps))
        # Finding essential reactions and exchange metabolites
        all_reactions_involved = []
        for plen in pathway_table[currenttarmet]:
            for pathways in pathway_table[currenttarmet][plen]:
                for reactions in pathways:
                    all_reactions_involved.append(reactions)
        exchange_candidates_inverted_dict = find_pathways_involving_exchange_mets(number_of_xml,
                                                                                  pathway_table,
                                                                                  currenttarmet,
                                                                                  seed_metabolites,
                                                                                  namemap, G)
        most_different_paths, only_source_to_target = find_pathways_starting_from_source(source_metabolites,
                                                                                         pathway_table,
                                                                                         currenttarmet,
                                                                                         cutoff, G)
        # Two most different paths
        if most_different_paths:
            print('Number of branched pathways from source whose size <=',
                  cutoff, ':', len(only_source_to_target))

        if currenttarmet in cyclic_pathways:
            print(currenttarmet, 'can be produced using cyclic pathway')
            for plen in cyclic_pathways[currenttarmet]:
                if plen <= int(cutoff):
                    cyclic_pathway_count.append(
                        len(cyclic_pathways[currenttarmet][plen]))
            print('Number of cyclic pathways whose size <=',
                  cutoff, ':', sum(cyclic_pathway_count))
        else:
            print(currenttarmet, 'cannot be produced using cyclic pathway')
        if most_different_paths:
            print('\n')
            print('One of the combination of most different pathways producing target metabolite')
            print('Note - There can be other combinations that can be found')
            print('For finding all the combinations, please use the function find_jaccard_between_paths')
            for sourcemets in most_different_paths:
                counterpaths = 0
                for pathway in most_different_paths[sourcemets]:
                    counterpaths += 1
                    print('Pathway', str(counterpaths))
                    pathway.sort()
                    for rxns in pathway:
                        print(namemap[rxns], ' + '.join(list(pred(rxns))), '-->',
                              ' + '.join(list(succ(rxns))))
                    print('\n')
        else:
            print('No/only one pathway starting from source')
            print('Two most different paths from source : None')
        find_important_reactions(all_reactions_involved, currenttarmet, seed_metabolites, namemap, G)
    else:
        print(currenttarmet, ': Target could not be found.')
        print('Consider changing the cut-off or the seed metabolite set')


[docs]def find_important_reactions(all_reactions_involved, currenttarmet, seed_metabolites, namemap, G):
    """
    This function determines the important reactions based on the pathways
    generated for the target metabolite.

    Parameters
    ----------
    all_reactions_involved : list
        list of all reactions found in all the pathways from source to target
    currenttarmet : str
        Current target metabolite
    seed_metabolites : set
        Set of seed metabolites including the source
    namemap : dict
        Dictionary mapping the adhoc reaction names to reaction names in
        the model
    G : NetworkX DiGraph Object
        Bipartite graph of the metabolic network


    Returns
    -------
    None

    Notes
    -----
    We define important reactions as those reactions which occur in almost all
    the pathways producing the target metabolite (apart from the reactions that
    are involved in the production of target metabolite and the uptake of seed
    metabolite)

    """
    pred = G.predecessors
    important_candidates = Counter(all_reactions_involved)
    important_reactions_inverted_dict = {}
    for keys, values in important_candidates.items():
        important_reactions_inverted_dict[values] = \
            important_reactions_inverted_dict.get(values, [])
        important_reactions_inverted_dict[values].append(keys)
    number_of_occurrences = list(important_reactions_inverted_dict.keys())
    number_of_occurrences.sort()
    top_candidate = number_of_occurrences[:-6:-1]

    important_reactions = []
    for numrepeats in top_candidate:
        for rxns in important_reactions_inverted_dict[numrepeats]:
            if rxns not in pred(currenttarmet):
                if not set(pred(rxns)).issubset(seed_metabolites):
                    important_reactions.append(rxns)
    important_reactions_model_names = []
    for rxns in important_reactions:
        important_reactions_model_names.append(namemap[rxns])
    important_reactions_model_names.sort()
    if important_reactions:
        print('Important reactions based on the frequency of occurrence are')
        print('\n'.join(important_reactions_model_names))
    else:
        print('All reactions pertain to uptake of seed metabolite/ production of target metabolite')


[docs]def find_pathways_involving_exchange_mets(number_of_xml, pathway_table, currenttarmet,
                                          seed_metabolites, namemap, G):
    """
    This function identifies the pathways producing the target metabolites,
    which involve exchange metabolites. This function prints output only when
    a community of organisms is considered, i.e., when more than one metabolic
    network is used.

    Parameters
    ----------
    number_of_xml : int
        Number of xml files in the folder
    pathway_table : dict
        Dictionary of dictionary containing the pathways of different sizes
        identified for every metabolite. This will have only the acyclic/
        branched pathways.
    currenttarmet : str
        Current target metabolite
    seed_metabolites : set
        Set of seed metabolites including the source
    namemap : dict
        Dictionary mapping the adhoc reaction names to reaction names in
        the model
    G : NetworkX DiGraph Object
        Bipartite graph of the metabolic network

    Returns
    -------
    exchange_candidates_inverted_dict : dict
        Dictionary containing the number of times an exchange reaction is
        repeated

    """
    pred = G.predecessors
    succ = G.successors
    exchange_reactions = []
    exchange_candidates_inverted_dict = {}
    if number_of_xml > 1:
        for plen in pathway_table[currenttarmet]:
            for pathways in pathway_table[currenttarmet][plen]:
                for reactions in pathways:
                    if not set(pred(reactions)).issubset(seed_metabolites):
                        #  'ER' is an adhoc reaction name assigned to exchange
                        #  reactions in the models.
                        if 'ER' in reactions:
                            exchange_reactions.append(reactions)
        exchange_candidates = Counter(exchange_reactions)

        for keys, values in exchange_candidates.items():
            exchange_candidates_inverted_dict[values] = \
                exchange_candidates_inverted_dict.get(values, [])
            exchange_candidates_inverted_dict[values].append(keys)
        number_of_occurrences = list(exchange_candidates_inverted_dict.keys())
        number_of_occurrences.sort()
        # [:-6:-1] # Taking the top reaction
        top_candidate = number_of_occurrences[:-10:-1]
        exchange_reactions = []
        for numrepeats in top_candidate:
            for rxns in exchange_candidates_inverted_dict[numrepeats]:
                if rxns not in pred(currenttarmet):
                    if not set(pred(rxns)).issubset(seed_metabolites):
                        exchange_reactions.append(rxns)
        if exchange_reactions:
            print('Exchange reactions are')
            for rxns in exchange_reactions:
                print(namemap[rxns], list(pred(rxns)), list(succ(rxns)))
        else:
            print('No metabolite exchanged')
    return exchange_candidates_inverted_dict


[docs]def find_jaccard_between_paths(only_source_to_target):
    """
    This function determines the jaccard values between the pathways generated
    from the source to the target.

    Parameters
    ----------
    only_source_to_target : list
        list of lists consisting of all pathways producing the target
        metabolite from the source

    Returns
    -------
    jaccard_values : list
        list of all jaccard values (float) for all the pathway combinations
    path_combinations : list
        list of all pathway combinations corresponding to the jaccard values

    Notes
    -----
    Jaccard value J = (set(A).intersection(set(B)))/(set(A).union(set(B)))
    J = 1 indicates two sets are the same
    J = 0 indicates two sets are different

    """

    jaccard_values = []
    path_combinations = []
    for reactionlists in combinations(only_source_to_target, 2):
        j_value = len(set(reactionlists[0]).intersection(
            set(reactionlists[1])))/len(set(reactionlists[0]).union(set(reactionlists[1])))
        jaccard_values.append(j_value)
        path_combinations.append(reactionlists)
    return jaccard_values, path_combinations


[docs]def execute_all_codes():
    """
    This function executes all the codes including constructing graphs and executing metquest.

    Parameters
    ----------
    None

    Returns
    -------
    None

    """
    try:
        inputfoldername = sys.argv[1]
    except:
        inputfoldername = input('Enter folder name with all files \n')

    if '~' in inputfoldername:
        inputfoldername = os.path.expanduser(inputfoldername)
    list_of_files = []
    try:
        list_of_files = next(os.walk(inputfoldername))[1]
        if list_of_files:
        #    list_of_files = os.listdir(inputfoldername)   
            for foldernames in list_of_files:
                
                # To go through only folders
                if os.path.isdir(os.path.join(inputfoldername, foldernames)):
                    current_evaluation_folder = os.path.join(
                        inputfoldername, foldernames)
                    number_of_files_in_current_folder = os.listdir(
                        current_evaluation_folder)
                    number_of_xml = len(
                        [filenames for filenames in number_of_files_in_current_folder if '.xml' in filenames])
                    print('Currently evaluating files in', foldernames)
                    print('Number of networks', number_of_xml)
                    G, namemap = create_graph(
                        current_evaluation_folder, number_of_xml)
                    for files in os.listdir('.'):
                        if files.endswith('.txt'):
                            if files.startswith('seed'):
                                with open(files, 'r') as seedfile:
                                    seedmetslist = seedfile.read().splitlines()
                                seed_metabolites = set(seedmetslist)
                            elif files.startswith('source'):
                                with open(files, 'r') as sourcefile:
                                    source_metabolites = sourcefile.read().splitlines()
                            elif files.startswith('target'):
                                with open(files, 'r') as targetfile:
                                    # Target metabolites can be multiple values
                                    targetmetabolites = targetfile.read().splitlines()
                            elif files.startswith('cutoff'):
                                with open(files, 'r') as cutofffile:
                                    cutoff_list = cutofffile.read().splitlines()  # Cutoff can be multiple values
                    for mets in source_metabolites:
                        seed_metabolites.add(mets)
                    metfoundingraph = True
                    for metabs in seed_metabolites:
                        if metabs not in G:
                            print(metabs, 'not in G. MetQuest will not be executed')
                            print('Please check the metabolite names')
                            metfoundingraph = False
                            break
                    for metabs in targetmetabolites:
                        if metabs not in G:
                            print(metabs, 'not in G. MetQuest will not be executed')
                            print('Please check the metabolite names')
                            metfoundingraph = False
                            break
                    if metfoundingraph:
                        folder_to_create = 'Results/'
                        if not os.path.exists(folder_to_create):
                            os.makedirs(folder_to_create)
                        for currenttarmet in targetmetabolites:  # multiple target mets
                            for cutoff in cutoff_list:  # multiple cutoffs
                                pathway_table, cyclic_pathways, scope = find_pathways(
                                    G, seed_metabolites, int(cutoff))
                                print_summary(scope, currenttarmet, pathway_table, cutoff, cyclic_pathways,
                                              namemap, source_metabolites, seed_metabolites,
                                              number_of_xml, G)
                                write_output_to_file(pathway_table, currenttarmet, cutoff,
                                                     cyclic_pathways, folder_to_create,
                                                     namemap, source_metabolites, G)
                        print('\n')
                    os.chdir('../')
        else:
            print('Folder with data files not found')
            print('Please follow the prescribed folder format')
    except:
        print('Path name is incorrect. Please check the path name')