Source code for metquest.execute_metquest

# -*- coding: utf-8 -*-
from __future__ import absolute_import
import os
import sys
from collections import Counter
from itertools import combinations
from metquest.pathway_assembler import find_pathways
from metquest.construct_graph import create_graph


[docs]def write_output_to_file(pathway_table, currenttarmet, cutoff, cyclic_pathways, folder_to_create, namemap, source_metabolites, G): """ This function writes the pathways of sizes less than or equal to the cutoff from source to the target and seed metabolites to target. This function also writes cyclic pathways of sizes less than or equal to cutoff from the source to target. Parameters ---------- pathway_table : dict Dictionary of dictionary containing the pathways of different sizes identified for every metabolite. This will have only the acyclic/ branched pathways. currenttarmet : str Current target metabolite cutoff : int Maximum pathway length cutoff cyclic_pathways : dict Dictionary of dictionary containing cyclic pathways of different sizes identified for every metabolite. folder_to_create : str Name of the folder where results have to be written namemap : dict Dictionary mapping the adhoc reaction names to reaction names in the model source_metabolites : list List of source metabolites G : NetworkX DiGraph Object Bipartite graph of the metabolic network Returns ------- None """ pred = G.predecessors succ = G.successors all_pathways_count = [] path_count = [] cyclic_pathway_count = [] if currenttarmet in pathway_table: for plen in pathway_table[currenttarmet]: all_pathways_count.append( len(pathway_table[currenttarmet][plen])) if plen <= int(cutoff): path_count.append( len(pathway_table[currenttarmet][plen])) if currenttarmet in cyclic_pathways: for plen in cyclic_pathways[currenttarmet]: if plen <= int(cutoff): cyclic_pathway_count.append( len(cyclic_pathways[currenttarmet][plen])) cycfname = folder_to_create + 'cyclic_pathways_' + currenttarmet.replace(' ', '') + \ '_' + 'leq_plen_' + str(cutoff) + '.txt' pathnumcount = 0 with open(cycfname, 'w') as filetowrite: print('\nWriting cyclic pathways to a file') for idx in cyclic_pathways[currenttarmet]: if idx <= int(cutoff): filetowrite.write('Path length ' + str(idx) + '\n') for items in cyclic_pathways[currenttarmet][idx]: pathnumcount += 1 filetowrite.write( str(pathnumcount) + '\n') for entities in list(items): filetowrite.write( namemap[entities] + '\t' + ' + '.join(pred(entities)) + '->' + ' + '.join(succ(entities))) filetowrite.write('\n') filetowrite.write('--------------------\n') if int(cutoff) in pathway_table[currenttarmet]: for sourcemets in source_metabolites: seedfname = folder_to_create + 'branched_pathways_from_seed_' + \ currenttarmet.replace(' ', '') + \ '_' + 'leq_plen_' + str(cutoff) + '.txt' pathnumcount = 0 with open(seedfname, 'w') as filetowrite: print('Writing branched pathways (from seed) to a file') for idx in pathway_table[currenttarmet]: if idx <= int(cutoff): filetowrite.write('Path length ' + str(idx) + '\n') for items in pathway_table[currenttarmet][idx]: pathnumcount += 1 filetowrite.write( str(pathnumcount) + '\n') for entities in list(items): filetowrite.write( namemap[entities] + '\t' + ' + '.join(pred(entities)) + '->' + ' + '.join(succ(entities))) filetowrite.write('\n') filetowrite.write( '--------------------\n') filetowrite.write('--------------------\n') only_source_to_target = [] for sourcemets in source_metabolites: for idx in pathway_table[currenttarmet]: if idx <= int(cutoff): for items in pathway_table[currenttarmet][idx]: if set(succ(sourcemets)).intersection(items): only_source_to_target.append( list(items)) if only_source_to_target: sourcefname = folder_to_create + 'branched_pathways_from_source_' + \ currenttarmet.replace(' ', '') + \ '_' + 'leq_plen_' + str(cutoff) + '.txt' with open(sourcefname, 'w') as filetowrite: print('Writing branched pathways (from source) to a file \n') for currentidx, listentries in enumerate(only_source_to_target): filetowrite.write(str(currentidx+1) + '\n') filetowrite.write('Path length ' + str(len(listentries)) + '\n') for entities in listentries: filetowrite.write( namemap[entities] + '\t' + ' + '.join(pred(entities)) + '->' + ' + '.join(succ(entities))) filetowrite.write('\n') filetowrite.write('--------------------\n') else: print(currenttarmet, ': Target could not be found.') print('Consider changing the cut-off or the seed metabolite set')
[docs]def find_pathways_starting_from_source(source_metabolites, pathway_table, currenttarmet, cutoff, G): """ This function finds all pathways starting from the source metabolites Parameters ---------- source_metabolites : list List of source metabolites pathway_table : dict Dictionary of dictionary containing the pathways of different sizes identified for every metabolite. This will have only the acyclic/ branched pathways. currenttarmet : str Current target metabolite cutoff : int Maximum pathway length cutoff G : NetworkX DiGraph Object Bipartite graph of the metabolic network Returns ------- most_different_paths : dict For the given source metabolite, a combination of two most different pathways based on minimum Jaccard value is returned. only_source_to_target : list list of list containing all pathways starting from source metabolite """ succ = G.successors most_different_paths = {} only_source_to_target = [] if currenttarmet in pathway_table: for sourcemets in source_metabolites: for idx in pathway_table[currenttarmet]: if idx <= int(cutoff): for items in pathway_table[currenttarmet][idx]: if set(succ(sourcemets)).intersection(items): only_source_to_target.append( list(items)) if len(only_source_to_target) > 1: # Sometimes there can be only one pathway producing target # To find most different paths from source j_value, rxn_comb = find_jaccard_between_paths( only_source_to_target) min_j_index = j_value.index(min(j_value)) most_different_paths[sourcemets] = rxn_comb[min_j_index] else: print(currenttarmet, ': Target could not be found.') print('Consider changing the cut-off or the seed metabolite set') return most_different_paths, only_source_to_target
[docs]def find_important_reactions(all_reactions_involved, currenttarmet, seed_metabolites, namemap, G): """ This function determines the important reactions based on the pathways generated for the target metabolite. Parameters ---------- all_reactions_involved : list list of all reactions found in all the pathways from source to target currenttarmet : str Current target metabolite seed_metabolites : set Set of seed metabolites including the source namemap : dict Dictionary mapping the adhoc reaction names to reaction names in the model G : NetworkX DiGraph Object Bipartite graph of the metabolic network Returns ------- None Notes ----- We define important reactions as those reactions which occur in almost all the pathways producing the target metabolite (apart from the reactions that are involved in the production of target metabolite and the uptake of seed metabolite) """ pred = G.predecessors important_candidates = Counter(all_reactions_involved) important_reactions_inverted_dict = {} for keys, values in important_candidates.items(): important_reactions_inverted_dict[values] = \ important_reactions_inverted_dict.get(values, []) important_reactions_inverted_dict[values].append(keys) number_of_occurrences = list(important_reactions_inverted_dict.keys()) number_of_occurrences.sort() top_candidate = number_of_occurrences[:-6:-1] important_reactions = [] for numrepeats in top_candidate: for rxns in important_reactions_inverted_dict[numrepeats]: if rxns not in pred(currenttarmet): if not set(pred(rxns)).issubset(seed_metabolites): important_reactions.append(rxns) important_reactions_model_names = [] for rxns in important_reactions: important_reactions_model_names.append(namemap[rxns]) important_reactions_model_names.sort() if important_reactions: print('Important reactions based on the frequency of occurrence are') print('\n'.join(important_reactions_model_names)) else: print('All reactions pertain to uptake of seed metabolite/ production of target metabolite')
[docs]def find_pathways_involving_exchange_mets(number_of_xml, pathway_table, currenttarmet, seed_metabolites, namemap, G): """ This function identifies the pathways producing the target metabolites, which involve exchange metabolites. This function prints output only when a community of organisms is considered, i.e., when more than one metabolic network is used. Parameters ---------- number_of_xml : int Number of xml files in the folder pathway_table : dict Dictionary of dictionary containing the pathways of different sizes identified for every metabolite. This will have only the acyclic/ branched pathways. currenttarmet : str Current target metabolite seed_metabolites : set Set of seed metabolites including the source namemap : dict Dictionary mapping the adhoc reaction names to reaction names in the model G : NetworkX DiGraph Object Bipartite graph of the metabolic network Returns ------- exchange_candidates_inverted_dict : dict Dictionary containing the number of times an exchange reaction is repeated """ pred = G.predecessors succ = G.successors exchange_reactions = [] exchange_candidates_inverted_dict = {} if number_of_xml > 1: for plen in pathway_table[currenttarmet]: for pathways in pathway_table[currenttarmet][plen]: for reactions in pathways: if not set(pred(reactions)).issubset(seed_metabolites): # 'ER' is an adhoc reaction name assigned to exchange # reactions in the models. if 'ER' in reactions: exchange_reactions.append(reactions) exchange_candidates = Counter(exchange_reactions) for keys, values in exchange_candidates.items(): exchange_candidates_inverted_dict[values] = \ exchange_candidates_inverted_dict.get(values, []) exchange_candidates_inverted_dict[values].append(keys) number_of_occurrences = list(exchange_candidates_inverted_dict.keys()) number_of_occurrences.sort() # [:-6:-1] # Taking the top reaction top_candidate = number_of_occurrences[:-10:-1] exchange_reactions = [] for numrepeats in top_candidate: for rxns in exchange_candidates_inverted_dict[numrepeats]: if rxns not in pred(currenttarmet): if not set(pred(rxns)).issubset(seed_metabolites): exchange_reactions.append(rxns) if exchange_reactions: print('Exchange reactions are') for rxns in exchange_reactions: print(namemap[rxns], list(pred(rxns)), list(succ(rxns))) else: print('No metabolite exchanged') return exchange_candidates_inverted_dict
[docs]def find_jaccard_between_paths(only_source_to_target): """ This function determines the jaccard values between the pathways generated from the source to the target. Parameters ---------- only_source_to_target : list list of lists consisting of all pathways producing the target metabolite from the source Returns ------- jaccard_values : list list of all jaccard values (float) for all the pathway combinations path_combinations : list list of all pathway combinations corresponding to the jaccard values Notes ----- Jaccard value J = (set(A).intersection(set(B)))/(set(A).union(set(B))) J = 1 indicates two sets are the same J = 0 indicates two sets are different """ jaccard_values = [] path_combinations = [] for reactionlists in combinations(only_source_to_target, 2): j_value = len(set(reactionlists[0]).intersection( set(reactionlists[1])))/len(set(reactionlists[0]).union(set(reactionlists[1]))) jaccard_values.append(j_value) path_combinations.append(reactionlists) return jaccard_values, path_combinations
[docs]def execute_all_codes(): """ This function executes all the codes including constructing graphs and executing metquest. Parameters ---------- None Returns ------- None """ try: inputfoldername = sys.argv[1] except: inputfoldername = input('Enter folder name with all files \n') if '~' in inputfoldername: inputfoldername = os.path.expanduser(inputfoldername) list_of_files = [] try: list_of_files = next(os.walk(inputfoldername))[1] if list_of_files: # list_of_files = os.listdir(inputfoldername) for foldernames in list_of_files: # To go through only folders if os.path.isdir(os.path.join(inputfoldername, foldernames)): current_evaluation_folder = os.path.join( inputfoldername, foldernames) number_of_files_in_current_folder = os.listdir( current_evaluation_folder) number_of_xml = len( [filenames for filenames in number_of_files_in_current_folder if '.xml' in filenames]) print('Currently evaluating files in', foldernames) print('Number of networks', number_of_xml) G, namemap = create_graph( current_evaluation_folder, number_of_xml) for files in os.listdir('.'): if files.endswith('.txt'): if files.startswith('seed'): with open(files, 'r') as seedfile: seedmetslist = seedfile.read().splitlines() seed_metabolites = set(seedmetslist) elif files.startswith('source'): with open(files, 'r') as sourcefile: source_metabolites = sourcefile.read().splitlines() elif files.startswith('target'): with open(files, 'r') as targetfile: # Target metabolites can be multiple values targetmetabolites = targetfile.read().splitlines() elif files.startswith('cutoff'): with open(files, 'r') as cutofffile: cutoff_list = cutofffile.read().splitlines() # Cutoff can be multiple values for mets in source_metabolites: seed_metabolites.add(mets) metfoundingraph = True for metabs in seed_metabolites: if metabs not in G: print(metabs, 'not in G. MetQuest will not be executed') print('Please check the metabolite names') metfoundingraph = False break for metabs in targetmetabolites: if metabs not in G: print(metabs, 'not in G. MetQuest will not be executed') print('Please check the metabolite names') metfoundingraph = False break if metfoundingraph: folder_to_create = 'Results/' if not os.path.exists(folder_to_create): os.makedirs(folder_to_create) for currenttarmet in targetmetabolites: # multiple target mets for cutoff in cutoff_list: # multiple cutoffs pathway_table, cyclic_pathways, scope = find_pathways( G, seed_metabolites, int(cutoff)) print_summary(scope, currenttarmet, pathway_table, cutoff, cyclic_pathways, namemap, source_metabolites, seed_metabolites, number_of_xml, G) write_output_to_file(pathway_table, currenttarmet, cutoff, cyclic_pathways, folder_to_create, namemap, source_metabolites, G) print('\n') os.chdir('../') else: print('Folder with data files not found') print('Please follow the prescribed folder format') except: print('Path name is incorrect. Please check the path name')